llm-testrunner-components 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +165 -242
  3. package/dist/cjs/index.cjs.js +305 -237
  4. package/dist/cjs/index.cjs.js.map +1 -1
  5. package/dist/cjs/llm-testrunner.cjs.js +1 -1
  6. package/dist/cjs/loader.cjs.js +1 -1
  7. package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +2 -2
  8. package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -1
  9. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
  10. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
  11. package/dist/collection/components/llm-test-runner/llm-test-runner.js +27 -49
  12. package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
  13. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
  14. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
  15. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
  16. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
  17. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
  18. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
  19. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
  20. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
  21. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
  22. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
  23. package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
  24. package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
  25. package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
  26. package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
  27. package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
  28. package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
  29. package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +4 -3
  30. package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -1
  31. package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
  32. package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
  33. package/dist/collection/lib/evaluation/index.js +0 -4
  34. package/dist/collection/lib/evaluation/index.js.map +1 -1
  35. package/dist/collection/lib/evaluation/types.js.map +1 -1
  36. package/dist/collection/lib/import-export/test-results-csv.js +47 -33
  37. package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
  38. package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
  39. package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
  40. package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
  41. package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
  42. package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
  43. package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
  44. package/dist/collection/schemas/expected-outcome.js +20 -2
  45. package/dist/collection/schemas/expected-outcome.js.map +1 -1
  46. package/dist/collection/schemas/test-case.js +2 -20
  47. package/dist/collection/schemas/test-case.js.map +1 -1
  48. package/dist/collection/types/llm-test-runner.js.map +1 -1
  49. package/dist/collection/types/test-case.js.map +1 -1
  50. package/dist/components/index.js +1 -1
  51. package/dist/components/llm-test-runner.js +1 -1
  52. package/dist/components/p-JPMPoOC8.js +7 -0
  53. package/dist/components/p-JPMPoOC8.js.map +1 -0
  54. package/dist/esm/index.js +305 -237
  55. package/dist/esm/index.js.map +1 -1
  56. package/dist/esm/llm-testrunner.js +1 -1
  57. package/dist/esm/loader.js +1 -1
  58. package/dist/llm-testrunner/index.esm.js +2 -2
  59. package/dist/llm-testrunner/index.esm.js.map +1 -1
  60. package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
  61. package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +1 -0
  62. package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +1 -1
  63. package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
  64. package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
  65. package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
  66. package/dist/types/components.d.ts +9 -0
  67. package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
  68. package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
  69. package/dist/types/lib/evaluation/index.d.ts +0 -1
  70. package/dist/types/lib/evaluation/types.d.ts +26 -0
  71. package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
  72. package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
  73. package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
  74. package/dist/types/schemas/expected-outcome.d.ts +65 -17
  75. package/dist/types/schemas/test-case.d.ts +51 -95
  76. package/dist/types/types/llm-test-runner.d.ts +1 -1
  77. package/dist/types/types/test-case.d.ts +1 -1
  78. package/package.json +9 -2
  79. package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
  80. package/dist/components/p-BF90yb1z.js +0 -7
  81. package/dist/components/p-BF90yb1z.js.map +0 -1
  82. /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
@@ -1,8 +1,8 @@
1
1
  import { describe, it, expect } from "@jest/globals";
2
- import { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from "./constants";
2
+ import { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from "../constants";
3
3
  // Using integration tests with actual js-rouge library (no mocks).
4
4
  // This approach tests the real ROUGE-1 scoring behavior rather than just orchestration logic.
5
- import { performRouge1Evaluation } from "./evaluators/rouge1-evaluator";
5
+ import { performRouge1Evaluation } from "./rouge1-evaluator";
6
6
  const mockRequest = {
7
7
  testCaseId: 'test-000',
8
8
  question: 'What is your name?',
@@ -0,0 +1 @@
1
+ {"version":3,"file":"rouge1-evaluator.test.js","sourceRoot":"","sources":["../../../../src/lib/evaluation/evaluators/rouge1-evaluator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAErD,OAAO,EAAE,wBAAwB,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;AAC5E,mEAAmE;AACnE,8FAA8F;AAC9F,OAAO,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAC;AAE7D,MAAM,WAAW,GAAsB;IACrC,UAAU,EAAE,UAAU;IACtB,QAAQ,EAAE,oBAAoB;IAC9B,cAAc,EAAE,6BAA6B;IAC7C,eAAe,EAAE,iBAAiB;IAClC,oBAAoB,EAAE;QACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;QACpC,SAAS,EAAE,GAAG;KACf;CACF,CAAC;AAEF,MAAM,sBAAsB,GAAsB;IAChD,GAAG,WAAW;IACd,oBAAoB,EAAE;QACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;QACpC,SAAS,EAAE,SAAS;KACrB;CACF,CAAC;AAEF,QAAQ,CAAC,yBAAyB,EAAE,GAAG,EAAE;IACvC,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,EAAE,CAAC,0DAA0D,EAAE,KAAK,IAAI,EAAE;YACxE,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,iBAAiB;aACnC,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;YACvB,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QACzB,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;YACtE,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,oDAAoD;gBACpE,eAAe,EAAE,2CAA2C;aAC7D,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YACpB,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;YAC5E,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,oCAAoC;aACtD,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC;YAC9B,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,sBAAsB,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAChD,wBAAwB,CACzB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,yCAAyC;gBACzD,eAAe,EAAE,8BAA8B;gBAC/C,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;oBACpC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7D,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,kBAAkB;gBACnC,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;oBACpC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACxD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;QAC1B,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,MAAM,OAAO,GAAG,EAAE,GAAG,WAAW,EAAE,cAAc,EAAE,EAAE,EAAE,CAAC;YAEvD,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACxE,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC1E,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,OAAO,GAAG,EAAE,GAAG,WAAW,EAAE,eAAe,EAAE,EAAE,EAAE,CAAC;YAExD,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC","sourcesContent":["import { describe, it, expect } from '@jest/globals';\nimport { EvaluationRequest } from '../types';\nimport { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from '../constants';\n// Using integration tests with actual js-rouge library (no mocks).\n// This approach tests the real ROUGE-1 scoring behavior rather than just orchestration logic.\nimport { performRouge1Evaluation } from './rouge1-evaluator';\n\nconst mockRequest: EvaluationRequest = {\n testCaseId: 'test-000',\n question: 'What is your name?',\n actualResponse: 'I am a large language model',\n expectedOutcome: 'model\\nlanguage',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 0.5,\n },\n};\n\nconst mockRequestNoThreshold: EvaluationRequest = {\n ...mockRequest,\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: undefined,\n },\n};\n\ndescribe('performRouge1Evaluation', () => {\n describe('Basic functionality', () => {\n it('should pass when response contains exact keyword matches', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is a language model system',\n expectedOutcome: 'language\\nmodel',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.length).toBe(2);\n expect(result.keywordMatches[0].found).toBe(true);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeGreaterThan(0.5);\n expect(result.keywordMatches[1].found).toBe(true);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeGreaterThan(0.5);\n });\n\n it('should fail when keywords are not sufficiently present', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is completely unrelated content about cooking',\n expectedOutcome: 'machine learning\\nartificial intelligence',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].found).toBe(false);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n expect(result.keywordMatches[1].found).toBe(false);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n });\n\n it('should partially pass when only some keywords meet threshold', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'Machine learning is fascinating',\n expectedOutcome: 'machine learning\\ndatabase systems',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].found).toBe(true);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeGreaterThanOrEqual(0.5);\n expect(result.keywordMatches[1].found).toBe(false);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n });\n });\n\n describe('Threshold handling', () => {\n it('should use default threshold when not provided', async () => {\n const result = await performRouge1Evaluation(mockRequestNoThreshold);\n\n expect(result.evaluationParameters.threshold).toBe(\n DEFAULT_ROUGE_PASS_SCORE,\n );\n });\n\n it('should pass all keywords with threshold 0.0', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'completely unrelated text about cooking',\n expectedOutcome: 'quantum physics\\nmathematics',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 0.0,\n },\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.every(m => m.found)).toBe(true);\n expect(result.evaluationParameters.threshold).toBe(0.0);\n });\n\n it('should fail when threshold is 1.0 and match is not perfect', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is about learning concepts',\n expectedOutcome: 'machine learning',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 1.0,\n },\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(1.0);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(1.0);\n });\n });\n\n describe('Edge cases', () => {\n it('should handle empty actualResponse', async () => {\n const request = { ...mockRequest, actualResponse: '' };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBe(0);\n expect(result.keywordMatches[1].evaluationApproachResult.score).toBe(0);\n });\n\n it('should handle empty expectedOutcome string', async () => {\n const request = { ...mockRequest, expectedOutcome: '' };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.length).toBe(0);\n });\n });\n});\n"]}
@@ -16,6 +16,7 @@ export class SemanticEvaluator {
16
16
  }
17
17
  }
18
18
  async performEvaluation(request) {
19
+ const threshold = request.evaluationParameters?.threshold ?? DEFAULT_SEMANTIC_PASS_SCORE;
19
20
  try {
20
21
  await this.initialize();
21
22
  // Split expectedOutcome by newlines to create keywords array
@@ -25,7 +26,7 @@ export class SemanticEvaluator {
25
26
  .map(k => k.trim())
26
27
  .filter(k => k.length > 0)
27
28
  : [];
28
- const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, DEFAULT_SEMANTIC_PASS_SCORE);
29
+ const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, threshold);
29
30
  const totalItems = keywordMatches.length;
30
31
  // calculate the overall score by averaging the score of the keyword matches
31
32
  const keywordScore = keywordMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);
@@ -33,7 +34,7 @@ export class SemanticEvaluator {
33
34
  const passed = keywordMatches.every(match => match.found);
34
35
  const evaluationParameters = {
35
36
  approach: EvaluationApproach.SEMANTIC,
36
- threshold: DEFAULT_SEMANTIC_PASS_SCORE,
37
+ threshold,
37
38
  };
38
39
  return {
39
40
  testCaseId: request.testCaseId,
@@ -55,7 +56,7 @@ export class SemanticEvaluator {
55
56
  keywordMatches: [],
56
57
  evaluationParameters: {
57
58
  approach: EvaluationApproach.SEMANTIC,
58
- threshold: DEFAULT_SEMANTIC_PASS_SCORE,
59
+ threshold,
59
60
  },
60
61
  evaluationApproachResult: {
61
62
  score: 0,
@@ -1 +1 @@
1
- {"version":3,"file":"SemanticEvaluator.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/SemanticEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,gBAAgB,CAAC;AACnD,OAAO,EAAE,4BAA4B,EAAE,MAAM,qBAAqB,CAAC;AAGnE,OAAO,EACL,2BAA2B,EAC3B,kBAAkB,GACnB,MAAM,iBAAiB,CAAC;AAEzB,MAAM,OAAO,iBAAiB;IAC5B,qEAAqE;IAC7D,MAAM,CAAC,SAAS,GAA8B,IAAI,CAAC;IAE3D,KAAK,CAAC,UAAU;QACd,IAAI,iBAAiB,CAAC,SAAS;YAAE,OAAO;QACxC,IAAI,CAAC;YACH,iBAAiB,CAAC,SAAS,GAAG,MAAM,iBAAiB,EAAE,CAAC;QAC1D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,2CAA2C,EAAE,KAAK,CAAC,CAAC;YAClE,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,CAAC,iBAAiB,CACrB,OAA0B;QAE1B,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;YAExB,6DAA6D;YAC7D,MAAM,gBAAgB,GAAG,OAAO,CAAC,eAAe;gBAC9C,CAAC,CAAC,OAAO,CAAC,eAAe;qBACpB,KAAK,CAAC,QAAQ,CAAC;qBACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;qBAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;gBAC9B,CAAC,CAAC,EAAE,CAAC;YAEP,MAAM,cAAc,GAAG,MAAM,4BAA4B,CACvD,iBAAiB,CAAC,SAAS,EAC3B,OAAO,CAAC,cAAc,EACtB,gBAAgB,EAChB,2BAA2B,CAC5B,CAAC;YAEF,MAAM,UAAU,GAAG,cAAc,CAAC,MAAM,CAAC;YACzC,4EAA4E;YAC5E,MAAM,YAAY,GAAG,cAAc,CAAC,MAAM,CACxC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,wBAAwB,CAAC,KAAK,EACxD,CAAC,CACF,CAAC;YACF,MAAM,YAAY,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,4BAA4B;YACjG,MAAM,MAAM,GAAG,cAAc,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAE1D,MAAM,oBAAoB,GAAG;gBAC3B,QAAQ,EAAE,kBAAkB,CAAC,QAAQ;gBACrC,SAAS,EAAE,2BAA2B;aACf,CAAC;YAE1B,OAAO;gBACL,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM;gBACN,cAAc;gBACd,oBAAoB;gBACpB,wBAAwB,EAAE;oBACxB,KAAK,EAAE,YAAY;oBACnB,YAAY,EAAE,kBAAkB,CAAC,QAAQ;iBAC1C;gBACD,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,wCAAwC,EAAE,KAAK,CAAC,CAAC;YAC/D,OAAO;gBACL,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,QAAQ;oBACrC,SAAS,EAAE,2BAA2B;iBACvC;gBACD,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,kBAAkB,CAAC,QAAQ;iBAC1C;gBACD,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC;QACJ,CAAC;IACH,CAAC","sourcesContent":["import { EvaluationResult, EvaluationRequest } from '../../types';\nimport { loadSemanticModel } from './model-loader';\nimport { evaluateKeywordsSemantically } from './evaluate-keywords';\nimport { FeatureExtractionPipeline } from '@xenova/transformers';\nimport { EvaluationParameters } from '../../../../types/evaluation';\nimport {\n DEFAULT_SEMANTIC_PASS_SCORE,\n EvaluationApproach,\n} from '../../constants';\n\nexport class SemanticEvaluator {\n // TODO(LLM-39): Refactor SemanticEvaluator into a singleton pattern.\n private static extractor: FeatureExtractionPipeline = null;\n\n async initialize(): Promise<void> {\n if (SemanticEvaluator.extractor) return;\n try {\n SemanticEvaluator.extractor = await loadSemanticModel();\n } catch (error) {\n console.error('Failed to load semantic evaluation model:', error);\n throw error;\n }\n }\n\n async performEvaluation(\n request: EvaluationRequest,\n ): Promise<EvaluationResult> {\n try {\n await this.initialize();\n\n // Split expectedOutcome by newlines to create keywords array\n const expectedKeywords = request.expectedOutcome\n ? request.expectedOutcome\n .split(/[\\n,]+/)\n .map(k => k.trim())\n .filter(k => k.length > 0)\n : [];\n\n const keywordMatches = await evaluateKeywordsSemantically(\n SemanticEvaluator.extractor,\n request.actualResponse,\n expectedKeywords,\n DEFAULT_SEMANTIC_PASS_SCORE,\n );\n\n const totalItems = keywordMatches.length;\n // calculate the overall score by averaging the score of the keyword matches\n const keywordScore = keywordMatches.reduce(\n (acc, curr) => acc + curr.evaluationApproachResult.score,\n 0,\n );\n const overallScore = totalItems > 0 ? keywordScore / totalItems : 0; // to avoid division by zero\n const passed = keywordMatches.every(match => match.found);\n\n const evaluationParameters = {\n approach: EvaluationApproach.SEMANTIC,\n threshold: DEFAULT_SEMANTIC_PASS_SCORE,\n } as EvaluationParameters;\n\n return {\n testCaseId: request.testCaseId,\n passed,\n keywordMatches,\n evaluationParameters,\n evaluationApproachResult: {\n score: overallScore,\n approachUsed: EvaluationApproach.SEMANTIC,\n },\n timestamp: new Date().toISOString(),\n };\n } catch (error) {\n console.error('Failed to perform semantic evaluation:', error);\n return {\n testCaseId: request.testCaseId,\n passed: false,\n keywordMatches: [],\n evaluationParameters: {\n approach: EvaluationApproach.SEMANTIC,\n threshold: DEFAULT_SEMANTIC_PASS_SCORE,\n },\n evaluationApproachResult: {\n score: 0,\n approachUsed: EvaluationApproach.SEMANTIC,\n },\n timestamp: new Date().toISOString(),\n };\n }\n }\n}\n"]}
1
+ {"version":3,"file":"SemanticEvaluator.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/SemanticEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,gBAAgB,CAAC;AACnD,OAAO,EAAE,4BAA4B,EAAE,MAAM,qBAAqB,CAAC;AAGnE,OAAO,EACL,2BAA2B,EAC3B,kBAAkB,GACnB,MAAM,iBAAiB,CAAC;AAEzB,MAAM,OAAO,iBAAiB;IAC5B,qEAAqE;IAC7D,MAAM,CAAC,SAAS,GAA8B,IAAI,CAAC;IAE3D,KAAK,CAAC,UAAU;QACd,IAAI,iBAAiB,CAAC,SAAS;YAAE,OAAO;QACxC,IAAI,CAAC;YACH,iBAAiB,CAAC,SAAS,GAAG,MAAM,iBAAiB,EAAE,CAAC;QAC1D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,2CAA2C,EAAE,KAAK,CAAC,CAAC;YAClE,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,CAAC,iBAAiB,CACrB,OAA0B;QAE1B,MAAM,SAAS,GACb,OAAO,CAAC,oBAAoB,EAAE,SAAS,IAAI,2BAA2B,CAAC;QAEzE,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;YAExB,6DAA6D;YAC7D,MAAM,gBAAgB,GAAG,OAAO,CAAC,eAAe;gBAC9C,CAAC,CAAC,OAAO,CAAC,eAAe;qBACpB,KAAK,CAAC,QAAQ,CAAC;qBACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;qBAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;gBAC9B,CAAC,CAAC,EAAE,CAAC;YAEP,MAAM,cAAc,GAAG,MAAM,4BAA4B,CACvD,iBAAiB,CAAC,SAAS,EAC3B,OAAO,CAAC,cAAc,EACtB,gBAAgB,EAChB,SAAS,CACV,CAAC;YAEF,MAAM,UAAU,GAAG,cAAc,CAAC,MAAM,CAAC;YACzC,4EAA4E;YAC5E,MAAM,YAAY,GAAG,cAAc,CAAC,MAAM,CACxC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,wBAAwB,CAAC,KAAK,EACxD,CAAC,CACF,CAAC;YACF,MAAM,YAAY,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,4BAA4B;YACjG,MAAM,MAAM,GAAG,cAAc,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAE1D,MAAM,oBAAoB,GAAG;gBAC3B,QAAQ,EAAE,kBAAkB,CAAC,QAAQ;gBACrC,SAAS;aACc,CAAC;YAE1B,OAAO;gBACL,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM;gBACN,cAAc;gBACd,oBAAoB;gBACpB,wBAAwB,EAAE;oBACxB,KAAK,EAAE,YAAY;oBACnB,YAAY,EAAE,kBAAkB,CAAC,QAAQ;iBAC1C;gBACD,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,wCAAwC,EAAE,KAAK,CAAC,CAAC;YAC/D,OAAO;gBACL,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,QAAQ;oBACrC,SAAS;iBACV;gBACD,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,kBAAkB,CAAC,QAAQ;iBAC1C;gBACD,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC;QACJ,CAAC;IACH,CAAC","sourcesContent":["import { EvaluationResult, EvaluationRequest } from '../../types';\nimport { loadSemanticModel } from './model-loader';\nimport { evaluateKeywordsSemantically } from './evaluate-keywords';\nimport { FeatureExtractionPipeline } from '@xenova/transformers';\nimport { EvaluationParameters } from '../../../../types/evaluation';\nimport {\n DEFAULT_SEMANTIC_PASS_SCORE,\n EvaluationApproach,\n} from '../../constants';\n\nexport class SemanticEvaluator {\n // TODO(LLM-39): Refactor SemanticEvaluator into a singleton pattern.\n private static extractor: FeatureExtractionPipeline = null;\n\n async initialize(): Promise<void> {\n if (SemanticEvaluator.extractor) return;\n try {\n SemanticEvaluator.extractor = await loadSemanticModel();\n } catch (error) {\n console.error('Failed to load semantic evaluation model:', error);\n throw error;\n }\n }\n\n async performEvaluation(\n request: EvaluationRequest,\n ): Promise<EvaluationResult> {\n const threshold =\n request.evaluationParameters?.threshold ?? DEFAULT_SEMANTIC_PASS_SCORE;\n\n try {\n await this.initialize();\n\n // Split expectedOutcome by newlines to create keywords array\n const expectedKeywords = request.expectedOutcome\n ? request.expectedOutcome\n .split(/[\\n,]+/)\n .map(k => k.trim())\n .filter(k => k.length > 0)\n : [];\n\n const keywordMatches = await evaluateKeywordsSemantically(\n SemanticEvaluator.extractor,\n request.actualResponse,\n expectedKeywords,\n threshold,\n );\n\n const totalItems = keywordMatches.length;\n // calculate the overall score by averaging the score of the keyword matches\n const keywordScore = keywordMatches.reduce(\n (acc, curr) => acc + curr.evaluationApproachResult.score,\n 0,\n );\n const overallScore = totalItems > 0 ? keywordScore / totalItems : 0; // to avoid division by zero\n const passed = keywordMatches.every(match => match.found);\n\n const evaluationParameters = {\n approach: EvaluationApproach.SEMANTIC,\n threshold,\n } as EvaluationParameters;\n\n return {\n testCaseId: request.testCaseId,\n passed,\n keywordMatches,\n evaluationParameters,\n evaluationApproachResult: {\n score: overallScore,\n approachUsed: EvaluationApproach.SEMANTIC,\n },\n timestamp: new Date().toISOString(),\n };\n } catch (error) {\n console.error('Failed to perform semantic evaluation:', error);\n return {\n testCaseId: request.testCaseId,\n passed: false,\n keywordMatches: [],\n evaluationParameters: {\n approach: EvaluationApproach.SEMANTIC,\n threshold,\n },\n evaluationApproachResult: {\n score: 0,\n approachUsed: EvaluationApproach.SEMANTIC,\n },\n timestamp: new Date().toISOString(),\n };\n }\n }\n}\n"]}
@@ -0,0 +1,24 @@
1
+ import { EvaluationApproach, EvaluationApproachValues } from "./constants";
2
+ const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
3
+ export function getAllowedApproachesForFieldType(fieldType) {
4
+ if (fieldType === 'select') {
5
+ return SELECT_ONLY_APPROACHES;
6
+ }
7
+ return EvaluationApproachValues;
8
+ }
9
+ export function isApproachAllowedForFieldType(fieldType, approach) {
10
+ return getAllowedApproachesForFieldType(fieldType).includes(approach);
11
+ }
12
+ export function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
13
+ const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
14
+ const fallbackApproach = allowedApproaches[0];
15
+ const rawApproach = evaluationParameters?.approach;
16
+ const approach = rawApproach && allowedApproaches.includes(rawApproach)
17
+ ? rawApproach
18
+ : fallbackApproach;
19
+ return {
20
+ ...evaluationParameters,
21
+ approach,
22
+ };
23
+ }
24
+ //# sourceMappingURL=field-evaluation-approach.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"field-evaluation-approach.js","sourceRoot":"","sources":["../../../src/lib/evaluation/field-evaluation-approach.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;AAK3E,MAAM,sBAAsB,GAAyB,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC;AAEhF,MAAM,UAAU,gCAAgC,CAC9C,SAA8B;IAE9B,IAAI,SAAS,KAAK,QAAQ,EAAE,CAAC;QAC3B,OAAO,sBAAsB,CAAC;IAChC,CAAC;IACD,OAAO,wBAAwB,CAAC;AAClC,CAAC;AAED,MAAM,UAAU,6BAA6B,CAC3C,SAA8B,EAC9B,QAA4B;IAE5B,OAAO,gCAAgC,CAAC,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AACxE,CAAC;AAED,MAAM,UAAU,qCAAqC,CACnD,SAA8B,EAC9B,oBAA2C;IAE3C,MAAM,iBAAiB,GAAG,gCAAgC,CAAC,SAAS,CAAC,CAAC;IACtE,MAAM,gBAAgB,GAAG,iBAAiB,CAAC,CAAC,CAAC,CAAC;IAC9C,MAAM,WAAW,GAAG,oBAAoB,EAAE,QAAQ,CAAC;IACnD,MAAM,QAAQ,GACZ,WAAW,IAAI,iBAAiB,CAAC,QAAQ,CAAC,WAAW,CAAC;QACpD,CAAC,CAAC,WAAW;QACb,CAAC,CAAC,gBAAgB,CAAC;IAEvB,OAAO;QACL,GAAG,oBAAoB;QACvB,QAAQ;KACT,CAAC;AACJ,CAAC","sourcesContent":["import { EvaluationApproach, EvaluationApproachValues } from './constants';\nimport type { EvaluationParameters } from '../../types/evaluation';\n\nexport type EvaluationFieldType = 'text' | 'textarea' | 'chips-input' | 'select';\n\nconst SELECT_ONLY_APPROACHES: EvaluationApproach[] = [EvaluationApproach.EXACT];\n\nexport function getAllowedApproachesForFieldType(\n fieldType: EvaluationFieldType,\n): EvaluationApproach[] {\n if (fieldType === 'select') {\n return SELECT_ONLY_APPROACHES;\n }\n return EvaluationApproachValues;\n}\n\nexport function isApproachAllowedForFieldType(\n fieldType: EvaluationFieldType,\n approach: EvaluationApproach,\n): boolean {\n return getAllowedApproachesForFieldType(fieldType).includes(approach);\n}\n\nexport function normalizeEvaluationParametersForField(\n fieldType: EvaluationFieldType,\n evaluationParameters?: EvaluationParameters,\n): EvaluationParameters {\n const allowedApproaches = getAllowedApproachesForFieldType(fieldType);\n const fallbackApproach = allowedApproaches[0];\n const rawApproach = evaluationParameters?.approach;\n const approach =\n rawApproach && allowedApproaches.includes(rawApproach)\n ? rawApproach\n : fallbackApproach;\n\n return {\n ...evaluationParameters,\n approach,\n };\n}\n\n"]}
@@ -1,7 +1,3 @@
1
1
  import { LLMEvaluationEngine } from "./evaluation-engine";
2
2
  export { LLMEvaluationEngine };
3
- export async function evaluateLLMResponse(request, callback) {
4
- const engine = new LLMEvaluationEngine();
5
- await engine.evaluateResponse(request, callback);
6
- }
7
3
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/lib/evaluation/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAQ1D,OAAO,EAAE,mBAAmB,EAAE,CAAC;AAQ/B,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,OAA0B,EAC1B,QAA4B;IAE5B,MAAM,MAAM,GAAG,IAAI,mBAAmB,EAAE,CAAC;IACzC,MAAM,MAAM,CAAC,gBAAgB,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;AACnD,CAAC","sourcesContent":["import { LLMEvaluationEngine } from './evaluation-engine';\nimport type {\n EvaluationRequest,\n EvaluationResult,\n KeywordMatch,\n EvaluationCallback,\n} from './types';\n\nexport { LLMEvaluationEngine };\nexport type {\n EvaluationRequest,\n EvaluationResult,\n KeywordMatch,\n EvaluationCallback,\n};\n\nexport async function evaluateLLMResponse(\n request: EvaluationRequest,\n callback: EvaluationCallback,\n): Promise<void> {\n const engine = new LLMEvaluationEngine();\n await engine.evaluateResponse(request, callback);\n}\n"]}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/lib/evaluation/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAQ1D,OAAO,EAAE,mBAAmB,EAAE,CAAC","sourcesContent":["import { LLMEvaluationEngine } from './evaluation-engine';\nimport type {\n EvaluationRequest,\n EvaluationResult,\n KeywordMatch,\n EvaluationCallback,\n} from './types';\n\nexport { LLMEvaluationEngine };\nexport type {\n EvaluationRequest,\n EvaluationResult,\n KeywordMatch,\n EvaluationCallback,\n};"]}
@@ -1 +1 @@
1
- {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult,\n} from '../../types/evaluation';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedOutcome: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationResult {\n testCaseId: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n timestamp?: string;\n evaluationParameters: EvaluationParameters;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport interface KeywordMatch {\n keyword: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport type EvaluationCallback = (result: EvaluationResult) => void;\n\nexport interface RougeKeywordDetails {\n rouge1: number;\n rougeL: number;\n scoreUsed: string;\n approach: string;\n}\n\nexport interface Rouge1OverallDetails {\n keywordsPassed: number;\n totalKeywords: number;\n passRate: string;\n thresholdUsed: number;\n approach: string;\n}\n"]}
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult,\n} from '../../types/evaluation';\nimport type { ExpectedOutcomeFieldType } from '../../types/llm-test-runner';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedOutcome: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface FieldEvaluationInput {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationRequestV2 {\n testCaseId: string;\n question: string;\n actualResponse: string;\n fields: FieldEvaluationInput[];\n}\n\nexport interface EvaluationResult {\n testCaseId: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n fieldResults?: FieldEvaluationResult[];\n timestamp?: string;\n evaluationParameters?: EvaluationParameters;\n evaluationApproachResult?: EvaluationApproachResult;\n}\n\nexport interface FieldEvaluationResult {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n evaluationParameters: EvaluationParameters;\n evaluationApproachResult: EvaluationApproachResult;\n error?: string;\n}\n\nexport interface KeywordMatch {\n keyword: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport type EvaluationCallback = (result: EvaluationResult) => void;\n\nexport interface RougeKeywordDetails {\n rouge1: number;\n rougeL: number;\n scoreUsed: string;\n approach: string;\n}\n\nexport interface Rouge1OverallDetails {\n keywordsPassed: number;\n totalKeywords: number;\n passRate: string;\n thresholdUsed: number;\n approach: string;\n}\n"]}
@@ -1,4 +1,3 @@
1
- import { serializeExpectedOutcome } from "../expected-outcome-serializer";
2
1
  /**
3
2
  * Escapes a CSV field by wrapping it in quotes if it contains special characters
4
3
  * @param field - The field to escape
@@ -17,48 +16,63 @@ export function escapeCsvField(field) {
17
16
  */
18
17
  export function exportTestResultsToCsv(testCases) {
19
18
  const csvRows = [];
19
+ const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
20
20
  // Add header row
21
21
  const headers = [
22
22
  'Question',
23
- 'Expected Keywords',
24
- 'Generated Keywords',
25
- 'Keywords Match',
26
23
  'Response Time (s)',
27
- 'Evaluation Approach',
28
- 'Evaluation Score',
29
24
  ];
25
+ for (let i = 1; i <= maxFieldCount; i++) {
26
+ headers.push('Field Name');
27
+ headers.push('Expected Keywords');
28
+ headers.push('Generated Keywords');
29
+ headers.push('Evaluation Strategy');
30
+ headers.push('Passed Evaluation');
31
+ headers.push('Keyword Match');
32
+ headers.push('Score');
33
+ if (i < maxFieldCount) {
34
+ headers.push('');
35
+ }
36
+ }
30
37
  csvRows.push(headers.join(','));
31
- // Add data rows
38
+ // Add data rows (one row per test case)
32
39
  testCases.forEach(testCase => {
33
- const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
34
- const evaluationApproach = testCase.evaluationParameters?.approach || '';
35
- const score = testCase.evaluationResult?.evaluationApproachResult?.score;
36
- const evaluationScore = score !== undefined ? score.toString() : '';
37
- let generatedKeywords = '';
38
- let keywordsMatch = '';
39
- if (testCase.evaluationResult) {
40
- const foundKeywords = testCase.evaluationResult.keywordMatches
41
- .filter(match => match.found)
42
- .map(match => match.keyword);
43
- generatedKeywords = foundKeywords.join('; ');
44
- // Calculate match percentages
45
- const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
46
- const totalKeywords = testCase.evaluationResult.keywordMatches.length;
47
- keywordsMatch =
48
- totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
49
- }
50
40
  const responseTime = testCase.responseTime
51
41
  ? (testCase.responseTime / 1000).toFixed(3)
52
42
  : 'N/A';
53
- const row = [
54
- escapeCsvField(testCase.question),
55
- escapeCsvField(expectedOutcome),
56
- escapeCsvField(generatedKeywords),
57
- keywordsMatch,
58
- responseTime,
59
- escapeCsvField(evaluationApproach),
60
- escapeCsvField(evaluationScore),
61
- ];
43
+ const row = [escapeCsvField(testCase.question), responseTime];
44
+ for (let i = 0; i < maxFieldCount; i++) {
45
+ const field = testCase.expectedOutcome?.[i];
46
+ const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
47
+ const expectedKeywords = fieldResult?.expectedValue ??
48
+ (field
49
+ ? field.type === 'chips-input'
50
+ ? field.value.join(', ')
51
+ : field.value
52
+ : '');
53
+ const generatedKeywords = (fieldResult?.keywordMatches || [])
54
+ .filter(match => match.found)
55
+ .map(match => match.keyword)
56
+ .join('; ');
57
+ const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
58
+ const totalMatches = fieldResult?.keywordMatches?.length || 0;
59
+ const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
60
+ const score = fieldResult?.evaluationApproachResult?.score !== undefined
61
+ ? fieldResult.evaluationApproachResult.score.toFixed(2)
62
+ : '';
63
+ row.push(escapeCsvField(field?.label || ''));
64
+ row.push(escapeCsvField(expectedKeywords || ''));
65
+ row.push(escapeCsvField(generatedKeywords));
66
+ row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
67
+ field?.evaluationParameters?.approach ||
68
+ ''));
69
+ row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
70
+ row.push(keywordMatch);
71
+ row.push(score);
72
+ if (i < maxFieldCount - 1) {
73
+ row.push('');
74
+ }
75
+ }
62
76
  csvRows.push(row.join(','));
63
77
  });
64
78
  return csvRows.join('\n');
@@ -1 +1 @@
1
- {"version":3,"file":"test-results-csv.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-results-csv.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,wBAAwB,EAAE,MAAM,gCAAgC,CAAC;AAE1E;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,KAAa;IAC1C,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvE,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC;IAC1C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CAAC,SAAqB;IAC1D,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,iBAAiB;IACjB,MAAM,OAAO,GAAG;QACd,UAAU;QACV,mBAAmB;QACnB,oBAAoB;QACpB,gBAAgB;QAChB,mBAAmB;QACnB,qBAAqB;QACrB,kBAAkB;KACnB,CAAC;IACF,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAEhC,gBAAgB;IAChB,SAAS,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE;QAC3B,MAAM,eAAe,GAAG,wBAAwB,CAC9C,QAAQ,CAAC,eAAe,IAAI,EAAE,EAC9B,KAAK,CACN,CAAC;QAEF,MAAM,kBAAkB,GAAG,QAAQ,CAAC,oBAAoB,EAAE,QAAQ,IAAI,EAAE,CAAC;QACzE,MAAM,KAAK,GAAG,QAAQ,CAAC,gBAAgB,EAAE,wBAAwB,EAAE,KAAK,CAAC;QACzE,MAAM,eAAe,GAAG,KAAK,KAAK,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAEpE,IAAI,iBAAiB,GAAG,EAAE,CAAC;QAC3B,IAAI,aAAa,GAAG,EAAE,CAAC;QAEvB,IAAI,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YAC9B,MAAM,aAAa,GAAG,QAAQ,CAAC,gBAAgB,CAAC,cAAc;iBAC3D,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC;iBAC5B,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YAE/B,iBAAiB,GAAG,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAE7C,8BAA8B;YAC9B,MAAM,iBAAiB,GAAG,QAAQ,CAAC,gBAAgB,CAAC,cAAc,CAAC,MAAM,CACvE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CACb,CAAC,MAAM,CAAC;YACT,MAAM,aAAa,GAAG,QAAQ,CAAC,gBAAgB,CAAC,cAAc,CAAC,MAAM,CAAC;YAEtE,aAAa;gBACX,aAAa,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,iBAAiB,IAAI,aAAa,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;QACxE,CAAC;QAED,MAAM,YAAY,GAAG,QAAQ,CAAC,YAAY;YACxC,CAAC,CAAC,CAAC,QAAQ,CAAC,YAAY,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;YAC3C,CAAC,CAAC,KAAK,CAAC;QAEV,MAAM,GAAG,GAAG;YACV,cAAc,CAAC,QAAQ,CAAC,QAAQ,CAAC;YACjC,cAAc,CAAC,eAAe,CAAC;YAC/B,cAAc,CAAC,iBAAiB,CAAC;YACjC,aAAa;YACb,YAAY;YACZ,cAAc,CAAC,kBAAkB,CAAC;YAClC,cAAc,CAAC,eAAe,CAAC;SAChC,CAAC;QAEF,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9B,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5B,CAAC","sourcesContent":["import { TestCase } from '../../types/llm-test-runner';\nimport { serializeExpectedOutcome } from '../expected-outcome-serializer';\n\n/**\n * Escapes a CSV field by wrapping it in quotes if it contains special characters\n * @param field - The field to escape\n * @returns Escaped field string\n */\nexport function escapeCsvField(field: string): string {\n if (field.includes(',') || field.includes('\"') || field.includes('\\n')) {\n return `\"${field.replace(/\"/g, '\"\"')}\"`;\n }\n return field;\n}\n\n/**\n * Exports test results to a CSV string\n * @param testCases - Array of test cases with results to export\n * @returns CSV string representation of the test results\n */\nexport function exportTestResultsToCsv(testCases: TestCase[]): string {\n const csvRows: string[] = [];\n\n // Add header row\n const headers = [\n 'Question',\n 'Expected Keywords',\n 'Generated Keywords',\n 'Keywords Match',\n 'Response Time (s)',\n 'Evaluation Approach',\n 'Evaluation Score',\n ];\n csvRows.push(headers.join(','));\n\n // Add data rows\n testCases.forEach(testCase => {\n const expectedOutcome = serializeExpectedOutcome(\n testCase.expectedOutcome || [],\n ' | ',\n );\n\n const evaluationApproach = testCase.evaluationParameters?.approach || '';\n const score = testCase.evaluationResult?.evaluationApproachResult?.score;\n const evaluationScore = score !== undefined ? score.toString() : '';\n \n let generatedKeywords = '';\n let keywordsMatch = '';\n\n if (testCase.evaluationResult) {\n const foundKeywords = testCase.evaluationResult.keywordMatches\n .filter(match => match.found)\n .map(match => match.keyword);\n\n generatedKeywords = foundKeywords.join('; ');\n\n // Calculate match percentages\n const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(\n m => m.found,\n ).length;\n const totalKeywords = testCase.evaluationResult.keywordMatches.length;\n\n keywordsMatch =\n totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';\n }\n\n const responseTime = testCase.responseTime\n ? (testCase.responseTime / 1000).toFixed(3)\n : 'N/A';\n\n const row = [\n escapeCsvField(testCase.question),\n escapeCsvField(expectedOutcome),\n escapeCsvField(generatedKeywords),\n keywordsMatch,\n responseTime,\n escapeCsvField(evaluationApproach),\n escapeCsvField(evaluationScore),\n ];\n\n csvRows.push(row.join(','));\n });\n\n return csvRows.join('\\n');\n}\n\n"]}
1
+ {"version":3,"file":"test-results-csv.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-results-csv.ts"],"names":[],"mappings":"AAEA;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,KAAa;IAC1C,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvE,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC;IAC1C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CAAC,SAAqB;IAC1D,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,CACpC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,EACzE,CAAC,CACF,CAAC;IAEF,iBAAiB;IACjB,MAAM,OAAO,GAAa;QACxB,UAAU;QACV,mBAAmB;KACpB,CAAC;IACF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,aAAa,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC3B,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAClC,OAAO,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;QACnC,OAAO,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QACpC,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAClC,OAAO,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC9B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACtB,IAAI,CAAC,GAAG,aAAa,EAAE,CAAC;YACtB,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAEhC,wCAAwC;IACxC,SAAS,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE;QAC3B,MAAM,YAAY,GAAG,QAAQ,CAAC,YAAY;YACxC,CAAC,CAAC,CAAC,QAAQ,CAAC,YAAY,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;YAC3C,CAAC,CAAC,KAAK,CAAC;QACV,MAAM,GAAG,GAAa,CAAC,cAAc,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,YAAY,CAAC,CAAC;QAExE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,QAAQ,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,CAAC;YAC5C,MAAM,WAAW,GAAG,QAAQ,CAAC,gBAAgB,EAAE,YAAY,EAAE,IAAI,CAC/D,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,KAAK,KAAK,CAAC,CAC7B,CAAC;YAEF,MAAM,gBAAgB,GACpB,WAAW,EAAE,aAAa;gBAC1B,CAAC,KAAK;oBACJ,CAAC,CAAC,KAAK,CAAC,IAAI,KAAK,aAAa;wBAC5B,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC;wBACxB,CAAC,CAAC,KAAK,CAAC,KAAK;oBACf,CAAC,CAAC,EAAE,CAAC,CAAC;YACV,MAAM,iBAAiB,GAAG,CAAC,WAAW,EAAE,cAAc,IAAI,EAAE,CAAC;iBAC1D,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC;iBAC5B,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC;iBAC3B,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,MAAM,YAAY,GAAG,CAAC,WAAW,EAAE,cAAc,IAAI,EAAE,CAAC,CAAC,MAAM,CAC7D,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CACrB,CAAC,MAAM,CAAC;YACT,MAAM,YAAY,GAAG,WAAW,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC,CAAC;YAC9D,MAAM,YAAY,GAAG,YAAY,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,YAAY,IAAI,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC/E,MAAM,KAAK,GACT,WAAW,EAAE,wBAAwB,EAAE,KAAK,KAAK,SAAS;gBACxD,CAAC,CAAC,WAAW,CAAC,wBAAwB,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;gBACvD,CAAC,CAAC,EAAE,CAAC;YAET,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC;YAC7C,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,gBAAgB,IAAI,EAAE,CAAC,CAAC,CAAC;YACjD,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,iBAAiB,CAAC,CAAC,CAAC;YAC5C,GAAG,CAAC,IAAI,CACN,cAAc,CACZ,WAAW,EAAE,oBAAoB,CAAC,QAAQ;gBACxC,KAAK,EAAE,oBAAoB,EAAE,QAAQ;gBACrC,EAAE,CACL,CACF,CAAC;YACF,GAAG,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YACrE,GAAG,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YACvB,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAEhB,IAAI,CAAC,GAAG,aAAa,GAAG,CAAC,EAAE,CAAC;gBAC1B,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,CAAC;QACH,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9B,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5B,CAAC","sourcesContent":["import { TestCase } from '../../types/llm-test-runner';\n\n/**\n * Escapes a CSV field by wrapping it in quotes if it contains special characters\n * @param field - The field to escape\n * @returns Escaped field string\n */\nexport function escapeCsvField(field: string): string {\n if (field.includes(',') || field.includes('\"') || field.includes('\\n')) {\n return `\"${field.replace(/\"/g, '\"\"')}\"`;\n }\n return field;\n}\n\n/**\n * Exports test results to a CSV string\n * @param testCases - Array of test cases with results to export\n * @returns CSV string representation of the test results\n */\nexport function exportTestResultsToCsv(testCases: TestCase[]): string {\n const csvRows: string[] = [];\n const maxFieldCount = testCases.reduce(\n (max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length),\n 0,\n );\n\n // Add header row\n const headers: string[] = [\n 'Question',\n 'Response Time (s)',\n ];\n for (let i = 1; i <= maxFieldCount; i++) {\n headers.push('Field Name');\n headers.push('Expected Keywords');\n headers.push('Generated Keywords');\n headers.push('Evaluation Strategy');\n headers.push('Passed Evaluation');\n headers.push('Keyword Match');\n headers.push('Score');\n if (i < maxFieldCount) {\n headers.push('');\n }\n }\n csvRows.push(headers.join(','));\n\n // Add data rows (one row per test case)\n testCases.forEach(testCase => {\n const responseTime = testCase.responseTime\n ? (testCase.responseTime / 1000).toFixed(3)\n : 'N/A';\n const row: string[] = [escapeCsvField(testCase.question), responseTime];\n\n for (let i = 0; i < maxFieldCount; i++) {\n const field = testCase.expectedOutcome?.[i];\n const fieldResult = testCase.evaluationResult?.fieldResults?.find(\n result => result.index === i,\n );\n\n const expectedKeywords =\n fieldResult?.expectedValue ??\n (field\n ? field.type === 'chips-input'\n ? field.value.join(', ')\n : field.value\n : '');\n const generatedKeywords = (fieldResult?.keywordMatches || [])\n .filter(match => match.found)\n .map(match => match.keyword)\n .join('; ');\n const matchedCount = (fieldResult?.keywordMatches || []).filter(\n match => match.found,\n ).length;\n const totalMatches = fieldResult?.keywordMatches?.length || 0;\n const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';\n const score =\n fieldResult?.evaluationApproachResult?.score !== undefined\n ? fieldResult.evaluationApproachResult.score.toFixed(2)\n : '';\n\n row.push(escapeCsvField(field?.label || ''));\n row.push(escapeCsvField(expectedKeywords || ''));\n row.push(escapeCsvField(generatedKeywords));\n row.push(\n escapeCsvField(\n fieldResult?.evaluationParameters.approach ||\n field?.evaluationParameters?.approach ||\n '',\n ),\n );\n row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');\n row.push(keywordMatch);\n row.push(score);\n\n if (i < maxFieldCount - 1) {\n row.push('');\n }\n }\n\n csvRows.push(row.join(','));\n });\n\n return csvRows.join('\\n');\n}\n\n"]}
@@ -8,7 +8,6 @@ export function formatTestSuiteAsJson(testCases) {
8
8
  id: testCase.id,
9
9
  question: testCase.question,
10
10
  expectedOutcome: testCase.expectedOutcome,
11
- evaluationParameters: testCase.evaluationParameters,
12
11
  }));
13
12
  return JSON.stringify(exportData, null, 2);
14
13
  }
@@ -1 +1 @@
1
- {"version":3,"file":"test-suite-exporter.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-suite-exporter.ts"],"names":[],"mappings":"AAYA;;;;GAIG;AACH,MAAM,UAAU,qBAAqB,CAAC,SAAqB;IACzD,MAAM,UAAU,GAA0B,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;QACnE,EAAE,EAAE,QAAQ,CAAC,EAAE;QACf,QAAQ,EAAE,QAAQ,CAAC,QAAQ;QAC3B,eAAe,EAAE,QAAQ,CAAC,eAAe;QAEzC,oBAAoB,EAAE,QAAQ,CAAC,oBAAoB;KACpD,CAAC,CAAC,CAAC;IAEJ,OAAO,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AAC7C,CAAC","sourcesContent":["import { ExpectedOutcomeField, TestCase } from '../../types/llm-test-runner';\n\nexport interface TestSuiteExportData {\n id: string;\n question: string;\n expectedOutcome: ExpectedOutcomeField[];\n evaluationParameters?: {\n approach: string;\n threshold?: number;\n };\n}\n\n/**\n * Formats test cases as a JSON string suitable for saving as a test suite\n * @param testCases - Array of test cases to format\n * @returns JSON string representation of the test suite\n */\nexport function formatTestSuiteAsJson(testCases: TestCase[]): string {\n const exportData: TestSuiteExportData[] = testCases.map(testCase => ({\n id: testCase.id,\n question: testCase.question,\n expectedOutcome: testCase.expectedOutcome,\n\n evaluationParameters: testCase.evaluationParameters,\n }));\n\n return JSON.stringify(exportData, null, 2);\n}\n"]}
1
+ {"version":3,"file":"test-suite-exporter.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-suite-exporter.ts"],"names":[],"mappings":"AAQA;;;;GAIG;AACH,MAAM,UAAU,qBAAqB,CAAC,SAAqB;IACzD,MAAM,UAAU,GAA0B,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;QACnE,EAAE,EAAE,QAAQ,CAAC,EAAE;QACf,QAAQ,EAAE,QAAQ,CAAC,QAAQ;QAC3B,eAAe,EAAE,QAAQ,CAAC,eAAe;KAC1C,CAAC,CAAC,CAAC;IAEJ,OAAO,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AAC7C,CAAC","sourcesContent":["import { ExpectedOutcomeField, TestCase } from '../../types/llm-test-runner';\n\nexport interface TestSuiteExportData {\n id: string;\n question: string;\n expectedOutcome: ExpectedOutcomeField[];\n}\n\n/**\n * Formats test cases as a JSON string suitable for saving as a test suite\n * @param testCases - Array of test cases to format\n * @returns JSON string representation of the test suite\n */\nexport function formatTestSuiteAsJson(testCases: TestCase[]): string {\n const exportData: TestSuiteExportData[] = testCases.map(testCase => ({\n id: testCase.id,\n question: testCase.question,\n expectedOutcome: testCase.expectedOutcome,\n }));\n\n return JSON.stringify(exportData, null, 2);\n}\n"]}
@@ -1,5 +1,5 @@
1
1
  import { v4 as uuidv4 } from "uuid";
2
- import { EvaluationApproach } from "../evaluation/constants";
2
+ import { normalizeEvaluationParametersForField } from "../evaluation/field-evaluation-approach";
3
3
  export const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
4
4
  {
5
5
  type: 'textarea',
@@ -8,6 +8,12 @@ export const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
8
8
  rows: 2,
9
9
  },
10
10
  ];
11
+ function normalizeExpectedOutcomeField(field) {
12
+ return {
13
+ ...field,
14
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
15
+ };
16
+ }
11
17
  /**
12
18
  * Creates a new test case with default values
13
19
  * @returns A new TestCase object with a unique ID
@@ -17,9 +23,6 @@ export function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_
17
23
  id: uuidv4(),
18
24
  question: '',
19
25
  expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
20
- evaluationParameters: {
21
- approach: EvaluationApproach.EXACT,
22
- },
23
26
  isRunning: false,
24
27
  };
25
28
  }
@@ -29,35 +32,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
29
32
  return {
30
33
  type: 'text',
31
34
  label: schemaField.label,
32
- required: schemaField.required,
33
35
  placeholder: schemaField.placeholder,
34
36
  value: '',
37
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
35
38
  };
36
39
  case 'textarea':
37
40
  return {
38
41
  type: 'textarea',
39
42
  label: schemaField.label,
40
- required: schemaField.required,
41
43
  placeholder: schemaField.placeholder,
42
44
  rows: schemaField.rows,
43
45
  value: '',
46
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
44
47
  };
45
48
  case 'chips-input':
46
49
  return {
47
50
  type: 'chips-input',
48
51
  label: schemaField.label,
49
- required: schemaField.required,
50
52
  placeholder: schemaField.placeholder,
51
53
  value: [],
54
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
52
55
  };
53
56
  case 'select':
54
57
  return {
55
58
  type: 'select',
56
59
  label: schemaField.label,
57
- required: schemaField.required,
58
60
  placeholder: schemaField.placeholder,
59
61
  value: '',
60
62
  options: schemaField.options,
63
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
61
64
  };
62
65
  default: {
63
66
  const _exhaustiveCheck = schemaField;
@@ -68,31 +71,18 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
68
71
  export function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
69
72
  return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
70
73
  }
71
- export function migrateLegacyExpectedOutcomeString(value) {
72
- return [
73
- {
74
- type: 'textarea',
75
- label: 'Expected Outcome',
76
- value,
77
- },
78
- ];
79
- }
80
74
  /**
81
75
  * Creates a runtime test case from validated input data.
82
- * The input is expected to already satisfy `TestCaseInput` (legacy string or v2 shape),
83
- * and this function only performs normalization/defaulting (including legacy migration).
76
+ * The input is expected to already satisfy `TestCaseInput`,
77
+ * and this function only performs normalization/defaulting.
84
78
  *
85
79
  * @param data - Validated test case input
86
80
  * @returns A normalized TestCase object with runtime defaults applied
87
81
  */
88
82
  export function createTestCaseFromInput(data) {
89
- let expectedOutcome;
90
- if (typeof data.expectedOutcome === 'string') {
91
- expectedOutcome = migrateLegacyExpectedOutcomeString(data.expectedOutcome);
92
- }
93
- else {
94
- expectedOutcome = data.expectedOutcome;
95
- }
96
- return { ...data, expectedOutcome };
83
+ return {
84
+ ...data,
85
+ expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
86
+ };
97
87
  }
98
88
  //# sourceMappingURL=test-case-factory.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"test-case-factory.js","sourceRoot":"","sources":["../../../src/lib/test-cases/test-case-factory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,EAAE,IAAI,MAAM,EAAE,MAAM,MAAM,CAAC;AAQpC,OAAO,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAE7D,MAAM,CAAC,MAAM,+BAA+B,GAA0B;IACpE;QACE,IAAI,EAAE,UAAU;QAChB,KAAK,EAAE,kBAAkB;QACzB,WAAW,EAAE,2BAA2B;QACxC,IAAI,EAAE,CAAC;KACR;CACF,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,cAAc,CAC5B,wBAA+C,+BAA+B;IAE9E,OAAO;QACL,EAAE,EAAE,MAAM,EAAE;QACZ,QAAQ,EAAE,EAAE;QACZ,eAAe,EAAE,+BAA+B,CAAC,qBAAqB,CAAC;QACvE,oBAAoB,EAAE;YACpB,QAAQ,EAAE,kBAAkB,CAAC,KAAK;SACnC;QACD,SAAS,EAAE,KAAK;KACjB,CAAC;AACJ,CAAC;AAED,SAAS,oCAAoC,CAC3C,WAAuC;IAEvC,QAAQ,WAAW,CAAC,IAAI,EAAE,CAAC;QACzB,KAAK,MAAM;YACT,OAAO;gBACL,IAAI,EAAE,MAAM;gBACZ,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ;gBAC9B,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;aACV,CAAC;QAEJ,KAAK,UAAU;YACb,OAAO;gBACL,IAAI,EAAE,UAAU;gBAChB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ;gBAC9B,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,IAAI,EAAE,WAAW,CAAC,IAAI;gBACtB,KAAK,EAAE,EAAE;aACV,CAAC;QAEJ,KAAK,aAAa;YAChB,OAAO;gBACL,IAAI,EAAE,aAAa;gBACnB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ;gBAC9B,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;aACV,CAAC;QAEJ,KAAK,QAAQ;YACX,OAAO;gBACL,IAAI,EAAE,QAAQ;gBACd,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ;gBAC9B,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;gBACT,OAAO,EAAE,WAAW,CAAC,OAAO;aAC7B,CAAC;QAEJ,OAAO,CAAC,CAAC,CAAC;YACR,MAAM,gBAAgB,GAAU,WAAW,CAAC;YAC5C,OAAO,gBAAgB,CAAC;QAC1B,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,UAAU,+BAA+B,CAC7C,qBAA4C;IAE5C,OAAO,qBAAqB,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;AACzE,CAAC;AAED,MAAM,UAAU,kCAAkC,CAChD,KAAa;IAEb,OAAO;QACL;YACE,IAAI,EAAE,UAAU;YAChB,KAAK,EAAE,kBAAkB;YACzB,KAAK;SACN;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,uBAAuB,CAAC,IAAmB;IACzD,IAAI,eAAuC,CAAC;IAC5C,IAAI,OAAO,IAAI,CAAC,eAAe,KAAK,QAAQ,EAAE,CAAC;QAC7C,eAAe,GAAG,kCAAkC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC7E,CAAC;SAAM,CAAC;QACN,eAAe,GAAG,IAAI,CAAC,eAAe,CAAC;IACzC,CAAC;IAED,OAAO,EAAE,GAAG,IAAI,EAAE,eAAe,EAAE,CAAC;AACtC,CAAC","sourcesContent":["import { v4 as uuidv4 } from 'uuid';\nimport {\n ExpectedOutcomeField,\n ExpectedOutcomeSchema,\n ExpectedOutcomeSchemaField,\n TestCase,\n TestCaseInput,\n} from '../../types/llm-test-runner';\nimport { EvaluationApproach } from '../evaluation/constants';\n\nexport const DEFAULT_EXPECTED_OUTCOME_SCHEMA: ExpectedOutcomeSchema = [\n {\n type: 'textarea',\n label: 'Expected Outcome',\n placeholder: 'Enter expected outcome...',\n rows: 2,\n },\n];\n\n/**\n * Creates a new test case with default values\n * @returns A new TestCase object with a unique ID\n */\nexport function createTestCase(\n expectedOutcomeSchema: ExpectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA,\n): TestCase {\n return {\n id: uuidv4(),\n question: '',\n expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),\n evaluationParameters: {\n approach: EvaluationApproach.EXACT,\n },\n isRunning: false,\n };\n}\n\nfunction createExpectedOutcomeFieldFromSchema(\n schemaField: ExpectedOutcomeSchemaField,\n): ExpectedOutcomeField {\n switch (schemaField.type) {\n case 'text':\n return {\n type: 'text',\n label: schemaField.label,\n required: schemaField.required,\n placeholder: schemaField.placeholder,\n value: '',\n };\n\n case 'textarea':\n return {\n type: 'textarea',\n label: schemaField.label,\n required: schemaField.required,\n placeholder: schemaField.placeholder,\n rows: schemaField.rows,\n value: '',\n };\n\n case 'chips-input':\n return {\n type: 'chips-input',\n label: schemaField.label,\n required: schemaField.required,\n placeholder: schemaField.placeholder,\n value: [],\n };\n\n case 'select':\n return {\n type: 'select',\n label: schemaField.label,\n required: schemaField.required,\n placeholder: schemaField.placeholder,\n value: '',\n options: schemaField.options,\n };\n\n default: {\n const _exhaustiveCheck: never = schemaField;\n return _exhaustiveCheck;\n }\n }\n}\n\nexport function createExpectedOutcomeFromSchema(\n expectedOutcomeSchema: ExpectedOutcomeSchema,\n): ExpectedOutcomeField[] {\n return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);\n}\n\nexport function migrateLegacyExpectedOutcomeString(\n value: string,\n): ExpectedOutcomeField[] {\n return [\n {\n type: 'textarea',\n label: 'Expected Outcome',\n value,\n },\n ];\n}\n\n/**\n * Creates a runtime test case from validated input data.\n * The input is expected to already satisfy `TestCaseInput` (legacy string or v2 shape),\n * and this function only performs normalization/defaulting (including legacy migration).\n *\n * @param data - Validated test case input\n * @returns A normalized TestCase object with runtime defaults applied\n */\nexport function createTestCaseFromInput(data: TestCaseInput): TestCase {\n let expectedOutcome: ExpectedOutcomeField[];\n if (typeof data.expectedOutcome === 'string') {\n expectedOutcome = migrateLegacyExpectedOutcomeString(data.expectedOutcome);\n } else {\n expectedOutcome = data.expectedOutcome;\n }\n\n return { ...data, expectedOutcome };\n}\n"]}
1
+ {"version":3,"file":"test-case-factory.js","sourceRoot":"","sources":["../../../src/lib/test-cases/test-case-factory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,EAAE,IAAI,MAAM,EAAE,MAAM,MAAM,CAAC;AASpC,OAAO,EAAE,qCAAqC,EAAE,MAAM,yCAAyC,CAAC;AAEhG,MAAM,CAAC,MAAM,+BAA+B,GAA0B;IACpE;QACE,IAAI,EAAE,UAAU;QAChB,KAAK,EAAE,kBAAkB;QACzB,WAAW,EAAE,2BAA2B;QACxC,IAAI,EAAE,CAAC;KACR;CACF,CAAC;AAEF,SAAS,6BAA6B,CACpC,KAA2B;IAE3B,OAAO;QACL,GAAG,KAAK;QACR,oBAAoB,EAAE,qCAAqC,CACzD,KAAK,CAAC,IAAI,EACV,KAAK,CAAC,oBAAoB,CAC3B;KACF,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,cAAc,CAC5B,wBAA+C,+BAA+B;IAE9E,OAAO;QACL,EAAE,EAAE,MAAM,EAAE;QACZ,QAAQ,EAAE,EAAE;QACZ,eAAe,EAAE,+BAA+B,CAAC,qBAAqB,CAAC;QACvE,SAAS,EAAE,KAAK;KACjB,CAAC;AACJ,CAAC;AAED,SAAS,oCAAoC,CAC3C,WAAuC;IAEvC,QAAQ,WAAW,CAAC,IAAI,EAAE,CAAC;QACzB,KAAK,MAAM;YACT,OAAO;gBACL,IAAI,EAAE,MAAM;gBACZ,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;gBACT,oBAAoB,EAAE,qCAAqC,CACzD,WAAW,CAAC,IAAI,EAChB,WAAW,CAAC,oBAAoB,CACjC;aACF,CAAC;QAEJ,KAAK,UAAU;YACb,OAAO;gBACL,IAAI,EAAE,UAAU;gBAChB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,IAAI,EAAE,WAAW,CAAC,IAAI;gBACtB,KAAK,EAAE,EAAE;gBACT,oBAAoB,EAAE,qCAAqC,CACzD,WAAW,CAAC,IAAI,EAChB,WAAW,CAAC,oBAAoB,CACjC;aACF,CAAC;QAEJ,KAAK,aAAa;YAChB,OAAO;gBACL,IAAI,EAAE,aAAa;gBACnB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;gBACT,oBAAoB,EAAE,qCAAqC,CACzD,WAAW,CAAC,IAAI,EAChB,WAAW,CAAC,oBAAoB,CACjC;aACF,CAAC;QAEJ,KAAK,QAAQ;YACX,OAAO;gBACL,IAAI,EAAE,QAAQ;gBACd,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;gBACT,OAAO,EAAE,WAAW,CAAC,OAAO;gBAC5B,oBAAoB,EAAE,qCAAqC,CACzD,WAAW,CAAC,IAAI,EAChB,WAAW,CAAC,oBAAoB,CAC6B;aAChE,CAAC;QAEJ,OAAO,CAAC,CAAC,CAAC;YACR,MAAM,gBAAgB,GAAU,WAAW,CAAC;YAC5C,OAAO,gBAAgB,CAAC;QAC1B,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,UAAU,+BAA+B,CAC7C,qBAA4C;IAE5C,OAAO,qBAAqB,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;AACzE,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,uBAAuB,CAAC,IAAmB;IACzD,OAAO;QACL,GAAG,IAAI;QACP,eAAe,EAAE,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,6BAA6B,CAAC;KACzE,CAAC;AACJ,CAAC","sourcesContent":["import { v4 as uuidv4 } from 'uuid';\nimport {\n ExpectedOutcomeField,\n ExpectedOutcomeSchema,\n ExpectedOutcomeSchemaField,\n TestCase,\n TestCaseInput,\n} from '../../types/llm-test-runner';\nimport { EvaluationApproach } from '../evaluation/constants';\nimport { normalizeEvaluationParametersForField } from '../evaluation/field-evaluation-approach';\n\nexport const DEFAULT_EXPECTED_OUTCOME_SCHEMA: ExpectedOutcomeSchema = [\n {\n type: 'textarea',\n label: 'Expected Outcome',\n placeholder: 'Enter expected outcome...',\n rows: 2,\n },\n];\n\nfunction normalizeExpectedOutcomeField(\n field: ExpectedOutcomeField,\n): ExpectedOutcomeField {\n return {\n ...field,\n evaluationParameters: normalizeEvaluationParametersForField(\n field.type,\n field.evaluationParameters,\n ),\n };\n}\n\n/**\n * Creates a new test case with default values\n * @returns A new TestCase object with a unique ID\n */\nexport function createTestCase(\n expectedOutcomeSchema: ExpectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA,\n): TestCase {\n return {\n id: uuidv4(),\n question: '',\n expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),\n isRunning: false,\n };\n}\n\nfunction createExpectedOutcomeFieldFromSchema(\n schemaField: ExpectedOutcomeSchemaField,\n): ExpectedOutcomeField {\n switch (schemaField.type) {\n case 'text':\n return {\n type: 'text',\n label: schemaField.label,\n placeholder: schemaField.placeholder,\n value: '',\n evaluationParameters: normalizeEvaluationParametersForField(\n schemaField.type,\n schemaField.evaluationParameters,\n ),\n };\n\n case 'textarea':\n return {\n type: 'textarea',\n label: schemaField.label,\n placeholder: schemaField.placeholder,\n rows: schemaField.rows,\n value: '',\n evaluationParameters: normalizeEvaluationParametersForField(\n schemaField.type,\n schemaField.evaluationParameters,\n ),\n };\n\n case 'chips-input':\n return {\n type: 'chips-input',\n label: schemaField.label,\n placeholder: schemaField.placeholder,\n value: [],\n evaluationParameters: normalizeEvaluationParametersForField(\n schemaField.type,\n schemaField.evaluationParameters,\n ),\n };\n\n case 'select':\n return {\n type: 'select',\n label: schemaField.label,\n placeholder: schemaField.placeholder,\n value: '',\n options: schemaField.options,\n evaluationParameters: normalizeEvaluationParametersForField(\n schemaField.type,\n schemaField.evaluationParameters,\n ) as { approach: EvaluationApproach.EXACT; threshold?: number },\n };\n\n default: {\n const _exhaustiveCheck: never = schemaField;\n return _exhaustiveCheck;\n }\n }\n}\n\nexport function createExpectedOutcomeFromSchema(\n expectedOutcomeSchema: ExpectedOutcomeSchema,\n): ExpectedOutcomeField[] {\n return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);\n}\n\n/**\n * Creates a runtime test case from validated input data.\n * The input is expected to already satisfy `TestCaseInput`,\n * and this function only performs normalization/defaulting.\n *\n * @param data - Validated test case input\n * @returns A normalized TestCase object with runtime defaults applied\n */\nexport function createTestCaseFromInput(data: TestCaseInput): TestCase {\n return {\n ...data,\n expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),\n };\n}\n"]}
@@ -1,16 +1,67 @@
1
+ import { normalizeEvaluationParametersForField } from "../evaluation/field-evaluation-approach";
2
+ export function applyExpectedOutcomeChange(testCase, change) {
3
+ const { index } = change;
4
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5
+ const target = expectedOutcome[index];
6
+ if (!target) {
7
+ return testCase;
8
+ }
9
+ switch (change.operation) {
10
+ case 'set-value': {
11
+ if (target.type === 'chips-input') {
12
+ return testCase;
13
+ }
14
+ expectedOutcome[index] = {
15
+ ...target,
16
+ value: change.value,
17
+ };
18
+ return { ...testCase, expectedOutcome };
19
+ }
20
+ case 'add-chip': {
21
+ if (target.type !== 'chips-input') {
22
+ return testCase;
23
+ }
24
+ expectedOutcome[index] = {
25
+ ...target,
26
+ value: [...target.value, change.value],
27
+ };
28
+ return { ...testCase, expectedOutcome };
29
+ }
30
+ case 'remove-chip': {
31
+ if (target.type !== 'chips-input') {
32
+ return testCase;
33
+ }
34
+ expectedOutcome[index] = {
35
+ ...target,
36
+ value: target.value.filter(chip => chip !== change.value),
37
+ };
38
+ return { ...testCase, expectedOutcome };
39
+ }
40
+ case 'set-evaluation-approach':
41
+ return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
42
+ }
43
+ }
1
44
  /**
2
- * Updates the evaluation approach for a test case
3
- * @param testCase - The test case to update
4
- * @param approach - The new evaluation approach
5
- * @returns Updated test case with the new evaluation approach
45
+ * Updates the evaluation approach for a specific expected outcome field.
46
+ * Select fields always use exact matching.
6
47
  */
7
- export function updateApproach(testCase, approach) {
48
+ export function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
49
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
50
+ const target = expectedOutcome[fieldIndex];
51
+ if (!target) {
52
+ return testCase;
53
+ }
54
+ const currentEvaluationParameters = target.evaluationParameters;
55
+ expectedOutcome[fieldIndex] = {
56
+ ...target,
57
+ evaluationParameters: normalizeEvaluationParametersForField(target.type, {
58
+ ...currentEvaluationParameters,
59
+ approach,
60
+ }),
61
+ };
8
62
  return {
9
63
  ...testCase,
10
- evaluationParameters: {
11
- ...testCase.evaluationParameters,
12
- approach: approach,
13
- },
64
+ expectedOutcome,
14
65
  };
15
66
  }
16
67
  //# sourceMappingURL=test-case-mutations.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"test-case-mutations.js","sourceRoot":"","sources":["../../../src/lib/test-cases/test-case-mutations.ts"],"names":[],"mappings":"AAGA;;;;;GAKG;AACH,MAAM,UAAU,cAAc,CAC5B,QAAkB,EAClB,QAA4B;IAE5B,OAAO;QACL,GAAG,QAAQ;QACX,oBAAoB,EAAE;YACpB,GAAG,QAAQ,CAAC,oBAAoB;YAChC,QAAQ,EAAE,QAAQ;SACnB;KACF,CAAC;AACJ,CAAC","sourcesContent":["import { TestCase } from '../../types/llm-test-runner';\nimport { EvaluationApproach } from '../evaluation/constants';\n\n/**\n * Updates the evaluation approach for a test case\n * @param testCase - The test case to update\n * @param approach - The new evaluation approach\n * @returns Updated test case with the new evaluation approach\n */\nexport function updateApproach(\n testCase: TestCase,\n approach: EvaluationApproach,\n): TestCase {\n return {\n ...testCase,\n evaluationParameters: {\n ...testCase.evaluationParameters,\n approach: approach,\n },\n };\n}\n"]}
1
+ {"version":3,"file":"test-case-mutations.js","sourceRoot":"","sources":["../../../src/lib/test-cases/test-case-mutations.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,qCAAqC,EAAE,MAAM,yCAAyC,CAAC;AAwBhG,MAAM,UAAU,0BAA0B,CACxC,QAAkB,EAClB,MAA6B;IAE7B,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,CAAC;IACzB,MAAM,eAAe,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,MAAM,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;IAEtC,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,QAAQ,MAAM,CAAC,SAAS,EAAE,CAAC;QACzB,KAAK,WAAW,CAAC,CAAC,CAAC;YACjB,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;gBAClC,OAAO,QAAQ,CAAC;YAClB,CAAC;YACD,eAAe,CAAC,KAAK,CAAC,GAAG;gBACvB,GAAG,MAAM;gBACT,KAAK,EAAE,MAAM,CAAC,KAAK;aACpB,CAAC;YACF,OAAO,EAAE,GAAG,QAAQ,EAAE,eAAe,EAAE,CAAC;QAC1C,CAAC;QACD,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;gBAClC,OAAO,QAAQ,CAAC;YAClB,CAAC;YACD,eAAe,CAAC,KAAK,CAAC,GAAG;gBACvB,GAAG,MAAM;gBACT,KAAK,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC;aACvC,CAAC;YACF,OAAO,EAAE,GAAG,QAAQ,EAAE,eAAe,EAAE,CAAC;QAC1C,CAAC;QACD,KAAK,aAAa,CAAC,CAAC,CAAC;YACnB,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;gBAClC,OAAO,QAAQ,CAAC;YAClB,CAAC;YACD,eAAe,CAAC,KAAK,CAAC,GAAG;gBACvB,GAAG,MAAM;gBACT,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,KAAK,MAAM,CAAC,KAAK,CAAC;aAC1D,CAAC;YACF,OAAO,EAAE,GAAG,QAAQ,EAAE,eAAe,EAAE,CAAC;QAC1C,CAAC;QACD,KAAK,yBAAyB;YAC5B,OAAO,kCAAkC,CAAC,QAAQ,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;IAC7E,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kCAAkC,CAChD,QAAkB,EAClB,UAAkB,EAClB,QAA4B;IAE5B,MAAM,eAAe,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,MAAM,GAAG,eAAe,CAAC,UAAU,CAAC,CAAC;IAE3C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,MAAM,2BAA2B,GAAG,MAAM,CAAC,oBAAoB,CAAC;IAChE,eAAe,CAAC,UAAU,CAAC,GAAG;QAC5B,GAAG,MAAM;QACT,oBAAoB,EAAE,qCAAqC,CAAC,MAAM,CAAC,IAAI,EAAE;YACvE,GAAG,2BAA2B;YAC9B,QAAQ;SACT,CAAC;KACH,CAAC;IAEF,OAAO;QACL,GAAG,QAAQ;QACX,eAAe;KAChB,CAAC;AACJ,CAAC","sourcesContent":["import { TestCase } from '../../types/llm-test-runner';\nimport { EvaluationApproach } from '../evaluation/constants';\nimport { normalizeEvaluationParametersForField } from '../evaluation/field-evaluation-approach';\n\nexport type ExpectedOutcomeChange =\n | {\n index: number;\n operation: 'set-value';\n value: string;\n }\n | {\n index: number;\n operation: 'add-chip';\n value: string;\n }\n | {\n index: number;\n operation: 'remove-chip';\n value: string;\n }\n | {\n index: number;\n operation: 'set-evaluation-approach';\n value: EvaluationApproach;\n };\n\nexport function applyExpectedOutcomeChange(\n testCase: TestCase,\n change: ExpectedOutcomeChange,\n): TestCase {\n const { index } = change;\n const expectedOutcome = [...(testCase.expectedOutcome || [])];\n const target = expectedOutcome[index];\n\n if (!target) {\n return testCase;\n }\n\n switch (change.operation) {\n case 'set-value': {\n if (target.type === 'chips-input') {\n return testCase;\n }\n expectedOutcome[index] = {\n ...target,\n value: change.value,\n };\n return { ...testCase, expectedOutcome };\n }\n case 'add-chip': {\n if (target.type !== 'chips-input') {\n return testCase;\n }\n expectedOutcome[index] = {\n ...target,\n value: [...target.value, change.value],\n };\n return { ...testCase, expectedOutcome };\n }\n case 'remove-chip': {\n if (target.type !== 'chips-input') {\n return testCase;\n }\n expectedOutcome[index] = {\n ...target,\n value: target.value.filter(chip => chip !== change.value),\n };\n return { ...testCase, expectedOutcome };\n }\n case 'set-evaluation-approach':\n return updateExpectedOutcomeFieldApproach(testCase, index, change.value);\n }\n}\n\n/**\n * Updates the evaluation approach for a specific expected outcome field.\n * Select fields always use exact matching.\n */\nexport function updateExpectedOutcomeFieldApproach(\n testCase: TestCase,\n fieldIndex: number,\n approach: EvaluationApproach,\n): TestCase {\n const expectedOutcome = [...(testCase.expectedOutcome || [])];\n const target = expectedOutcome[fieldIndex];\n\n if (!target) {\n return testCase;\n }\n\n const currentEvaluationParameters = target.evaluationParameters;\n expectedOutcome[fieldIndex] = {\n ...target,\n evaluationParameters: normalizeEvaluationParametersForField(target.type, {\n ...currentEvaluationParameters,\n approach,\n }),\n };\n\n return {\n ...testCase,\n expectedOutcome,\n };\n}\n"]}
@@ -1,28 +1,46 @@
1
1
  import { z } from "zod";
2
+ import { EvaluationApproach } from "../lib/evaluation/constants";
3
+ import { isApproachAllowedForFieldType } from "../lib/evaluation/field-evaluation-approach";
2
4
  const nonEmptyString = z.string().trim().min(1);
3
5
  const optionalPositiveInt = z.number().int().positive().optional();
4
6
  const optionalString = z.string().optional();
5
- const optionalBoolean = z.boolean().optional();
6
7
  const selectOptionsSchema = z.array(nonEmptyString).min(1);
8
+ const optionalNumber = z.number().optional();
9
+ const evaluationParametersSchema = z.object({
10
+ approach: z.enum(EvaluationApproach),
11
+ threshold: optionalNumber,
12
+ });
13
+ const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
14
+ if (!isApproachAllowedForFieldType('select', parameters.approach)) {
15
+ ctx.addIssue({
16
+ code: 'custom',
17
+ path: ['approach'],
18
+ message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
19
+ });
20
+ }
21
+ });
7
22
  const defaultExpectedOutcomeBaseSchema = z.object({
8
23
  label: nonEmptyString,
9
- required: optionalBoolean,
10
24
  placeholder: optionalString,
11
25
  });
12
26
  const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
13
27
  text: baseSchema.extend({
14
28
  type: z.literal('text'),
29
+ evaluationParameters: evaluationParametersSchema.optional(),
15
30
  }),
16
31
  textarea: baseSchema.extend({
17
32
  type: z.literal('textarea'),
18
33
  rows: optionalPositiveInt,
34
+ evaluationParameters: evaluationParametersSchema.optional(),
19
35
  }),
20
36
  chipsInput: baseSchema.extend({
21
37
  type: z.literal('chips-input'),
38
+ evaluationParameters: evaluationParametersSchema.optional(),
22
39
  }),
23
40
  select: baseSchema.extend({
24
41
  type: z.literal('select'),
25
42
  options: selectOptionsSchema,
43
+ evaluationParameters: selectEvaluationParametersSchema.optional(),
26
44
  }),
27
45
  });
28
46
  function hasDuplicateChips(values) {