llm-testrunner-components 1.2.4 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/dist/cjs/{app-chips_5.cjs.entry.js → app-chips_4.cjs.entry.js} +20 -22
- package/dist/cjs/app-chips_4.cjs.entry.js.map +1 -0
- package/dist/cjs/index.cjs.js +464 -66
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-test-runner.cjs.entry.js +11 -0
- package/dist/cjs/llm-test-runner.cjs.entry.js.map +1 -0
- package/dist/cjs/llm-testrunner.cjs.js +1 -1
- package/dist/cjs/loader.cjs.js +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +46 -13
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/chat-history.css +5 -5
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +45 -5
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +21 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.js +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.js.map +1 -1
- package/dist/collection/demo/demo-modes.js +130 -0
- package/dist/collection/demo/vanilla-demo.js +56 -0
- package/dist/collection/lib/evaluation/actual-value-resolver.js +52 -0
- package/dist/collection/lib/evaluation/actual-value-resolver.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluation-engine.js +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +55 -17
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/form/components/app-textarea.css +2 -2
- package/dist/collection/lib/import-export/test-suite-importer.js +7 -1
- package/dist/collection/lib/import-export/test-suite-importer.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +5 -0
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +58 -23
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +39 -0
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/model-response.js +7 -0
- package/dist/collection/schemas/model-response.js.map +1 -0
- package/dist/collection/schemas/test-case.js +2 -1
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/expected-outcome.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/components/app-textarea.js +1 -1
- package/dist/components/chat-history.js +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/{p-B87Lt3z4.js → p-D3eincg_.js} +3 -3
- package/dist/components/p-D3eincg_.js.map +1 -0
- package/dist/components/{p-D2qDAxFN.js → p-D6BL2E3J.js} +2 -2
- package/dist/components/{p-D2qDAxFN.js.map → p-D6BL2E3J.js.map} +1 -1
- package/dist/components/p-kmtfMXcQ.js +2 -0
- package/dist/components/p-kmtfMXcQ.js.map +1 -0
- package/dist/esm/{app-chips_5.entry.js → app-chips_4.entry.js} +4 -5
- package/dist/esm/app-chips_4.entry.js.map +1 -0
- package/dist/esm/index.js +464 -66
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-test-runner.entry.js +5 -0
- package/dist/esm/llm-test-runner.entry.js.map +1 -0
- package/dist/esm/llm-testrunner.js +1 -1
- package/dist/esm/loader.js +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/llm-testrunner/p-c3fec0bb.entry.js +2 -0
- package/dist/llm-testrunner/{p-21202f12.entry.js.map → p-c3fec0bb.entry.js.map} +1 -1
- package/dist/llm-testrunner/p-caccdb4b.entry.js +2 -0
- package/dist/llm-testrunner/p-caccdb4b.entry.js.map +1 -0
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +3 -4
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/output/response-output.d.ts +2 -1
- package/dist/types/components.d.ts +4 -2
- package/dist/types/lib/evaluation/actual-value-resolver.d.ts +9 -0
- package/dist/types/lib/evaluation/evaluation-service.d.ts +2 -2
- package/dist/types/lib/evaluation/types.d.ts +1 -1
- package/dist/types/lib/import-export/test-suite-importer.d.ts +1 -1
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +10 -1
- package/dist/types/schemas/expected-outcome.d.ts +116 -0
- package/dist/types/schemas/model-response.d.ts +7 -0
- package/dist/types/schemas/test-case.d.ts +76 -1
- package/dist/types/types/expected-outcome.d.ts +1 -1
- package/dist/types/types/llm-test-runner.d.ts +4 -2
- package/package.json +1 -1
- package/dist/cjs/app-chips_5.cjs.entry.js.map +0 -1
- package/dist/components/p-B87Lt3z4.js.map +0 -1
- package/dist/components/p-Bx2jqguC.js +0 -2
- package/dist/components/p-Bx2jqguC.js.map +0 -1
- package/dist/esm/app-chips_5.entry.js.map +0 -1
- package/dist/llm-testrunner/p-21202f12.entry.js +0 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm-test-case-row.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-case-row.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,iCAAiC,CAAC;AACpE,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAkB,MAAM,0BAA0B,CAAC;AACzE,OAAO,EAEL,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;
|
|
1
|
+
{"version":3,"file":"llm-test-case-row.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-case-row.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,iCAAiC,CAAC;AACpE,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAkB,MAAM,0BAA0B,CAAC;AACzE,OAAO,EAEL,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AAsBrC,MAAM,CAAC,MAAM,cAAc,GAA6C,CAAC,EACvE,QAAQ,EACR,0BAA0B,GAAG,KAAK,EAClC,YAAY,GAAG,EAAE,EACjB,KAAK,EACL,QAAQ,EACR,oBAAoB,EACpB,uBAAuB,EACvB,mBAAmB,GACpB,EAAE,EAAE;IACH,MAAM,cAAc,GAAmB;QACrC,IAAI,EAAE,UAAU;QAChB,SAAS,EAAE,aAAa,CAAC,SAAS;QAClC,IAAI,EAAE,MAAM;QACZ,KAAK,EAAE,UAAU;QACjB,WAAW,EAAE,6BAA6B;QAC1C,QAAQ,EAAE,IAAI;QACd,IAAI,EAAE,CAAC;KACR,CAAC;IACF,OAAO,CACL,WAAK,KAAK,EAAC,eAAe,EAAC,GAAG,EAAE,QAAQ,CAAC,EAAE;QACzC,WAAK,KAAK,EAAC,6BAA6B;YACtC,oBACE,MAAM,EAAE,cAAc,EACtB,KAAK,EAAE,QAAQ,CAAC,QAAQ,EACxB,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,oBAAoB,CAAC;oBACnB,MAAM,EAAE;wBACN,UAAU,EAAE,QAAQ,CAAC,EAAE;wBACvB,GAAG,EAAE,UAAU;wBACf,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB;iBACiE,CAAC,GAEvE;YACF,oBACE,kBAAkB,EAAE,QAAQ,CAAC,WAAW,EAAE,OAAO,IAAI,KAAK,EAC1D,gBAAgB,EAAE,QAAQ,CAAC,WAAW,EAAE,KAAK,IAAI,EAAE,EACnD,mBAAmB,EAAE,CAAC,CAAQ,EAAE,EAAE;oBAChC,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,GAAI,CAA0C;yBACnE,MAAM,CAAC;oBACV,mBAAmB,CAAC;wBAClB,MAAM,EAAE;4BACN,UAAU,EAAE,QAAQ,CAAC,EAAE;4BACvB,OAAO;4BACP,KAAK;yBACN;qBACyC,CAAC,CAAC;gBAChD,CAAC,GACD;YACF,EAAC,uBAAuB,IACtB,UAAU,EAAE,QAAQ,CAAC,EAAE,EACvB,MAAM,EAAE,QAAQ,CAAC,eAAe,IAAI,EAAE,EACtC,0BAA0B,EAAE,0BAA0B,EACtD,YAAY,EAAE,YAAY,EAC1B,uBAAuB,EAAE,uBAAuB,GAChD,CACE;QAEN,EAAC,cAAc,IAAC,MAAM,EAAE,QAAQ,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,CAAC,SAAS,GAAI;QAE1E,EAAC,iBAAiB,IAChB,MAAM,EAAE,QAAQ,CAAC,gBAAgB,EACjC,SAAS,EAAE,QAAQ,CAAC,SAAS,GAC7B;QAEF,EAAC,UAAU,IACT,SAAS,EAAE,QAAQ,CAAC,SAAS,EAC7B,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,EAClC,KAAK,EAAE,GAAG,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,EAC5B,QAAQ,EAAE,GAAG,EAAE,CAAC,QAAQ,
CAAC,QAAQ,CAAC,EAAE,CAAC,GACrC,CACE,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport { TestCase } from '../../../types/llm-test-runner';\nimport { ResponseOutput } from './output/response-output';\nimport { EvaluationSummary } from './evaluation/evaluation-summary';\nimport { RowActions } from './actions/row-actions';\nimport { FormFieldType, TextAreaConfig } from '../../../lib/form/schema';\nimport {\n ExpectedOutcomeChangeDetail,\n ExpectedOutcomeRenderer,\n} from './expected-outcome-renderer';\nimport type { ChatHistoryChangeDetail } from './chat-history';\n\nexport type ChatHistoryRowChangeDetail = {\n testCaseId: string;\n} & ChatHistoryChangeDetail;\n\nexport interface LLMTestCaseRowProps {\n testCase: TestCase;\n dynamicResolutionSupported?: boolean;\n extractorIds?: string[];\n onRun: (testCase: TestCase) => void;\n onDelete: (id: string) => void;\n handleTestCaseChange: (\n e: CustomEvent<{ testCaseId: string; key: string; value: string }>,\n ) => void;\n onExpectedOutcomeChange: (\n e: CustomEvent<ExpectedOutcomeChangeDetail>,\n ) => void;\n onChatHistoryChange: (e: CustomEvent<ChatHistoryRowChangeDetail>) => void;\n}\n\nexport const LLMTestCaseRow: FunctionalComponent<LLMTestCaseRowProps> = ({\n testCase,\n dynamicResolutionSupported = false,\n extractorIds = [],\n onRun,\n onDelete,\n handleTestCaseChange,\n onExpectedOutcomeChange,\n onChatHistoryChange,\n}) => {\n const questionConfig: TextAreaConfig = {\n name: 'question',\n fieldType: FormFieldType.TEXT_AREA,\n type: 'text',\n label: 'Question',\n placeholder: 'Enter your question here...',\n required: true,\n rows: 3,\n };\n return (\n <div class=\"test-case-row\" key={testCase.id}>\n <div class=\"test-case-row__input-column\">\n <app-textarea\n config={questionConfig}\n value={testCase.question}\n onValueChange={(e) =>\n handleTestCaseChange({\n detail: {\n testCaseId: testCase.id,\n key: 'question',\n value: e.detail.value,\n },\n } as 
CustomEvent<{ testCaseId: string; key: string; value: string }>)\n }\n />\n <chat-history\n chatHistoryEnabled={testCase.chatHistory?.enabled ?? false}\n chatHistoryValue={testCase.chatHistory?.value ?? ''}\n onChatHistoryChange={(e: Event) => {\n const { enabled, value } = (e as CustomEvent<ChatHistoryChangeDetail>)\n .detail;\n onChatHistoryChange({\n detail: {\n testCaseId: testCase.id,\n enabled,\n value,\n },\n } as CustomEvent<ChatHistoryRowChangeDetail>);\n }}\n />\n <ExpectedOutcomeRenderer\n testCaseId={testCase.id}\n fields={testCase.expectedOutcome || []}\n dynamicResolutionSupported={dynamicResolutionSupported}\n extractorIds={extractorIds}\n onExpectedOutcomeChange={onExpectedOutcomeChange}\n />\n </div>\n\n <ResponseOutput output={testCase.output} isRunning={testCase.isRunning} />\n\n <EvaluationSummary\n result={testCase.evaluationResult}\n isRunning={testCase.isRunning}\n />\n\n <RowActions\n isRunning={testCase.isRunning}\n canRun={!!testCase.question.trim()}\n onRun={() => onRun(testCase)}\n onDelete={() => onDelete(testCase.id)}\n />\n </div>\n );\n};\n"]}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
2
|
import { LLMTestCaseRow } from "./llm-test-case-row";
|
|
3
3
|
import { Button } from "../../../lib/ui/button/index";
|
|
4
|
-
export const LLMTestCases = ({ testCases, dynamicResolutionSupported = false, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
5
|
-
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, dynamicResolutionSupported: dynamicResolutionSupported, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange, onChatHistoryChange: onChatHistoryChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
4
|
+
/**
 * Functional component rendering the test-case table: a row of four column
 * headers, one LLMTestCaseRow per entry in `testCases`, and an
 * "+ Add Question" button.
 *
 * @param {object} props
 * @param {Array} props.testCases - test cases to render, one row each.
 * @param {boolean} [props.dynamicResolutionSupported=false] - forwarded to every row.
 * @param {string[]} [props.extractorIds=[]] - forwarded to every row.
 * @param {Function} props.onRun - forwarded to every row.
 * @param {Function} props.onDelete - forwarded to every row.
 * @param {Function} props.onAddTestCase - click handler of the add button.
 * @param {Function} props.handleTestCaseChange - forwarded to every row.
 * @param {Function} props.onExpectedOutcomeChange - forwarded to every row.
 * @param {Function} props.onChatHistoryChange - forwarded to every row.
 * @returns the vnode produced by `h` for the whole table.
 */
export const LLMTestCases = (props) => {
    const {
        testCases,
        dynamicResolutionSupported = false,
        extractorIds = [],
        onRun,
        onDelete,
        onAddTestCase,
        handleTestCaseChange,
        onExpectedOutcomeChange,
        onChatHistoryChange,
    } = props;

    // All four header cells share the same class; only the caption differs.
    const headerCell = (caption) => h("div", { class: "test-cases__column-header" }, caption);
    const columnHeaders = h("div", { class: "test-cases__column-headers" },
        headerCell("Input"),
        headerCell("Output"),
        headerCell("Evaluation"),
        headerCell("Actions"));

    // Every row receives the same callbacks and flags, plus its own test case.
    const rows = testCases.map((testCase) => h(LLMTestCaseRow, {
        testCase: testCase,
        dynamicResolutionSupported: dynamicResolutionSupported,
        extractorIds: extractorIds,
        onRun: onRun,
        onDelete: onDelete,
        handleTestCaseChange: handleTestCaseChange,
        onExpectedOutcomeChange: onExpectedOutcomeChange,
        onChatHistoryChange: onChatHistoryChange,
    }));

    const addSection = h("div", { class: "test-cases__add-section" },
        h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"));

    return h("div", { class: "test-cases" }, columnHeaders, rows, addSection);
};
|
|
7
7
|
//# sourceMappingURL=llm-test-cases.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm-test-cases.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-cases.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,EAAE,cAAc,EAA8B,MAAM,qBAAqB,CAAC;AACjF,OAAO,EAAE,MAAM,EAAE,MAAM,8BAA8B,CAAC;
|
|
1
|
+
{"version":3,"file":"llm-test-cases.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-cases.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,EAAE,cAAc,EAA8B,MAAM,qBAAqB,CAAC;AACjF,OAAO,EAAE,MAAM,EAAE,MAAM,8BAA8B,CAAC;AAmBtD,MAAM,CAAC,MAAM,YAAY,GAA2C,CAAC,EACnE,SAAS,EACT,0BAA0B,GAAG,KAAK,EAClC,YAAY,GAAG,EAAE,EACjB,KAAK,EACL,QAAQ,EACR,aAAa,EACb,oBAAoB,EACpB,uBAAuB,EACvB,mBAAmB,GACpB,EAAE,EAAE;IACH,OAAO,CACL,WAAK,KAAK,EAAC,YAAY;QACrB,WAAK,KAAK,EAAC,4BAA4B;YACrC,WAAK,KAAK,EAAC,2BAA2B,YAAY;YAClD,WAAK,KAAK,EAAC,2BAA2B,aAAa;YACnD,WAAK,KAAK,EAAC,2BAA2B,iBAAiB;YACvD,WAAK,KAAK,EAAC,2BAA2B,cAAc,CAChD;QAEL,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,CACzB,EAAC,cAAc,IACb,QAAQ,EAAE,QAAQ,EAClB,0BAA0B,EAAE,0BAA0B,EACtD,YAAY,EAAE,YAAY,EAC1B,KAAK,EAAE,KAAK,EACZ,QAAQ,EAAE,QAAQ,EAClB,oBAAoB,EAAE,oBAAoB,EAC1C,uBAAuB,EAAE,uBAAuB,EAChD,mBAAmB,EAAE,mBAAmB,GACxC,CACH,CAAC;QAEF,WAAK,KAAK,EAAC,yBAAyB;YAClC,EAAC,MAAM,IAAC,OAAO,EAAC,SAAS,EAAC,IAAI,EAAC,IAAI,EAAC,OAAO,EAAE,aAAa,qBAEjD,CACL,CACF,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport { TestCase } from '../../../types/llm-test-runner';\nimport { LLMTestCaseRow, ChatHistoryRowChangeDetail } from './llm-test-case-row';\nimport { Button } from '../../../lib/ui/button/index';\nimport { ExpectedOutcomeChangeDetail } from './expected-outcome-renderer';\n\nexport interface LLMTestCasesProps {\n testCases: TestCase[];\n dynamicResolutionSupported?: boolean;\n extractorIds?: string[];\n onRun: (testCase: TestCase) => void;\n onDelete: (id: string) => void;\n onAddTestCase: () => void;\n handleTestCaseChange: (\n e: CustomEvent<{ testCaseId: string; key: string; value: string }>,\n ) => void;\n onExpectedOutcomeChange: (\n e: CustomEvent<ExpectedOutcomeChangeDetail>,\n ) => void;\n onChatHistoryChange: (e: CustomEvent<ChatHistoryRowChangeDetail>) => void;\n}\n\nexport const LLMTestCases: 
FunctionalComponent<LLMTestCasesProps> = ({\n testCases,\n dynamicResolutionSupported = false,\n extractorIds = [],\n onRun,\n onDelete,\n onAddTestCase,\n handleTestCaseChange,\n onExpectedOutcomeChange,\n onChatHistoryChange,\n}) => {\n return (\n <div class=\"test-cases\">\n <div class=\"test-cases__column-headers\">\n <div class=\"test-cases__column-header\">Input</div>\n <div class=\"test-cases__column-header\">Output</div>\n <div class=\"test-cases__column-header\">Evaluation</div>\n <div class=\"test-cases__column-header\">Actions</div>\n </div>\n\n {testCases.map(testCase => (\n <LLMTestCaseRow\n testCase={testCase}\n dynamicResolutionSupported={dynamicResolutionSupported}\n extractorIds={extractorIds}\n onRun={onRun}\n onDelete={onDelete}\n handleTestCaseChange={handleTestCaseChange}\n onExpectedOutcomeChange={onExpectedOutcomeChange}\n onChatHistoryChange={onChatHistoryChange}\n />\n ))}\n\n <div class=\"test-cases__add-section\">\n <Button variant=\"outline\" size=\"md\" onClick={onAddTestCase}>\n + Add Question\n </Button>\n </div>\n </div>\n );\n};\n"]}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
2
|
/**
 * Functional component showing a model response.
 *
 * When `output.text` is truthy it is rendered inside the content element;
 * otherwise a placeholder is rendered that reads "Running..." while
 * `isRunning` is truthy and is empty when it is not.
 *
 * @param {object} props
 * @param {{ text?: string }} [props.output] - response payload; only `text` is read here.
 * @param {boolean} props.isRunning - whether the test case is currently running.
 * @returns the vnode produced by `h`.
 */
export const ResponseOutput = ({ output, isRunning, }) => {
    const responseText = output?.text;
    let inner;
    if (responseText) {
        inner = h("div", { class: "response-output__content" }, responseText);
    }
    else {
        const placeholder = isRunning ? 'Running...' : '';
        inner = h("div", { class: "response-output__placeholder" }, placeholder);
    }
    return h("div", { class: "response-output" }, inner);
};
|
|
5
5
|
//# sourceMappingURL=response-output.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"response-output.js","sourceRoot":"","sources":["../../../../../src/components/llm-test-runner/test-cases/output/response-output.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;
|
|
1
|
+
{"version":3,"file":"response-output.js","sourceRoot":"","sources":["../../../../../src/components/llm-test-runner/test-cases/output/response-output.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAQvD,MAAM,CAAC,MAAM,cAAc,GAA6C,CAAC,EACvE,MAAM,EACN,SAAS,GACV,EAAE,EAAE;IACH,OAAO,CACL,WAAK,KAAK,EAAC,iBAAiB,IACzB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,CACd,WAAK,KAAK,EAAC,0BAA0B,IAAE,MAAM,CAAC,IAAI,CAAO,CAC1D,CAAC,CAAC,CAAC,CACF,WAAK,KAAK,EAAC,8BAA8B,IAAE,SAAS,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,CAAO,CAChF,CACG,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport type { ModelResponsePayload } from '../../../../types/llm-test-runner';\n\nexport interface ResponseOutputProps {\n output?: ModelResponsePayload;\n isRunning: boolean;\n}\n\nexport const ResponseOutput: FunctionalComponent<ResponseOutputProps> = ({\n output,\n isRunning,\n}) => {\n return (\n <div class=\"response-output\">\n {output?.text ? (\n <div class=\"response-output__content\">{output.text}</div>\n ) : (\n <div class=\"response-output__placeholder\">{isRunning ? 'Running...' : ''}</div>\n )}\n </div>\n );\n};\n\n"]}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
 * Demo configurations keyed by mode name.
 *
 * Each mode provides `initialTestCases` and, optionally,
 * `defaultExpectedOutcomeSchema` and/or an async `resolveExpectedOutcome`
 * callback.
 */
export const DEMO_MODES = {
  // Two single-field test cases: one exact match, one semantic match.
  simpleTest: {
    initialTestCases: [
      {
        id: 'demo-simpleTest-1',
        question:
          'What is the capital of France? Reply with just the city name, nothing else.',
        expectedOutcome: [
          {
            type: 'textarea',
            label: 'Expected Outcome',
            value: 'paris',
            evaluationParameters: { approach: 'exact' },
          },
        ],
      },
      {
        id: 'demo-simpleTest-2',
        question: 'How many legs does a typical dog have? One short phrase is fine.',
        expectedOutcome: [
          {
            type: 'textarea',
            label: 'Expected Outcome',
            value: '4,four',
            evaluationParameters: {
              approach: 'semantic',
              threshold: 0.48,
            },
          },
        ],
      },
    ],
  },
  // One test case with four expected-outcome fields of different types,
  // plus a default schema with the same four field definitions.
  multipleExpectedOutcomes: {
    defaultExpectedOutcomeSchema: [
      {
        type: 'text',
        label: 'Must include',
        placeholder: 'Word that should appear in the answer',
        evaluationParameters: { approach: 'exact' },
      },
      {
        type: 'textarea',
        label: 'Meaning check',
        placeholder: 'Comma-separated ideas to match loosely',
        rows: 3,
        evaluationParameters: { approach: 'semantic', threshold: 0.5 },
      },
      {
        type: 'chips-input',
        label: 'Keywords',
        placeholder: 'Add chip',
        evaluationParameters: { approach: 'exact' },
      },
      {
        type: 'select',
        label: 'Says yes or no',
        placeholder: 'Pick one',
        options: ['Yes', 'No'],
        evaluationParameters: { approach: 'exact' },
      },
    ],
    initialTestCases: [
      {
        id: 'demo-multipleExpectedOutcomes-1',
        question:
          'Is water a liquid? Answer yes or no in one short sentence, and use the word water.',
        expectedOutcome: [
          {
            type: 'text',
            label: 'Must include',
            value: 'water',
            evaluationParameters: { approach: 'exact' },
          },
          {
            type: 'textarea',
            label: 'Meaning check',
            value: 'yes,liquid',
            evaluationParameters: {
              approach: 'semantic',
              threshold: 0.5,
            },
          },
          {
            type: 'chips-input',
            label: 'Keywords',
            value: ['yes', 'water'],
            evaluationParameters: { approach: 'exact' },
          },
          {
            type: 'select',
            label: 'Says yes or no',
            options: ['Yes', 'No'],
            value: 'Yes',
            evaluationParameters: { approach: 'exact' },
          },
        ],
      },
    ],
  },
  // One test case whose expected value starts empty and carries a
  // `resolutionQuery` that `resolveExpectedOutcome` below understands.
  dynamicExpectedOutcome: {
    initialTestCases: [
      {
        id: 'demo-dynamicExpectedOutcome-1',
        question:
          'In one sentence, when can customers reach support?',
        expectedOutcome: [
          {
            type: 'textarea',
            label: 'Gold answer (filled when you run)',
            outcomeMode: 'dynamic',
            resolutionQuery: 'demo.faq.hours',
            value: '',
            evaluationParameters: {
              approach: 'semantic',
              threshold: 0.62,
            },
          },
        ],
      },
    ],
    /**
     * Maps a resolution key to its demo answer.
     *
     * @param {string} resolutionQuery - key to resolve; surrounding whitespace is ignored.
     * @returns {Promise<string>} the canned answer for 'demo.faq.hours',
     *   otherwise an "Unknown resolution key: ..." message.
     */
    resolveExpectedOutcome: async (resolutionQuery) => {
      // Coerce first so a missing or non-string query yields the
      // "unknown key" message instead of a TypeError from `.trim()`.
      const key = String(resolutionQuery ?? '').trim();
      if (key === 'demo.faq.hours') {
        return 'Support is available 9am–5pm Eastern Time, Monday through Friday.';
      }
      return `Unknown resolution key: ${key}`;
    },
  },
};
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { DEMO_MODES } from './demo-modes.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Answers the runner's 'llmRequest' events using a `window.GeminiAdapter`
 * instance constructed with `window.env.API_KEY`.
 *
 * For each event the adapter is invoked with `event.detail.prompt`; the
 * result is passed to `event.detail.resolve` as `{ text }`, and any failure
 * is passed to `event.detail.reject` as an Error.
 *
 * @param {EventTarget} runner - element that emits 'llmRequest' events.
 */
function wireGemini(runner) {
  const adapter = new window.GeminiAdapter(window.env.API_KEY);

  const onLlmRequest = async (event) => {
    try {
      const text = await adapter.invoke(event.detail.prompt);
      event.detail.resolve({ text });
    } catch (err) {
      // Always reject with an Error instance, wrapping other thrown values.
      let failure = err;
      if (!(failure instanceof Error)) {
        failure = new Error(String(err));
      }
      event.detail.reject(failure);
    }
  };

  runner.addEventListener('llmRequest', onLlmRequest);
}
|
|
16
|
+
|
|
17
|
+
/**
 * Replaces the contents of `host` with a fresh <llm-test-runner> element
 * configured from `DEMO_MODES[modeKey]`, then wires it to the LLM adapter.
 *
 * @param {Element} host - container whose children are replaced.
 * @param {string} modeKey - own property name of DEMO_MODES.
 * @throws {Error} if `modeKey` is not an own key of DEMO_MODES; the host is
 *   left untouched in that case.
 */
function mountMode(host, modeKey) {
  // Validate before clearing the host so an unknown key (or an inherited
  // one such as "toString") cannot leave the page with an empty container.
  if (!Object.prototype.hasOwnProperty.call(DEMO_MODES, modeKey)) {
    throw new Error(`Unknown demo mode: ${String(modeKey)}`);
  }
  const config = DEMO_MODES[modeKey];

  host.replaceChildren();
  const runner = document.createElement('llm-test-runner');
  runner.delayMs = 1000;
  runner.initialTestCases = config.initialTestCases;
  // Optional settings are only assigned when the mode defines them.
  if (config.defaultExpectedOutcomeSchema) {
    runner.defaultExpectedOutcomeSchema = config.defaultExpectedOutcomeSchema;
  }
  if (config.resolveExpectedOutcome) {
    runner.resolveExpectedOutcome = config.resolveExpectedOutcome;
  }
  host.appendChild(runner);
  wireGemini(runner);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Wires the demo page: looks up the runner host and the mode <select>,
 * re-mounts the runner whenever the selection changes, and mounts the
 * 'simpleTest' mode once the 'llm-test-runner' custom element is defined.
 */
function initVanillaDemo() {
  const host = document.getElementById('runner-host');
  const modeSelect = document.getElementById('demo-example-mode');

  const showMode = (modeKey) => {
    mountMode(host, modeKey);
  };

  const onModeChange = () => {
    showMode(modeSelect.value);
  };
  modeSelect.addEventListener('change', onModeChange);

  // Initial mount waits for the element definition; the promise is
  // deliberately not awaited by the caller.
  const mountDefaultMode = async () => {
    await customElements.whenDefined('llm-test-runner');
    modeSelect.value = 'simpleTest';
    showMode('simpleTest');
  };
  void mountDefaultMode();
}
|
|
51
|
+
|
|
52
|
+
// Start the demo right away when the document is past the 'loading' state;
// otherwise defer until DOMContentLoaded fires.
if (document.readyState !== 'loading') {
  initVanillaDemo();
} else {
  document.addEventListener('DOMContentLoaded', initVanillaDemo);
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
 * Default evaluation source used when a field does not declare one.
 *
 * @returns {{ type: 'text' }}
 */
function toTextSource() {
    return { type: 'text' };
}

/**
 * Resolves the "actual" value that a field should be evaluated against.
 *
 * With a text source (the default) the trimmed `output.text` is used. With
 * any other source the extractor registered under `source.extractorId` is
 * called with the output (or `{}` when there is none) and its trimmed string
 * result is used.
 *
 * This function never throws for extractor problems; failures are reported
 * in the returned object.
 *
 * @param {object} field - expected-outcome field; only `evaluationSource` is read.
 * @param {{ text?: string }} [output] - model response payload.
 * @param {Object<string, Function>} [extractors] - extractor functions keyed by id.
 * @returns {Promise<{ success: true, value: string } | { success: false, error: string }>}
 */
export async function resolveActualValue(field, output, extractors) {
    const source = field.evaluationSource || toTextSource();
    if (source.type === 'text') {
        // A non-string `text` is treated as empty instead of letting
        // `.trim()` throw and reject the returned promise.
        const rawText = output?.text;
        const text = typeof rawText === 'string' ? rawText.trim() : '';
        if (!text) {
            return {
                success: false,
                error: 'Model response text is empty.',
            };
        }
        return { success: true, value: text };
    }

    const { extractorId } = source;
    // Own-property check: a plain lookup would also find inherited members
    // such as Object.prototype.toString and treat them as extractors.
    const isRegistered = extractors != null &&
        Object.prototype.hasOwnProperty.call(extractors, extractorId);
    const extractor = isRegistered ? extractors[extractorId] : undefined;
    if (typeof extractor !== 'function') {
        return {
            success: false,
            error: `Extractor "${extractorId}" is not registered.`,
        };
    }

    try {
        const extractedRaw = await extractor(output || {});
        if (typeof extractedRaw !== 'string') {
            return {
                success: false,
                error: `Extractor "${extractorId}" must return a string.`,
            };
        }
        const extracted = extractedRaw.trim();
        if (!extracted) {
            return {
                success: false,
                error: `Extractor "${extractorId}" returned an empty value.`,
            };
        }
        return {
            success: true,
            value: extracted,
        };
    }
    catch (error) {
        // Extractor failures become a failure result; Error messages are
        // passed through, anything else gets a generic message.
        return {
            success: false,
            error: error instanceof Error
                ? error.message
                : `Extractor "${extractorId}" failed.`,
        };
    }
}
|
|
52
|
+
//# sourceMappingURL=actual-value-resolver.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"actual-value-resolver.js","sourceRoot":"","sources":["../../../src/lib/evaluation/actual-value-resolver.ts"],"names":[],"mappings":"AAUA,SAAS,YAAY;IACnB,OAAO,EAAE,IAAI,EAAE,MAAM,EAAW,CAAC;AACnC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAA2B,EAC3B,MAA6B,EAC7B,UAAuC;IAEvC,MAAM,MAAM,GAAG,KAAK,CAAC,gBAAgB,IAAI,YAAY,EAAE,CAAC;IAExD,IAAI,MAAM,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,+BAA+B;aACvC,CAAC;QACJ,CAAC;QACD,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC;IACxC,CAAC;IAED,MAAM,SAAS,GAAG,UAAU,EAAE,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;IACnD,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO;YACL,OAAO,EAAE,KAAK;YACd,KAAK,EAAE,cAAc,MAAM,CAAC,WAAW,sBAAsB;SAC9D,CAAC;IACJ,CAAC;IAED,IAAI,CAAC;QACH,MAAM,YAAY,GAAG,MAAM,SAAS,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC;QACnD,IAAI,OAAO,YAAY,KAAK,QAAQ,EAAE,CAAC;YACrC,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,cAAc,MAAM,CAAC,WAAW,yBAAyB;aACjE,CAAC;QACJ,CAAC;QAED,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,CAAC,SAAS,EAAE,CAAC;YACf,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,cAAc,MAAM,CAAC,WAAW,4BAA4B;aACpE,CAAC;QACJ,CAAC;QAED,OAAO;YACL,OAAO,EAAE,IAAI;YACb,KAAK,EAAE,SAAS;SACjB,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO;YACL,OAAO,EAAE,KAAK;YACd,KAAK,EACH,KAAK,YAAY,KAAK;gBACpB,CAAC,CAAC,KAAK,CAAC,OAAO;gBACf,CAAC,CAAC,cAAc,MAAM,CAAC,WAAW,WAAW;SAClD,CAAC;IACJ,CAAC;AACH,CAAC","sourcesContent":["import type {\n EvaluationSourceExtractors,\n ExpectedOutcomeField,\n ModelResponsePayload,\n} from '../../types/llm-test-runner';\n\nexport type ResolvedActualValue =\n | { success: true; value: string }\n | { success: false; error: string };\n\nfunction toTextSource() {\n return { type: 'text' } as const;\n}\n\nexport async function resolveActualValue(\n field: ExpectedOutcomeField,\n output?: ModelResponsePayload,\n extractors?: EvaluationSourceExtractors,\n): Promise<ResolvedActualValue> {\n const source = 
field.evaluationSource || toTextSource();\n\n if (source.type === 'text') {\n const text = output?.text?.trim();\n if (!text) {\n return {\n success: false,\n error: 'Model response text is empty.',\n };\n }\n return { success: true, value: text };\n }\n\n const extractor = extractors?.[source.extractorId];\n if (!extractor) {\n return {\n success: false,\n error: `Extractor \"${source.extractorId}\" is not registered.`,\n };\n }\n\n try {\n const extractedRaw = await extractor(output || {});\n if (typeof extractedRaw !== 'string') {\n return {\n success: false,\n error: `Extractor \"${source.extractorId}\" must return a string.`,\n };\n }\n\n const extracted = extractedRaw.trim();\n if (!extracted) {\n return {\n success: false,\n error: `Extractor \"${source.extractorId}\" returned an empty value.`,\n };\n }\n\n return {\n success: true,\n value: extracted,\n };\n } catch (error) {\n return {\n success: false,\n error:\n error instanceof Error\n ? error.message\n : `Extractor \"${source.extractorId}\" failed.`,\n };\n }\n}\n"]}
|
|
@@ -10,7 +10,7 @@ export class LLMEvaluationEngine {
|
|
|
10
10
|
const fieldRequest = {
|
|
11
11
|
testCaseId: request.testCaseId,
|
|
12
12
|
question: request.question,
|
|
13
|
-
actualResponse:
|
|
13
|
+
actualResponse: field.actualResponse,
|
|
14
14
|
expectedOutcome: field.expectedValue,
|
|
15
15
|
evaluationParameters: field.evaluationParameters,
|
|
16
16
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluation-engine.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-engine.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,yBAAyB,EAAE,MAAM,6BAA6B,CAAC;AACxE,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AAEzE,MAAM,OAAO,mBAAmB;IAC9B,KAAK,CAAC,gBAAgB,CACpB,OAA4B,EAC5B,QAA4B;QAE5B,MAAM,cAAc,GAAG,MAAM,OAAO,CAAC,UAAU,CAC7C,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,EAAC,KAAK,EAAC,EAAE;YAC/B,MAAM,YAAY,GAAsB;gBACtC,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,QAAQ,EAAE,OAAO,CAAC,QAAQ;gBAC1B,cAAc,EAAE,
|
|
1
|
+
{"version":3,"file":"evaluation-engine.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-engine.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,yBAAyB,EAAE,MAAM,6BAA6B,CAAC;AACxE,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AAEzE,MAAM,OAAO,mBAAmB;IAC9B,KAAK,CAAC,gBAAgB,CACpB,OAA4B,EAC5B,QAA4B;QAE5B,MAAM,cAAc,GAAG,MAAM,OAAO,CAAC,UAAU,CAC7C,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,EAAC,KAAK,EAAC,EAAE;YAC/B,MAAM,YAAY,GAAsB;gBACtC,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,QAAQ,EAAE,OAAO,CAAC,QAAQ;gBAC1B,cAAc,EAAE,KAAK,CAAC,cAAc;gBACpC,eAAe,EAAE,KAAK,CAAC,aAAa;gBACpC,oBAAoB,EAAE,KAAK,CAAC,oBAAoB;aACjD,CAAC;YACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,YAAY,CAAC,CAAC;YAEtD,MAAM,WAAW,GAA0B;gBACzC,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,aAAa,EAAE,KAAK,CAAC,aAAa;gBAClC,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,cAAc,EAAE,MAAM,CAAC,cAAc;gBACrC,oBAAoB,EAAE,MAAM,CAAC,oBAAqB;gBAClD,wBAAwB,EAAE,MAAM,CAAC,wBAAwB;aAC1D,CAAC;YACF,OAAO,WAAW,CAAC;QACrB,CAAC,CAAC,CACH,CAAC;QAEF,MAAM,YAAY,GAA4B,cAAc,CAAC,GAAG,CAC9D,CAAC,aAAa,EAAE,KAAK,EAAE,EAAE;YACvB,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACpC,IAAI,aAAa,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;gBACzC,OAAO,aAAa,CAAC,KAAK,CAAC;YAC7B,CAAC;YAED,OAAO;gBACL,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,aAAa,EAAE,KAAK,CAAC,aAAa;gBAClC,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,oBAAoB,EAAE,KAAK,CAAC,oBAAoB;gBAChD,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,KAAK,CAAC,oBAAoB,CAAC,QAAQ;iBAClD;gBACD,KAAK,EAAE,IAAI,CAAC,mBAAmB,CAAC,aAAa,CAAC,MAAM,CAAC;aACtD,CAAC;QACJ,CAAC,CACF,CAAC;QAEF,MAAM,cAAc,GAAG,YAAY,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC3E,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAEzE,Q
AAQ,CAAC;YACP,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,MAAM;YACN,cAAc;YACd,YAAY;YACZ,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAC,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,aAAa,CAAC,OAA0B;QACpD,MAAM,QAAQ,GAAuB,OAAO,CAAC,oBAAoB,CAAC,QAAQ,CAAC;QAC3E,QAAQ,QAAQ,EAAE,CAAC;YACjB,KAAK,kBAAkB,CAAC,IAAI;gBAC1B,OAAO,qBAAqB,CAAC,OAAO,CAAC,CAAC;YACxC,KAAK,kBAAkB,CAAC,KAAK;gBAC3B,OAAO,iBAAiB,CAAC,OAAO,CAAC,CAAC;YACpC,KAAK,kBAAkB,CAAC,OAAO;gBAC7B,OAAO,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAC1C,KAAK,kBAAkB,CAAC,OAAO;gBAC7B,OAAO,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAC1C,KAAK,kBAAkB,CAAC,QAAQ;gBAC9B,OAAO,yBAAyB,CAAC,OAAO,CAAC,CAAC;YAC5C;gBACE,OAAO,CAAC,IAAI,CACV,8BAA8B,OAAO,CAAC,oBAAoB,CAAC,QAAQ,kCAAkC,CACtG,CAAC;gBACF,OAAO,iBAAiB,CAAC,OAAO,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAEO,mBAAmB,CAAC,KAAc;QACxC,OAAO,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,0BAA0B,CAAC;IAC7E,CAAC;CACF","sourcesContent":["import {\n EvaluationRequest,\n EvaluationResult,\n EvaluationCallback,\n FieldEvaluationResult,\n EvaluationRequestV2,\n} from './types';\nimport { performEvaluation } from './evaluators/exact/exact';\nimport { EvaluationApproach } from './constants';\nimport { performRouge1Evaluation } from './evaluators/rouge1-evaluator';\nimport { performSemanticEvaluation } from './evaluators/semantic/index';\nimport { performRougeLEvaluation } from './evaluators/rougeL-evaluator';\nimport { performBleuEvaluation } from './evaluators/bleu/bleu-evaluator';\n\nexport class LLMEvaluationEngine {\n async evaluateResponse(\n request: EvaluationRequestV2,\n callback: EvaluationCallback,\n ): Promise<void> {\n const settledResults = await Promise.allSettled(\n request.fields.map(async field => {\n const fieldRequest: EvaluationRequest = {\n testCaseId: request.testCaseId,\n question: request.question,\n actualResponse: field.actualResponse,\n expectedOutcome: field.expectedValue,\n evaluationParameters: field.evaluationParameters,\n };\n const result = await this.evaluateField(fieldRequest);\n\n const fieldResult: 
FieldEvaluationResult = {\n index: field.index,\n label: field.label,\n type: field.type,\n expectedValue: field.expectedValue,\n passed: result.passed,\n keywordMatches: result.keywordMatches,\n evaluationParameters: result.evaluationParameters!,\n evaluationApproachResult: result.evaluationApproachResult,\n };\n return fieldResult;\n }),\n );\n\n const fieldResults: FieldEvaluationResult[] = settledResults.map(\n (settledResult, index) => {\n const field = request.fields[index];\n if (settledResult.status === 'fulfilled') {\n return settledResult.value;\n }\n\n return {\n index: field.index,\n label: field.label,\n type: field.type,\n expectedValue: field.expectedValue,\n passed: false,\n keywordMatches: [],\n evaluationParameters: field.evaluationParameters,\n evaluationApproachResult: {\n score: 0,\n approachUsed: field.evaluationParameters.approach,\n },\n error: this.getSafeErrorMessage(settledResult.reason),\n };\n },\n );\n\n const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);\n const passed = fieldResults.every(field => field.passed && !field.error);\n\n callback({\n testCaseId: request.testCaseId,\n passed,\n keywordMatches,\n fieldResults,\n timestamp: new Date().toISOString(),\n });\n }\n\n private async evaluateField(request: EvaluationRequest): Promise<EvaluationResult> {\n const approach: EvaluationApproach = request.evaluationParameters.approach;\n switch (approach) {\n case EvaluationApproach.BLEU:\n return performBleuEvaluation(request);\n case EvaluationApproach.EXACT:\n return performEvaluation(request);\n case EvaluationApproach.ROUGE_1:\n return performRouge1Evaluation(request);\n case EvaluationApproach.ROUGE_L:\n return performRougeLEvaluation(request);\n case EvaluationApproach.SEMANTIC:\n return performSemanticEvaluation(request);\n default:\n console.warn(\n `Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`,\n );\n return performEvaluation(request);\n }\n }\n\n 
private getSafeErrorMessage(error: unknown): string {\n return error instanceof Error ? error.message : 'Field evaluation failed.';\n }\n}\n"]}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { LLMEvaluationEngine } from "./evaluation-engine";
|
|
2
2
|
import { normalizeEvaluationParametersForField } from "./field-evaluation-approach";
|
|
3
|
+
import { resolveActualValue } from "./actual-value-resolver";
|
|
3
4
|
/**
|
|
4
5
|
* Service for evaluating test case responses
|
|
5
6
|
*/
|
|
@@ -13,34 +14,71 @@ export class EvaluationService {
|
|
|
13
14
|
* @param testCase - The test case to evaluate
|
|
14
15
|
* @param onResult - Callback to handle the evaluation result
|
|
15
16
|
*/
|
|
16
|
-
async evaluateTestCase(testCase, onResult) {
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
}
|
|
21
|
-
const fields = (testCase.expectedOutcome || []).flatMap((field, index) => {
|
|
17
|
+
async evaluateTestCase(testCase, onResult, extractors) {
|
|
18
|
+
const fields = [];
|
|
19
|
+
const failedFields = [];
|
|
20
|
+
for (const [index, field] of (testCase.expectedOutcome || []).entries()) {
|
|
22
21
|
if (field.type === 'textarea' && field.outcomeMode === 'dynamic') {
|
|
23
|
-
|
|
22
|
+
continue;
|
|
24
23
|
}
|
|
25
|
-
|
|
26
|
-
|
|
24
|
+
const evaluationParameters = normalizeEvaluationParametersForField(field.type, field.evaluationParameters);
|
|
25
|
+
const expectedValue = getFieldExpectedValue(field);
|
|
26
|
+
const resolvedActualValue = await resolveActualValue(field, testCase.output, extractors);
|
|
27
|
+
if (resolvedActualValue.success) {
|
|
28
|
+
fields.push({
|
|
27
29
|
index,
|
|
28
30
|
label: field.label,
|
|
29
31
|
type: field.type,
|
|
30
|
-
expectedValue
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
expectedValue,
|
|
33
|
+
actualResponse: resolvedActualValue.value,
|
|
34
|
+
evaluationParameters,
|
|
35
|
+
});
|
|
36
|
+
}
|
|
37
|
+
else {
|
|
38
|
+
failedFields.push({
|
|
39
|
+
index,
|
|
40
|
+
label: field.label,
|
|
41
|
+
type: field.type,
|
|
42
|
+
expectedValue,
|
|
43
|
+
passed: false,
|
|
44
|
+
keywordMatches: [],
|
|
45
|
+
evaluationParameters,
|
|
46
|
+
evaluationApproachResult: {
|
|
47
|
+
score: 0,
|
|
48
|
+
approachUsed: evaluationParameters.approach,
|
|
49
|
+
},
|
|
50
|
+
error: 'error' in resolvedActualValue
|
|
51
|
+
? resolvedActualValue.error
|
|
52
|
+
: 'Failed to resolve actual value.',
|
|
53
|
+
});
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
if (fields.length === 0) {
|
|
57
|
+
if (failedFields.length === 0) {
|
|
58
|
+
console.warn('⚠️ No evaluable fields for test case:', testCase.id);
|
|
59
|
+
return;
|
|
60
|
+
}
|
|
61
|
+
onResult({
|
|
62
|
+
testCaseId: testCase.id,
|
|
63
|
+
passed: false,
|
|
64
|
+
keywordMatches: [],
|
|
65
|
+
fieldResults: failedFields,
|
|
66
|
+
timestamp: new Date().toISOString(),
|
|
67
|
+
});
|
|
68
|
+
return;
|
|
69
|
+
}
|
|
35
70
|
const evaluationRequest = {
|
|
36
71
|
testCaseId: testCase.id,
|
|
37
72
|
question: testCase.question,
|
|
38
|
-
actualResponse: testCase.output,
|
|
39
73
|
fields,
|
|
40
74
|
};
|
|
41
75
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
42
|
-
|
|
43
|
-
onResult(
|
|
76
|
+
const combinedResults = [...(result.fieldResults || []), ...failedFields].sort((a, b) => a.index - b.index);
|
|
77
|
+
onResult({
|
|
78
|
+
...result,
|
|
79
|
+
passed: combinedResults.every(field => field.passed && !field.error),
|
|
80
|
+
fieldResults: combinedResults,
|
|
81
|
+
});
|
|
44
82
|
});
|
|
45
83
|
}
|
|
46
84
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluation-service.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;
|
|
1
|
+
{"version":3,"file":"evaluation-service.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAY1D,OAAO,EAAE,qCAAqC,EAAE,MAAM,6BAA6B,CAAC;AACpF,OAAO,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAE7D;;GAEG;AACH,MAAM,OAAO,iBAAiB;IACpB,MAAM,CAAsB;IAEpC;QACE,IAAI,CAAC,MAAM,GAAG,IAAI,mBAAmB,EAAE,CAAC;IAC1C,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,gBAAgB,CACpB,QAAkB,EAClB,QAA4C,EAC5C,UAAuC;QAEvC,MAAM,MAAM,GAA2B,EAAE,CAAC;QAC1C,MAAM,YAAY,GAA4B,EAAE,CAAC;QAEjD,KAAK,MAAM,CAAC,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC;YACxE,IAAI,KAAK,CAAC,IAAI,KAAK,UAAU,IAAI,KAAK,CAAC,WAAW,KAAK,SAAS,EAAE,CAAC;gBACjE,SAAS;YACX,CAAC;YAED,MAAM,oBAAoB,GAAG,qCAAqC,CAChE,KAAK,CAAC,IAAI,EACV,KAAK,CAAC,oBAAoB,CAC3B,CAAC;YACF,MAAM,aAAa,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,mBAAmB,GAAG,MAAM,kBAAkB,CAClD,KAAK,EACL,QAAQ,CAAC,MAAM,EACf,UAAU,CACX,CAAC;YAEF,IAAI,mBAAmB,CAAC,OAAO,EAAE,CAAC;gBAChC,MAAM,CAAC,IAAI,CAAC;oBACV,KAAK;oBACL,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,IAAI,EAAE,KAAK,CAAC,IAAI;oBAChB,aAAa;oBACb,cAAc,EAAE,mBAAmB,CAAC,KAAK;oBACzC,oBAAoB;iBACrB,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,YAAY,CAAC,IAAI,CAAC;oBAChB,KAAK;oBACL,KAAK,EAAE,KAAK,CAAC,KAAK;oBAClB,IAAI,EAAE,KAAK,CAAC,IAAI;oBAChB,aAAa;oBACb,MAAM,EAAE,KAAK;oBACb,cAAc,EAAE,EAAE;oBAClB,oBAAoB;oBACpB,wBAAwB,EAAE;wBACxB,KAAK,EAAE,CAAC;wBACR,YAAY,EAAE,oBAAoB,CAAC,QAAQ;qBAC5C;oBACD,KAAK,EACH,OAAO,IAAI,mBAAmB;wBAC5B,CAAC,CAAC,mBAAmB,CAAC,KAAK;wBAC3B,CAAC,CAAC,iCAAiC;iBACxC,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACxB,IAAI,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC9B,OAAO,CAAC,IAAI,CAAC,uCAAuC,EAAE,QAAQ,CAAC,EAAE,CAAC,CAAC;gBACnE,OAAO;YACT,CAAC;YAED,QAAQ,CAAC;gBACP,UAAU,EAAE,QAAQ,CAAC,EAAE;gBACvB,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,YAAY,EAAE,YAAY;gBAC1B,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC,CAAC;YACH,OAAO;QACT,CAAC;QAED,MAAM,iBAAiB,GAAwB;YAC7C,UAAU,EAAE,QAAQ,CAAC,EAAE;YACvB,QAAQ,
EAAE,QAAQ,CAAC,QAAQ;YAC3B,MAAM;SACP,CAAC;QAEF,MAAM,IAAI,CAAC,MAAM,CAAC,gBAAgB,CAAC,iBAAiB,EAAE,CAAC,MAAwB,EAAE,EAAE;YACjF,MAAM,eAAe,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,YAAY,IAAI,EAAE,CAAC,EAAE,GAAG,YAAY,CAAC,CAAC,IAAI,CAC5E,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAC5B,CAAC;YACF,QAAQ,CAAC;gBACP,GAAG,MAAM;gBACT,MAAM,EAAE,eAAe,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC;gBACpE,YAAY,EAAE,eAAe;aAC9B,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AAED,SAAS,qBAAqB,CAAC,KAA2B;IACxD,IAAI,KAAK,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;QACjC,OAAO,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAChC,CAAC;IACD,OAAO,KAAK,CAAC,KAAK,CAAC;AACrB,CAAC","sourcesContent":["import { LLMEvaluationEngine } from './evaluation-engine';\nimport {\n EvaluationResult,\n FieldEvaluationInput,\n EvaluationRequestV2,\n FieldEvaluationResult,\n} from './types';\nimport {\n TestCase,\n ExpectedOutcomeField,\n EvaluationSourceExtractors,\n} from '../../types/llm-test-runner';\nimport { normalizeEvaluationParametersForField } from './field-evaluation-approach';\nimport { resolveActualValue } from './actual-value-resolver';\n\n/**\n * Service for evaluating test case responses\n */\nexport class EvaluationService {\n private engine: LLMEvaluationEngine;\n\n constructor() {\n this.engine = new LLMEvaluationEngine();\n }\n\n /**\n * Evaluates a test case response\n * @param testCase - The test case to evaluate\n * @param onResult - Callback to handle the evaluation result\n */\n async evaluateTestCase(\n testCase: TestCase,\n onResult: (result: EvaluationResult) => void,\n extractors?: EvaluationSourceExtractors,\n ): Promise<void> {\n const fields: FieldEvaluationInput[] = [];\n const failedFields: FieldEvaluationResult[] = [];\n\n for (const [index, field] of (testCase.expectedOutcome || []).entries()) {\n if (field.type === 'textarea' && field.outcomeMode === 'dynamic') {\n continue;\n }\n\n const evaluationParameters = 
normalizeEvaluationParametersForField(\n field.type,\n field.evaluationParameters,\n );\n const expectedValue = getFieldExpectedValue(field);\n const resolvedActualValue = await resolveActualValue(\n field,\n testCase.output,\n extractors,\n );\n\n if (resolvedActualValue.success) {\n fields.push({\n index,\n label: field.label,\n type: field.type,\n expectedValue,\n actualResponse: resolvedActualValue.value,\n evaluationParameters,\n });\n } else {\n failedFields.push({\n index,\n label: field.label,\n type: field.type,\n expectedValue,\n passed: false,\n keywordMatches: [],\n evaluationParameters,\n evaluationApproachResult: {\n score: 0,\n approachUsed: evaluationParameters.approach,\n },\n error:\n 'error' in resolvedActualValue\n ? resolvedActualValue.error\n : 'Failed to resolve actual value.',\n });\n }\n }\n\n if (fields.length === 0) {\n if (failedFields.length === 0) {\n console.warn('⚠️ No evaluable fields for test case:', testCase.id);\n return;\n }\n\n onResult({\n testCaseId: testCase.id,\n passed: false,\n keywordMatches: [],\n fieldResults: failedFields,\n timestamp: new Date().toISOString(),\n });\n return;\n }\n\n const evaluationRequest: EvaluationRequestV2 = {\n testCaseId: testCase.id,\n question: testCase.question,\n fields,\n };\n\n await this.engine.evaluateResponse(evaluationRequest, (result: EvaluationResult) => {\n const combinedResults = [...(result.fieldResults || []), ...failedFields].sort(\n (a, b) => a.index - b.index,\n );\n onResult({\n ...result,\n passed: combinedResults.every(field => field.passed && !field.error),\n fieldResults: combinedResults,\n });\n });\n }\n}\n\nfunction getFieldExpectedValue(field: ExpectedOutcomeField): string {\n if (field.type === 'chips-input') {\n return field.value.join(', ');\n }\n return field.value;\n}\n"]}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult,\n} from '../../types/evaluation';\nimport type { ExpectedOutcomeFieldType } from '../../types/llm-test-runner';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedOutcome: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface FieldEvaluationInput {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationRequestV2 {\n testCaseId: string;\n question: string;\n
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult,\n} from '../../types/evaluation';\nimport type { ExpectedOutcomeFieldType } from '../../types/llm-test-runner';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedOutcome: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface FieldEvaluationInput {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationRequestV2 {\n testCaseId: string;\n question: string;\n fields: FieldEvaluationInput[];\n}\n\nexport interface EvaluationResult {\n testCaseId: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n fieldResults?: FieldEvaluationResult[];\n timestamp?: string;\n evaluationParameters?: EvaluationParameters;\n evaluationApproachResult?: EvaluationApproachResult;\n}\n\nexport interface FieldEvaluationResult {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n evaluationParameters: EvaluationParameters;\n evaluationApproachResult: EvaluationApproachResult;\n error?: string;\n}\n\nexport interface KeywordMatch {\n keyword: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport type EvaluationCallback = (result: EvaluationResult) => void;\n\nexport interface RougeKeywordDetails {\n rouge1: number;\n rougeL: number;\n scoreUsed: string;\n approach: string;\n}\n\nexport interface Rouge1OverallDetails {\n keywordsPassed: number;\n totalKeywords: number;\n passRate: string;\n thresholdUsed: number;\n approach: string;\n}\n"]}
|
|
@@ -1,14 +1,20 @@
|
|
|
1
1
|
import { createTestCaseFromInput } from "../test-cases/test-case-factory";
|
|
2
2
|
import { validateTestCaseInputArray } from "../../schemas/test-case";
|
|
3
|
+
import { validateExpectedOutcomeArrayWithExtractors } from "../../schemas/expected-outcome";
|
|
3
4
|
/**
|
|
4
5
|
* Validates and imports test cases from JSON content
|
|
5
6
|
* @param jsonContent - The JSON string to parse and validate
|
|
6
7
|
* @returns Validation result with test cases or error message
|
|
7
8
|
*/
|
|
8
|
-
export function importTestSuite(jsonContent) {
|
|
9
|
+
export function importTestSuite(jsonContent, allowedExtractorIds = []) {
|
|
9
10
|
try {
|
|
10
11
|
const parsed = JSON.parse(jsonContent);
|
|
11
12
|
validateTestCaseInputArray(parsed);
|
|
13
|
+
if (allowedExtractorIds.length > 0) {
|
|
14
|
+
parsed.forEach((testCase) => {
|
|
15
|
+
validateExpectedOutcomeArrayWithExtractors(testCase.expectedOutcome, allowedExtractorIds);
|
|
16
|
+
});
|
|
17
|
+
}
|
|
12
18
|
const testCases = parsed.map((item, index) => {
|
|
13
19
|
try {
|
|
14
20
|
return createTestCaseFromInput(item);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"test-suite-importer.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-suite-importer.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,uBAAuB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,EAAE,0BAA0B,EAAE,MAAM,yBAAyB,CAAC;
|
|
1
|
+
{"version":3,"file":"test-suite-importer.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-suite-importer.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,uBAAuB,EAAE,MAAM,iCAAiC,CAAC;AAC1E,OAAO,EAAE,0BAA0B,EAAE,MAAM,yBAAyB,CAAC;AACrE,OAAO,EAAE,0CAA0C,EAAE,MAAM,gCAAgC,CAAC;AAQ5F;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAC7B,WAAmB,EACnB,sBAAgC,EAAE;IAElC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;QACvC,0BAA0B,CAAC,MAAM,CAAC,CAAC;QACnC,IAAI,mBAAmB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnC,MAAM,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,EAAE;gBAC1B,0CAA0C,CACxC,QAAQ,CAAC,eAAe,EACxB,mBAAmB,CACpB,CAAC;YACJ,CAAC,CAAC,CAAC;QACL,CAAC;QAED,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE;YAC3C,IAAI,CAAC;gBACH,OAAO,uBAAuB,CAAC,IAAI,CAAC,CAAC;YACvC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,MAAM,OAAO,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;gBACrE,MAAM,IAAI,KAAK,CAAC,8BAA8B,KAAK,KAAK,OAAO,EAAE,CAAC,CAAC;YACrE,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,OAAO,EAAE,IAAI;YACb,SAAS;SACV,CAAC;IACJ,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO;YACL,OAAO,EAAE,KAAK;YACd,KAAK,EACH,GAAG,YAAY,KAAK;gBAClB,CAAC,CAAC,GAAG,CAAC,OAAO;gBACb,CAAC,CAAC,gEAAgE;SACvE,CAAC;IACJ,CAAC;AACH,CAAC","sourcesContent":["import type { TestCase } from '../../types/llm-test-runner';\nimport { createTestCaseFromInput } from '../test-cases/test-case-factory';\nimport { validateTestCaseInputArray } from '../../schemas/test-case';\nimport { validateExpectedOutcomeArrayWithExtractors } from '../../schemas/expected-outcome';\n\nexport interface ImportValidationResult {\n success: boolean;\n testCases?: TestCase[];\n error?: string;\n}\n\n/**\n * Validates and imports test cases from JSON content\n * @param jsonContent - The JSON string to parse and validate\n * @returns Validation result with test cases or error message\n */\nexport function importTestSuite(\n jsonContent: string,\n allowedExtractorIds: string[] = [],\n): ImportValidationResult {\n try {\n const parsed = 
JSON.parse(jsonContent);\n validateTestCaseInputArray(parsed);\n if (allowedExtractorIds.length > 0) {\n parsed.forEach((testCase) => {\n validateExpectedOutcomeArrayWithExtractors(\n testCase.expectedOutcome,\n allowedExtractorIds,\n );\n });\n }\n\n const testCases = parsed.map((item, index) => {\n try {\n return createTestCaseFromInput(item);\n } catch (err) {\n const message = err instanceof Error ? err.message : 'Unknown error';\n throw new Error(`Invalid test case at index ${index}: ${message}`);\n }\n });\n\n return {\n success: true,\n testCases,\n };\n } catch (err) {\n return {\n success: false,\n error:\n err instanceof Error\n ? err.message\n : 'Error processing file. Please ensure it is a valid JSON array.',\n };\n }\n}\n\n"]}
|
|
@@ -11,6 +11,7 @@ export const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
|
11
11
|
function normalizeExpectedOutcomeField(field) {
|
|
12
12
|
return {
|
|
13
13
|
...field,
|
|
14
|
+
evaluationSource: field.evaluationSource || { type: 'text' },
|
|
14
15
|
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
15
16
|
};
|
|
16
17
|
}
|
|
@@ -34,6 +35,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
34
35
|
type: 'text',
|
|
35
36
|
label: schemaField.label,
|
|
36
37
|
placeholder: schemaField.placeholder,
|
|
38
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
37
39
|
value: '',
|
|
38
40
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
39
41
|
};
|
|
@@ -42,6 +44,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
42
44
|
type: 'textarea',
|
|
43
45
|
label: schemaField.label,
|
|
44
46
|
placeholder: schemaField.placeholder,
|
|
47
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
45
48
|
rows: schemaField.rows,
|
|
46
49
|
value: '',
|
|
47
50
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
@@ -51,6 +54,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
51
54
|
type: 'chips-input',
|
|
52
55
|
label: schemaField.label,
|
|
53
56
|
placeholder: schemaField.placeholder,
|
|
57
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
54
58
|
value: [],
|
|
55
59
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
56
60
|
};
|
|
@@ -59,6 +63,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
59
63
|
type: 'select',
|
|
60
64
|
label: schemaField.label,
|
|
61
65
|
placeholder: schemaField.placeholder,
|
|
66
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
62
67
|
value: schemaField.options[0],
|
|
63
68
|
options: schemaField.options,
|
|
64
69
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|