llm-testrunner-components 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +165 -242
- package/dist/cjs/index.cjs.js +298 -232
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +6 -49
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js +0 -4
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-results-csv.js +47 -33
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +20 -2
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/test-case.js +2 -20
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/collection/types/test-case.js.map +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-Bb89MYYu.js +7 -0
- package/dist/components/p-Bb89MYYu.js.map +1 -0
- package/dist/esm/index.js +298 -232
- package/dist/esm/index.js.map +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +0 -1
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
- package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
- package/dist/types/lib/evaluation/index.d.ts +0 -1
- package/dist/types/lib/evaluation/types.d.ts +26 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
- package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
- package/dist/types/schemas/expected-outcome.d.ts +65 -17
- package/dist/types/schemas/test-case.d.ts +51 -95
- package/dist/types/types/llm-test-runner.d.ts +1 -1
- package/dist/types/types/test-case.d.ts +1 -1
- package/package.json +9 -2
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
- package/dist/components/p-BF90yb1z.js +0 -7
- package/dist/components/p-BF90yb1z.js.map +0 -1
- /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
|
@@ -1,9 +1,29 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
2
|
import { FormFieldType } from "../../../lib/form/schema";
|
|
3
|
+
import { EvaluationApproach, } from "../../../lib/evaluation/constants";
|
|
4
|
+
import { getAllowedApproachesForFieldType } from "../../../lib/evaluation/field-evaluation-approach";
|
|
3
5
|
export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange, }) => {
|
|
4
6
|
const emit = (detail) => onExpectedOutcomeChange({
|
|
5
7
|
detail,
|
|
6
8
|
});
|
|
9
|
+
const buildEvaluationConfig = (index, optionList) => ({
|
|
10
|
+
name: `expectedOutcomeEvaluation-${index}`,
|
|
11
|
+
fieldType: FormFieldType.SELECT,
|
|
12
|
+
label: 'Evaluation Approach',
|
|
13
|
+
placeholder: 'Select evaluation approach…',
|
|
14
|
+
required: true,
|
|
15
|
+
optionList,
|
|
16
|
+
defaultValue: EvaluationApproach.EXACT,
|
|
17
|
+
});
|
|
18
|
+
const renderEvaluationSelector = (field, index) => {
|
|
19
|
+
const optionList = getAllowedApproachesForFieldType(field.type);
|
|
20
|
+
return (h("app-select", { config: buildEvaluationConfig(index, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
|
|
21
|
+
testCaseId,
|
|
22
|
+
index,
|
|
23
|
+
operation: 'set-evaluation-approach',
|
|
24
|
+
value: e.detail.value,
|
|
25
|
+
}) }));
|
|
26
|
+
};
|
|
7
27
|
return (h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index) => {
|
|
8
28
|
if (field.type === 'textarea') {
|
|
9
29
|
const config = {
|
|
@@ -11,15 +31,15 @@ export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeC
|
|
|
11
31
|
fieldType: FormFieldType.TEXT_AREA,
|
|
12
32
|
label: field.label,
|
|
13
33
|
placeholder: field.placeholder,
|
|
14
|
-
required:
|
|
34
|
+
required: true,
|
|
15
35
|
rows: field.rows || 2,
|
|
16
36
|
};
|
|
17
|
-
return (h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
37
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
18
38
|
testCaseId,
|
|
19
39
|
index,
|
|
20
40
|
operation: 'set-value',
|
|
21
41
|
value: e.detail.value,
|
|
22
|
-
}) }));
|
|
42
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
23
43
|
}
|
|
24
44
|
if (field.type === 'chips-input') {
|
|
25
45
|
const config = {
|
|
@@ -27,9 +47,9 @@ export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeC
|
|
|
27
47
|
fieldType: FormFieldType.CHIPS,
|
|
28
48
|
label: field.label,
|
|
29
49
|
placeholder: field.placeholder,
|
|
30
|
-
required:
|
|
50
|
+
required: true,
|
|
31
51
|
};
|
|
32
|
-
return (h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
52
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
33
53
|
testCaseId,
|
|
34
54
|
index,
|
|
35
55
|
operation: 'add-chip',
|
|
@@ -39,7 +59,7 @@ export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeC
|
|
|
39
59
|
index,
|
|
40
60
|
operation: 'remove-chip',
|
|
41
61
|
value: e.detail.value,
|
|
42
|
-
}) }));
|
|
62
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
43
63
|
}
|
|
44
64
|
if (field.type === 'select') {
|
|
45
65
|
const config = {
|
|
@@ -47,22 +67,22 @@ export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeC
|
|
|
47
67
|
fieldType: FormFieldType.SELECT,
|
|
48
68
|
label: field.label,
|
|
49
69
|
placeholder: field.placeholder,
|
|
50
|
-
required:
|
|
70
|
+
required: true,
|
|
51
71
|
optionList: field.options,
|
|
52
72
|
};
|
|
53
|
-
return (h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
73
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
54
74
|
testCaseId,
|
|
55
75
|
index,
|
|
56
76
|
operation: 'set-value',
|
|
57
77
|
value: e.detail.value,
|
|
58
|
-
}) }));
|
|
78
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
59
79
|
}
|
|
60
|
-
return (h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
80
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
61
81
|
testCaseId,
|
|
62
82
|
index,
|
|
63
83
|
operation: 'set-value',
|
|
64
84
|
value: e.target.value,
|
|
65
|
-
}) })));
|
|
85
|
+
}) })), renderEvaluationSelector(field, index)));
|
|
66
86
|
})));
|
|
67
87
|
};
|
|
68
88
|
//# sourceMappingURL=expected-outcome-renderer.js.map
|
package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"expected-outcome-renderer.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/expected-outcome-renderer.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAIvD,OAAO,EAAe,aAAa,EAAgC,MAAM,0BAA0B,CAAC;
|
|
1
|
+
{"version":3,"file":"expected-outcome-renderer.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/expected-outcome-renderer.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAIvD,OAAO,EAAe,aAAa,EAAgC,MAAM,0BAA0B,CAAC;AACpG,OAAO,EACL,kBAAkB,GACnB,MAAM,mCAAmC,CAAC;AAC3C,OAAO,EAAE,gCAAgC,EAAE,MAAM,mDAAmD,CAAC;AAerG,MAAM,CAAC,MAAM,uBAAuB,GAAsD,CAAC,EACzF,UAAU,EACV,MAAM,EACN,uBAAuB,GACxB,EAAE,EAAE;IACH,MAAM,IAAI,GAAG,CAAC,MAAmC,EAAE,EAAE,CACnD,uBAAuB,CAAC;QACtB,MAAM;KACqC,CAAC,CAAC;IAEjD,MAAM,qBAAqB,GAAG,CAC5B,KAAa,EACb,UAAoB,EACN,EAAE,CAAC,CAAC;QAClB,IAAI,EAAE,6BAA6B,KAAK,EAAE;QAC1C,SAAS,EAAE,aAAa,CAAC,MAAM;QAC/B,KAAK,EAAE,qBAAqB;QAC5B,WAAW,EAAE,6BAA6B;QAC1C,QAAQ,EAAE,IAAI;QACd,UAAU;QACV,YAAY,EAAE,kBAAkB,CAAC,KAAK;KACvC,CAAC,CAAC;IAEH,MAAM,wBAAwB,GAAG,CAC/B,KAA2B,EAC3B,KAAa,EACb,EAAE;QACF,MAAM,UAAU,GAAG,gCAAgC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEhE,OAAO,CACL,kBACE,MAAM,EAAE,qBAAqB,CAAC,KAAK,EAAE,UAAU,CAAC,EAChD,KAAK,EAAE,KAAK,CAAC,oBAAoB,EAAE,QAAQ,EAC3C,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,IAAI,CAAC;gBACH,UAAU;gBACV,KAAK;gBACL,SAAS,EAAE,yBAAyB;gBACpC,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAA2B;aAC5C,CAAC,GAEJ,CACH,CAAC;IACJ,CAAC,CAAC;IAEF,OAAO,CACL,WAAK,KAAK,EAAC,2BAA2B,IACnC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QACnC,IAAI,KAAK,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;YAC9B,MAAM,MAAM,GAAmB;gBAC7B,IAAI,EAAE,mBAAmB,KAAK,EAAE;gBAChC,SAAS,EAAE,aAAa,CAAC,SAAS;gBAClC,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,WAAW,EAAE,KAAK,CAAC,WAAW;gBAC9B,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC;aACtB,CAAC;YACF,OAAO,CACL,WAAK,KAAK,EAAC,kCAAkC;gBAC3C,oBACE,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,KAAK,CAAC,KAAK,EAClB,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,WAAW;wBACtB,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB,CAAC,GAEJ;gBACD,wBAAwB,CAAC,KAAK,EAAE,KAAK,CAAC,CACnC,CACP,CAAC;QACJ,CAAC;QAED,IAAI,KAAK,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;YACjC,MAAM,MAAM,GAAgB;gBAC1B,IAAI,EAAE,mBAAmB,KAAK,EAAE;gBAChC,SAAS,EAAE,aAAa,CAAC,KAAK;gBAC9B,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,WAAW,EAAE,KAAK,CAAC,WAAW;gBAC9B,QAAQ,EAAE,IAAI;aACf,CAAC;YAEF,OAAO,CACL,WAAK,KAAK,EAAC,kCAAkC;gBAC3C,iBACE,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,KAAK,CAAC,KAAK,EAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CACf,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,UAAU;wBACrB,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB,CAAC,EAEJ,YAAY,EAAE,CAAC,CAAC,EAAE,EAAE,CAClB,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,aAAa;wBACxB,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB,CAAC,GAEJ;gBACD,wBAAwB,CAAC,KAAK,EAAE,KAAK,CAAC,CACnC,CACP,CAAC;QACJ,CAAC;QAED,IAAI,KAAK,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC5B,MAAM,MAAM,GAAiB;gBAC3B,IAAI,EAAE,mBAAmB,KAAK,EAAE;gBAChC,SAAS,EAAE,aAAa,CAAC,MAAM;gBAC/B,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,WAAW,EAAE,KAAK,CAAC,WAAW;gBAC9B,QAAQ,EAAE,IAAI;gBACd,UAAU,EAAE,KAAK,CAAC,OAAO;aAC1B,CAAC;YAEF,OAAO,CACL,WAAK,KAAK,EAAC,kCAAkC;gBAC3C,kBACE,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,KAAK,CAAC,KAAK,EAClB,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,WAAW;wBACtB,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB,CAAC,GAEJ;gBACD,wBAAwB,CAAC,KAAK,EAAE,KAAK,CAAC,CACnC,CACP,CAAC;QACJ,CAAC;QAED,OAAO,CACL,WAAK,KAAK,EAAC,kCAAkC;YAC3C,WAAK,KAAK,EAAC,iCAAiC;gBAC1C,iBAAQ,KAAK,CAAC,KAAK,CAAS;gBAC5B,aACE,IAAI,EAAC,MAAM,EACX,KAAK,EAAE,KAAK,CAAC,KAAK,EAClB,WAAW,EAAE,KAAK,CAAC,WAAW,EAC9B,OAAO,EAAE,CAAC,CAAC,EAAE,EAAE,CACb,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,WAAW;wBACtB,KAAK,EAAG,CAAC,CAAC,MAA2B,CAAC,KAAK;qBAC5C,CAAC,GAEJ,CACE;YACL,wBAAwB,CAAC,KAAK,EAAE,KAAK,CAAC,CACnC,CACP,CAAC;IACJ,CAAC,CAAC,CACE,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport {\n ExpectedOutcomeField,\n} from '../../../types/llm-test-runner';\nimport { ChipsConfig, FormFieldType, SelectConfig, TextAreaConfig } from '../../../lib/form/schema';\nimport {\n EvaluationApproach,\n} from '../../../lib/evaluation/constants';\nimport { getAllowedApproachesForFieldType } from '../../../lib/evaluation/field-evaluation-approach';\nimport { ExpectedOutcomeChange } from '../../../lib/test-cases/test-case-mutations';\n\nexport type ExpectedOutcomeChangeDetail = {\n testCaseId: string;\n} & ExpectedOutcomeChange;\n\ninterface ExpectedOutcomeRendererProps {\n testCaseId: string;\n fields: ExpectedOutcomeField[];\n onExpectedOutcomeChange: (\n e: CustomEvent<ExpectedOutcomeChangeDetail>,\n ) => void;\n}\n\nexport const ExpectedOutcomeRenderer: FunctionalComponent<ExpectedOutcomeRendererProps> = ({\n testCaseId,\n fields,\n onExpectedOutcomeChange,\n}) => {\n const emit = (detail: ExpectedOutcomeChangeDetail) =>\n onExpectedOutcomeChange({\n detail,\n } as CustomEvent<ExpectedOutcomeChangeDetail>);\n\n const buildEvaluationConfig = (\n index: number,\n optionList: string[],\n ): SelectConfig => ({\n name: `expectedOutcomeEvaluation-${index}`,\n fieldType: FormFieldType.SELECT,\n label: 'Evaluation Approach',\n placeholder: 'Select evaluation approach…',\n required: true,\n optionList,\n defaultValue: EvaluationApproach.EXACT,\n });\n\n const renderEvaluationSelector = (\n field: ExpectedOutcomeField,\n index: number,\n ) => {\n const optionList = getAllowedApproachesForFieldType(field.type);\n\n return (\n <app-select\n config={buildEvaluationConfig(index, optionList)}\n value={field.evaluationParameters?.approach}\n onValueChange={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'set-evaluation-approach',\n value: e.detail.value as EvaluationApproach,\n })\n }\n />\n );\n };\n\n return (\n <div class=\"expected-outcome-renderer\">\n {(fields || []).map((field, index) => {\n if (field.type === 'textarea') {\n const config: TextAreaConfig = {\n name: `expectedOutcome-${index}`,\n fieldType: FormFieldType.TEXT_AREA,\n label: field.label,\n placeholder: field.placeholder,\n required: true,\n rows: field.rows || 2,\n };\n return (\n <div class=\"expected-outcome-renderer__group\">\n <app-textarea\n config={config}\n value={field.value}\n onValueChange={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'set-value',\n value: e.detail.value,\n })\n }\n />\n {renderEvaluationSelector(field, index)}\n </div>\n );\n }\n\n if (field.type === 'chips-input') {\n const config: ChipsConfig = {\n name: `expectedOutcome-${index}`,\n fieldType: FormFieldType.CHIPS,\n label: field.label,\n placeholder: field.placeholder,\n required: true,\n };\n\n return (\n <div class=\"expected-outcome-renderer__group\">\n <app-chips\n config={config}\n value={field.value}\n onAddChip={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'add-chip',\n value: e.detail.value,\n })\n }\n onRemoveChip={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'remove-chip',\n value: e.detail.value,\n })\n }\n />\n {renderEvaluationSelector(field, index)}\n </div>\n );\n }\n\n if (field.type === 'select') {\n const config: SelectConfig = {\n name: `expectedOutcome-${index}`,\n fieldType: FormFieldType.SELECT,\n label: field.label,\n placeholder: field.placeholder,\n required: true,\n optionList: field.options,\n };\n\n return (\n <div class=\"expected-outcome-renderer__group\">\n <app-select\n config={config}\n value={field.value}\n onValueChange={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'set-value',\n value: e.detail.value,\n })\n }\n />\n {renderEvaluationSelector(field, index)}\n </div>\n );\n }\n\n return (\n <div class=\"expected-outcome-renderer__group\">\n <div class=\"expected-outcome-renderer__text\">\n <label>{field.label}</label>\n <input\n type=\"text\"\n value={field.value}\n placeholder={field.placeholder}\n onInput={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'set-value',\n value: (e.target as HTMLInputElement).value,\n })\n }\n />\n </div>\n {renderEvaluationSelector(field, index)}\n </div>\n );\n })}\n </div>\n );\n};\n"]}
|
|
@@ -18,6 +18,23 @@
|
|
|
18
18
|
border-right: var(--border-width) solid var(--border);
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
+
.expected-outcome-renderer {
|
|
22
|
+
display: flex;
|
|
23
|
+
flex-direction: column;
|
|
24
|
+
gap: var(--spacing-4);
|
|
25
|
+
margin-top: var(--spacing-4);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
.expected-outcome-renderer__group {
|
|
29
|
+
display: flex;
|
|
30
|
+
flex-direction: column;
|
|
31
|
+
gap: var(--spacing-2);
|
|
32
|
+
padding: var(--spacing-3);
|
|
33
|
+
border: var(--border-width) solid var(--border);
|
|
34
|
+
border-radius: var(--radius-md);
|
|
35
|
+
background: var(--background);
|
|
36
|
+
}
|
|
37
|
+
|
|
21
38
|
/* Responsive Design */
|
|
22
39
|
@media (max-width: 1200px) {
|
|
23
40
|
.test-case-row {
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
|
-
import { EvaluationApproach, EvaluationApproachValues, } from "../../../lib/evaluation/constants";
|
|
3
2
|
import { ResponseOutput } from "./output/response-output";
|
|
4
3
|
import { EvaluationSummary } from "./evaluation/evaluation-summary";
|
|
5
4
|
import { RowActions } from "./actions/row-actions";
|
|
6
5
|
import { FormFieldType } from "../../../lib/form/schema";
|
|
7
6
|
import { ExpectedOutcomeRenderer, } from "./expected-outcome-renderer";
|
|
8
|
-
export const LLMTestCaseRow = ({ testCase, onRun, onDelete,
|
|
7
|
+
export const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
9
8
|
const questionConfig = {
|
|
10
9
|
name: 'question',
|
|
11
10
|
fieldType: FormFieldType.TEXT_AREA,
|
|
@@ -15,21 +14,12 @@ export const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, ha
|
|
|
15
14
|
required: true,
|
|
16
15
|
rows: 3,
|
|
17
16
|
};
|
|
18
|
-
const evaluationConfig = {
|
|
19
|
-
name: 'EvaluationApproach',
|
|
20
|
-
fieldType: FormFieldType.SELECT,
|
|
21
|
-
label: 'Evaluation',
|
|
22
|
-
placeholder: 'Select evaluation approach…',
|
|
23
|
-
required: true,
|
|
24
|
-
optionList: EvaluationApproachValues,
|
|
25
|
-
defaultValue: EvaluationApproach.EXACT,
|
|
26
|
-
};
|
|
27
17
|
return (h("div", { class: "test-case-row", key: testCase.id }, h("div", { class: "test-case-row__input-column" }, h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
|
|
28
18
|
detail: {
|
|
29
19
|
testCaseId: testCase.id,
|
|
30
20
|
key: 'question',
|
|
31
21
|
value: e.detail.value,
|
|
32
22
|
},
|
|
33
|
-
}) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })
|
|
23
|
+
}) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
34
24
|
};
|
|
35
25
|
//# sourceMappingURL=llm-test-case-row.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm-test-case-row.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-case-row.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,
|
|
1
|
+
{"version":3,"file":"llm-test-case-row.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-case-row.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,iCAAiC,CAAC;AACpE,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAkB,MAAM,0BAA0B,CAAC;AACzE,OAAO,EAEL,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AAcrC,MAAM,CAAC,MAAM,cAAc,GAA6C,CAAC,EACvE,QAAQ,EACR,KAAK,EACL,QAAQ,EACR,oBAAoB,EACpB,uBAAuB,GACxB,EAAE,EAAE;IACH,MAAM,cAAc,GAAmB;QACrC,IAAI,EAAE,UAAU;QAChB,SAAS,EAAE,aAAa,CAAC,SAAS;QAClC,IAAI,EAAE,MAAM;QACZ,KAAK,EAAE,UAAU;QACjB,WAAW,EAAE,6BAA6B;QAC1C,QAAQ,EAAE,IAAI;QACd,IAAI,EAAE,CAAC;KACR,CAAC;IACF,OAAO,CACL,WAAK,KAAK,EAAC,eAAe,EAAC,GAAG,EAAE,QAAQ,CAAC,EAAE;QACzC,WAAK,KAAK,EAAC,6BAA6B;YACtC,oBACE,MAAM,EAAE,cAAc,EACtB,KAAK,EAAE,QAAQ,CAAC,QAAQ,EACxB,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,oBAAoB,CAAC;oBACnB,MAAM,EAAE;wBACN,UAAU,EAAE,QAAQ,CAAC,EAAE;wBACvB,GAAG,EAAE,UAAU;wBACf,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB;iBACiE,CAAC,GAEvE;YACF,EAAC,uBAAuB,IACtB,UAAU,EAAE,QAAQ,CAAC,EAAE,EACvB,MAAM,EAAE,QAAQ,CAAC,eAAe,IAAI,EAAE,EACtC,uBAAuB,EAAE,uBAAuB,GAChD,CACE;QAEN,EAAC,cAAc,IAAC,MAAM,EAAE,QAAQ,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,CAAC,SAAS,GAAI;QAE1E,EAAC,iBAAiB,IAChB,MAAM,EAAE,QAAQ,CAAC,gBAAgB,EACjC,SAAS,EAAE,QAAQ,CAAC,SAAS,GAC7B;QAEF,EAAC,UAAU,IACT,SAAS,EAAE,QAAQ,CAAC,SAAS,EAC7B,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,EAClC,KAAK,EAAE,GAAG,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,EAC5B,QAAQ,EAAE,GAAG,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC,GACrC,CACE,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport { TestCase } from '../../../types/llm-test-runner';\nimport { ResponseOutput } from './output/response-output';\nimport { EvaluationSummary } from './evaluation/evaluation-summary';\nimport { RowActions } from './actions/row-actions';\nimport { FormFieldType, TextAreaConfig } from '../../../lib/form/schema';\nimport {\n ExpectedOutcomeChangeDetail,\n ExpectedOutcomeRenderer,\n} from './expected-outcome-renderer';\n\nexport interface LLMTestCaseRowProps {\n testCase: TestCase;\n onRun: (testCase: TestCase) => void;\n onDelete: (id: string) => void;\n handleTestCaseChange: (\n e: CustomEvent<{ testCaseId: string; key: string; value: string }>,\n ) => void;\n onExpectedOutcomeChange: (\n e: CustomEvent<ExpectedOutcomeChangeDetail>,\n ) => void;\n}\n\nexport const LLMTestCaseRow: FunctionalComponent<LLMTestCaseRowProps> = ({\n testCase,\n onRun,\n onDelete,\n handleTestCaseChange,\n onExpectedOutcomeChange,\n}) => {\n const questionConfig: TextAreaConfig = {\n name: 'question',\n fieldType: FormFieldType.TEXT_AREA,\n type: 'text',\n label: 'Question',\n placeholder: 'Enter your question here...',\n required: true,\n rows: 3,\n };\n return (\n <div class=\"test-case-row\" key={testCase.id}>\n <div class=\"test-case-row__input-column\">\n <app-textarea\n config={questionConfig}\n value={testCase.question}\n onValueChange={(e) =>\n handleTestCaseChange({\n detail: {\n testCaseId: testCase.id,\n key: 'question',\n value: e.detail.value,\n },\n } as CustomEvent<{ testCaseId: string; key: string; value: string }>)\n }\n />\n <ExpectedOutcomeRenderer\n testCaseId={testCase.id}\n fields={testCase.expectedOutcome || []}\n onExpectedOutcomeChange={onExpectedOutcomeChange}\n />\n </div>\n\n <ResponseOutput output={testCase.output} isRunning={testCase.isRunning} />\n\n <EvaluationSummary\n result={testCase.evaluationResult}\n isRunning={testCase.isRunning}\n />\n\n <RowActions\n isRunning={testCase.isRunning}\n canRun={!!testCase.question.trim()}\n onRun={() => onRun(testCase)}\n onDelete={() => onDelete(testCase.id)}\n />\n </div>\n );\n};\n"]}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
2
|
import { LLMTestCaseRow } from "./llm-test-case-row";
|
|
3
3
|
import { Button } from "../../../lib/ui/button/index";
|
|
4
|
-
export const LLMTestCases = ({ testCases, onRun, onDelete,
|
|
5
|
-
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete,
|
|
4
|
+
export const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
5
|
+
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
6
6
|
};
|
|
7
7
|
//# sourceMappingURL=llm-test-cases.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm-test-cases.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-cases.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;
|
|
1
|
+
{"version":3,"file":"llm-test-cases.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-cases.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,MAAM,EAAE,MAAM,8BAA8B,CAAC;AAgBtD,MAAM,CAAC,MAAM,YAAY,GAA2C,CAAC,EACnE,SAAS,EACT,KAAK,EACL,QAAQ,EACR,aAAa,EACb,oBAAoB,EACpB,uBAAuB,GACxB,EAAE,EAAE;IACH,OAAO,CACL,WAAK,KAAK,EAAC,YAAY;QACrB,WAAK,KAAK,EAAC,4BAA4B;YACrC,WAAK,KAAK,EAAC,2BAA2B,YAAY;YAClD,WAAK,KAAK,EAAC,2BAA2B,aAAa;YACnD,WAAK,KAAK,EAAC,2BAA2B,iBAAiB;YACvD,WAAK,KAAK,EAAC,2BAA2B,cAAc,CAChD;QAEL,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,CACzB,EAAC,cAAc,IACb,QAAQ,EAAE,QAAQ,EAClB,KAAK,EAAE,KAAK,EACZ,QAAQ,EAAE,QAAQ,EAClB,oBAAoB,EAAE,oBAAoB,EAC1C,uBAAuB,EAAE,uBAAuB,GAChD,CACH,CAAC;QAEF,WAAK,KAAK,EAAC,yBAAyB;YAClC,EAAC,MAAM,IAAC,OAAO,EAAC,SAAS,EAAC,IAAI,EAAC,IAAI,EAAC,OAAO,EAAE,aAAa,qBAEjD,CACL,CACF,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport { TestCase } from '../../../types/llm-test-runner';\nimport { LLMTestCaseRow } from './llm-test-case-row';\nimport { Button } from '../../../lib/ui/button/index';\nimport { ExpectedOutcomeChangeDetail } from './expected-outcome-renderer';\n\nexport interface LLMTestCasesProps {\n testCases: TestCase[];\n onRun: (testCase: TestCase) => void;\n onDelete: (id: string) => void;\n onAddTestCase: () => void;\n handleTestCaseChange: (\n e: CustomEvent<{ testCaseId: string; key: string; value: string }>,\n ) => void;\n onExpectedOutcomeChange: (\n e: CustomEvent<ExpectedOutcomeChangeDetail>,\n ) => void;\n}\n\nexport const LLMTestCases: FunctionalComponent<LLMTestCasesProps> = ({\n testCases,\n onRun,\n onDelete,\n onAddTestCase,\n handleTestCaseChange,\n onExpectedOutcomeChange,\n}) => {\n return (\n <div class=\"test-cases\">\n <div class=\"test-cases__column-headers\">\n <div class=\"test-cases__column-header\">Input</div>\n <div class=\"test-cases__column-header\">Output</div>\n <div class=\"test-cases__column-header\">Evaluation</div>\n <div class=\"test-cases__column-header\">Actions</div>\n </div>\n\n {testCases.map(testCase => (\n <LLMTestCaseRow\n testCase={testCase}\n onRun={onRun}\n onDelete={onDelete}\n handleTestCaseChange={handleTestCaseChange}\n onExpectedOutcomeChange={onExpectedOutcomeChange}\n />\n ))}\n\n <div class=\"test-cases__add-section\">\n <Button variant=\"outline\" size=\"md\" onClick={onAddTestCase}>\n + Add Question\n </Button>\n </div>\n </div>\n );\n};\n"]}
|
|
@@ -6,56 +6,77 @@ import { performRougeLEvaluation } from "./evaluators/rougeL-evaluator";
|
|
|
6
6
|
import { performBleuEvaluation } from "./evaluators/bleu/bleu-evaluator";
|
|
7
7
|
export class LLMEvaluationEngine {
|
|
8
8
|
async evaluateResponse(request, callback) {
|
|
9
|
-
|
|
10
|
-
const
|
|
11
|
-
switch (approach) {
|
|
12
|
-
case EvaluationApproach.BLEU: {
|
|
13
|
-
const bleuResult = performBleuEvaluation(request);
|
|
14
|
-
callback(bleuResult);
|
|
15
|
-
break;
|
|
16
|
-
}
|
|
17
|
-
case EvaluationApproach.EXACT: {
|
|
18
|
-
const exactResult = await performEvaluation(request);
|
|
19
|
-
callback(exactResult);
|
|
20
|
-
break;
|
|
21
|
-
}
|
|
22
|
-
case EvaluationApproach.ROUGE_1: {
|
|
23
|
-
const rougeResult = await performRouge1Evaluation(request);
|
|
24
|
-
callback(rougeResult);
|
|
25
|
-
break;
|
|
26
|
-
}
|
|
27
|
-
case EvaluationApproach.ROUGE_L: {
|
|
28
|
-
const rougeLResult = await performRougeLEvaluation(request);
|
|
29
|
-
callback(rougeLResult);
|
|
30
|
-
break;
|
|
31
|
-
}
|
|
32
|
-
case EvaluationApproach.SEMANTIC: {
|
|
33
|
-
const semanticResult = await performSemanticEvaluation(request);
|
|
34
|
-
callback(semanticResult);
|
|
35
|
-
break;
|
|
36
|
-
}
|
|
37
|
-
default: {
|
|
38
|
-
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
39
|
-
const fallbackResult = await performEvaluation(request);
|
|
40
|
-
callback(fallbackResult);
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
}
|
|
44
|
-
catch (error) {
|
|
45
|
-
console.error('Evaluation failed:', error);
|
|
46
|
-
const errorResult = {
|
|
9
|
+
const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
|
|
10
|
+
const fieldRequest = {
|
|
47
11
|
testCaseId: request.testCaseId,
|
|
12
|
+
question: request.question,
|
|
13
|
+
actualResponse: request.actualResponse,
|
|
14
|
+
expectedOutcome: field.expectedValue,
|
|
15
|
+
evaluationParameters: field.evaluationParameters,
|
|
16
|
+
};
|
|
17
|
+
const result = await this.evaluateField(fieldRequest);
|
|
18
|
+
const fieldResult = {
|
|
19
|
+
index: field.index,
|
|
20
|
+
label: field.label,
|
|
21
|
+
type: field.type,
|
|
22
|
+
expectedValue: field.expectedValue,
|
|
23
|
+
passed: result.passed,
|
|
24
|
+
keywordMatches: result.keywordMatches,
|
|
25
|
+
evaluationParameters: result.evaluationParameters,
|
|
26
|
+
evaluationApproachResult: result.evaluationApproachResult,
|
|
27
|
+
};
|
|
28
|
+
return fieldResult;
|
|
29
|
+
}));
|
|
30
|
+
const fieldResults = settledResults.map((settledResult, index) => {
|
|
31
|
+
const field = request.fields[index];
|
|
32
|
+
if (settledResult.status === 'fulfilled') {
|
|
33
|
+
return settledResult.value;
|
|
34
|
+
}
|
|
35
|
+
return {
|
|
36
|
+
index: field.index,
|
|
37
|
+
label: field.label,
|
|
38
|
+
type: field.type,
|
|
39
|
+
expectedValue: field.expectedValue,
|
|
48
40
|
passed: false,
|
|
49
41
|
keywordMatches: [],
|
|
50
|
-
|
|
51
|
-
evaluationParameters: request.evaluationParameters,
|
|
42
|
+
evaluationParameters: field.evaluationParameters,
|
|
52
43
|
evaluationApproachResult: {
|
|
53
44
|
score: 0,
|
|
54
|
-
approachUsed:
|
|
45
|
+
approachUsed: field.evaluationParameters.approach,
|
|
55
46
|
},
|
|
47
|
+
error: this.getSafeErrorMessage(settledResult.reason),
|
|
56
48
|
};
|
|
57
|
-
|
|
49
|
+
});
|
|
50
|
+
const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
|
|
51
|
+
const passed = fieldResults.every(field => field.passed && !field.error);
|
|
52
|
+
callback({
|
|
53
|
+
testCaseId: request.testCaseId,
|
|
54
|
+
passed,
|
|
55
|
+
keywordMatches,
|
|
56
|
+
fieldResults,
|
|
57
|
+
timestamp: new Date().toISOString(),
|
|
58
|
+
});
|
|
59
|
+
}
|
|
60
|
+
async evaluateField(request) {
|
|
61
|
+
const approach = request.evaluationParameters.approach;
|
|
62
|
+
switch (approach) {
|
|
63
|
+
case EvaluationApproach.BLEU:
|
|
64
|
+
return performBleuEvaluation(request);
|
|
65
|
+
case EvaluationApproach.EXACT:
|
|
66
|
+
return performEvaluation(request);
|
|
67
|
+
case EvaluationApproach.ROUGE_1:
|
|
68
|
+
return performRouge1Evaluation(request);
|
|
69
|
+
case EvaluationApproach.ROUGE_L:
|
|
70
|
+
return performRougeLEvaluation(request);
|
|
71
|
+
case EvaluationApproach.SEMANTIC:
|
|
72
|
+
return performSemanticEvaluation(request);
|
|
73
|
+
default:
|
|
74
|
+
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
75
|
+
return performEvaluation(request);
|
|
58
76
|
}
|
|
59
77
|
}
|
|
78
|
+
getSafeErrorMessage(error) {
|
|
79
|
+
return error instanceof Error ? error.message : 'Field evaluation failed.';
|
|
80
|
+
}
|
|
60
81
|
}
|
|
61
82
|
//# sourceMappingURL=evaluation-engine.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluation-engine.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-engine.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"evaluation-engine.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-engine.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,yBAAyB,EAAE,MAAM,6BAA6B,CAAC;AACxE,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AAEzE,MAAM,OAAO,mBAAmB;IAC9B,KAAK,CAAC,gBAAgB,CACpB,OAA4B,EAC5B,QAA4B;QAE5B,MAAM,cAAc,GAAG,MAAM,OAAO,CAAC,UAAU,CAC7C,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,EAAC,KAAK,EAAC,EAAE;YAC/B,MAAM,YAAY,GAAsB;gBACtC,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,QAAQ,EAAE,OAAO,CAAC,QAAQ;gBAC1B,cAAc,EAAE,OAAO,CAAC,cAAc;gBACtC,eAAe,EAAE,KAAK,CAAC,aAAa;gBACpC,oBAAoB,EAAE,KAAK,CAAC,oBAAoB;aACjD,CAAC;YACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,YAAY,CAAC,CAAC;YAEtD,MAAM,WAAW,GAA0B;gBACzC,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,aAAa,EAAE,KAAK,CAAC,aAAa;gBAClC,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,cAAc,EAAE,MAAM,CAAC,cAAc;gBACrC,oBAAoB,EAAE,MAAM,CAAC,oBAAqB;gBAClD,wBAAwB,EAAE,MAAM,CAAC,wBAAwB;aAC1D,CAAC;YACF,OAAO,WAAW,CAAC;QACrB,CAAC,CAAC,CACH,CAAC;QAEF,MAAM,YAAY,GAA4B,cAAc,CAAC,GAAG,CAC9D,CAAC,aAAa,EAAE,KAAK,EAAE,EAAE;YACvB,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACpC,IAAI,aAAa,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;gBACzC,OAAO,aAAa,CAAC,KAAK,CAAC;YAC7B,CAAC;YAED,OAAO;gBACL,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,aAAa,EAAE,KAAK,CAAC,aAAa;gBAClC,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,oBAAoB,EAAE,KAAK,CAAC,oBAAoB;gBAChD,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,KAAK,CAAC,oBAAoB,CAAC,QAAQ;iBAClD;gBACD,KAAK,EAAE,IAAI,CAAC,mBAAmB,CAAC,aAAa,CAAC,MAAM,CAAC;aACtD,CAAC;QACJ,CAAC,CACF,CAAC;QAEF,MAAM,cAAc,GAAG,YAAY,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC3E,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAEzE,QAAQ,CAAC;YACP,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,MAAM;YACN,cAAc;YACd,YAAY;YACZ,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAC,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,aAAa,CAAC,OAA0B;QACpD,MAAM,QAAQ,GAAuB,OAAO,CAAC,oBAAoB,CAAC,QAAQ,CAAC;QAC3E,QAAQ,QAAQ,EAAE,CAAC;YACjB,KAAK,kBAAkB,CAAC,IAAI;gBAC1B,OAAO,qBAAqB,CAAC,OAAO,CAAC,CAAC;YACxC,KAAK,kBAAkB,CAAC,KAAK;gBAC3B,OAAO,iBAAiB,CAAC,OAAO,CAAC,CAAC;YACpC,KAAK,kBAAkB,CAAC,OAAO;gBAC7B,OAAO,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAC1C,KAAK,kBAAkB,CAAC,OAAO;gBAC7B,OAAO,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAC1C,KAAK,kBAAkB,CAAC,QAAQ;gBAC9B,OAAO,yBAAyB,CAAC,OAAO,CAAC,CAAC;YAC5C;gBACE,OAAO,CAAC,IAAI,CACV,8BAA8B,OAAO,CAAC,oBAAoB,CAAC,QAAQ,kCAAkC,CACtG,CAAC;gBACF,OAAO,iBAAiB,CAAC,OAAO,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAEO,mBAAmB,CAAC,KAAc;QACxC,OAAO,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,0BAA0B,CAAC;IAC7E,CAAC;CACF","sourcesContent":["import {\n EvaluationRequest,\n EvaluationResult,\n EvaluationCallback,\n FieldEvaluationResult,\n EvaluationRequestV2,\n} from './types';\nimport { performEvaluation } from './evaluators/exact/exact';\nimport { EvaluationApproach } from './constants';\nimport { performRouge1Evaluation } from './evaluators/rouge1-evaluator';\nimport { performSemanticEvaluation } from './evaluators/semantic/index';\nimport { performRougeLEvaluation } from './evaluators/rougeL-evaluator';\nimport { performBleuEvaluation } from './evaluators/bleu/bleu-evaluator';\n\nexport class LLMEvaluationEngine {\n async evaluateResponse(\n request: EvaluationRequestV2,\n callback: EvaluationCallback,\n ): Promise<void> {\n const settledResults = await Promise.allSettled(\n request.fields.map(async field => {\n const fieldRequest: EvaluationRequest = {\n testCaseId: request.testCaseId,\n question: request.question,\n actualResponse: request.actualResponse,\n expectedOutcome: field.expectedValue,\n evaluationParameters: field.evaluationParameters,\n };\n const result = await this.evaluateField(fieldRequest);\n\n const fieldResult: FieldEvaluationResult = {\n index: field.index,\n label: field.label,\n type: field.type,\n expectedValue: field.expectedValue,\n passed: result.passed,\n keywordMatches: result.keywordMatches,\n evaluationParameters: result.evaluationParameters!,\n evaluationApproachResult: result.evaluationApproachResult,\n };\n return fieldResult;\n }),\n );\n\n const fieldResults: FieldEvaluationResult[] = settledResults.map(\n (settledResult, index) => {\n const field = request.fields[index];\n if (settledResult.status === 'fulfilled') {\n return settledResult.value;\n }\n\n return {\n index: field.index,\n label: field.label,\n type: field.type,\n expectedValue: field.expectedValue,\n passed: false,\n keywordMatches: [],\n evaluationParameters: field.evaluationParameters,\n evaluationApproachResult: {\n score: 0,\n approachUsed: field.evaluationParameters.approach,\n },\n error: this.getSafeErrorMessage(settledResult.reason),\n };\n },\n );\n\n const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);\n const passed = fieldResults.every(field => field.passed && !field.error);\n\n callback({\n testCaseId: request.testCaseId,\n passed,\n keywordMatches,\n fieldResults,\n timestamp: new Date().toISOString(),\n });\n }\n\n private async evaluateField(request: EvaluationRequest): Promise<EvaluationResult> {\n const approach: EvaluationApproach = request.evaluationParameters.approach;\n switch (approach) {\n case EvaluationApproach.BLEU:\n return performBleuEvaluation(request);\n case EvaluationApproach.EXACT:\n return performEvaluation(request);\n case EvaluationApproach.ROUGE_1:\n return performRouge1Evaluation(request);\n case EvaluationApproach.ROUGE_L:\n return performRougeLEvaluation(request);\n case EvaluationApproach.SEMANTIC:\n return performSemanticEvaluation(request);\n default:\n console.warn(\n `Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`,\n );\n return performEvaluation(request);\n }\n }\n\n private getSafeErrorMessage(error: unknown): string {\n return error instanceof Error ? error.message : 'Field evaluation failed.';\n }\n}\n"]}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { LLMEvaluationEngine } from "./evaluation-engine";
|
|
2
|
-
import {
|
|
2
|
+
import { normalizeEvaluationParametersForField } from "./field-evaluation-approach";
|
|
3
3
|
/**
|
|
4
4
|
* Service for evaluating test case responses
|
|
5
5
|
*/
|
|
@@ -18,12 +18,18 @@ export class EvaluationService {
|
|
|
18
18
|
console.warn('⚠️ No output to evaluate for test case:', testCase.id);
|
|
19
19
|
return;
|
|
20
20
|
}
|
|
21
|
+
const fields = (testCase.expectedOutcome || []).map((field, index) => ({
|
|
22
|
+
index,
|
|
23
|
+
label: field.label,
|
|
24
|
+
type: field.type,
|
|
25
|
+
expectedValue: getFieldExpectedValue(field),
|
|
26
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
27
|
+
}));
|
|
21
28
|
const evaluationRequest = {
|
|
22
29
|
testCaseId: testCase.id,
|
|
23
30
|
question: testCase.question,
|
|
24
|
-
expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
|
|
25
31
|
actualResponse: testCase.output,
|
|
26
|
-
|
|
32
|
+
fields,
|
|
27
33
|
};
|
|
28
34
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
29
35
|
console.log('📊 Evaluation result received:', result);
|
|
@@ -31,4 +37,10 @@ export class EvaluationService {
|
|
|
31
37
|
});
|
|
32
38
|
}
|
|
33
39
|
}
|
|
40
|
+
function getFieldExpectedValue(field) {
|
|
41
|
+
if (field.type === 'chips-input') {
|
|
42
|
+
return field.value.join(', ');
|
|
43
|
+
}
|
|
44
|
+
return field.value;
|
|
45
|
+
}
|
|
34
46
|
//# sourceMappingURL=evaluation-service.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluation-service.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;
|
|
1
|
+
{"version":3,"file":"evaluation-service.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAO1D,OAAO,EAAE,qCAAqC,EAAE,MAAM,6BAA6B,CAAC;AAEpF;;GAEG;AACH,MAAM,OAAO,iBAAiB;IACpB,MAAM,CAAsB;IAEpC;QACE,IAAI,CAAC,MAAM,GAAG,IAAI,mBAAmB,EAAE,CAAC;IAC1C,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,gBAAgB,CACpB,QAAkB,EAClB,QAA4C;QAE5C,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC;YACrB,OAAO,CAAC,IAAI,CAAC,yCAAyC,EAAE,QAAQ,CAAC,EAAE,CAAC,CAAC;YACrE,OAAO;QACT,CAAC;QAED,MAAM,MAAM,GAA2B,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,GAAG,CACzE,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE,CAAC,CAAC;YACjB,KAAK;YACL,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,IAAI,EAAE,KAAK,CAAC,IAAI;YAChB,aAAa,EAAE,qBAAqB,CAAC,KAAK,CAAC;YAC3C,oBAAoB,EAAE,qCAAqC,CACzD,KAAK,CAAC,IAAI,EACV,KAAK,CAAC,oBAAoB,CAC3B;SACF,CAAC,CACH,CAAC;QAEF,MAAM,iBAAiB,GAAwB;YAC7C,UAAU,EAAE,QAAQ,CAAC,EAAE;YACvB,QAAQ,EAAE,QAAQ,CAAC,QAAQ;YAC3B,cAAc,EAAE,QAAQ,CAAC,MAAM;YAC/B,MAAM;SACP,CAAC;QAEF,MAAM,IAAI,CAAC,MAAM,CAAC,gBAAgB,CAChC,iBAAiB,EACjB,CAAC,MAAwB,EAAE,EAAE;YAC3B,OAAO,CAAC,GAAG,CAAC,gCAAgC,EAAE,MAAM,CAAC,CAAC;YACtD,QAAQ,CAAC,MAAM,CAAC,CAAC;QACnB,CAAC,CACF,CAAC;IACJ,CAAC;CACF;AAED,SAAS,qBAAqB,CAAC,KAA2B;IACxD,IAAI,KAAK,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;QACjC,OAAO,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAChC,CAAC;IACD,OAAO,KAAK,CAAC,KAAK,CAAC;AACrB,CAAC","sourcesContent":["import { LLMEvaluationEngine } from './evaluation-engine';\nimport {\n EvaluationResult,\n FieldEvaluationInput,\n EvaluationRequestV2,\n} from './types';\nimport { TestCase, ExpectedOutcomeField } from '../../types/llm-test-runner';\nimport { normalizeEvaluationParametersForField } from './field-evaluation-approach';\n\n/**\n * Service for evaluating test case responses\n */\nexport class EvaluationService {\n private engine: LLMEvaluationEngine;\n\n constructor() {\n this.engine = new LLMEvaluationEngine();\n }\n\n /**\n * Evaluates a test case response\n * @param testCase - The test case to evaluate\n * @param onResult - Callback to handle the evaluation result\n */\n async evaluateTestCase(\n testCase: TestCase,\n onResult: (result: EvaluationResult) => void,\n ): Promise<void> {\n if (!testCase.output) {\n console.warn('⚠️ No output to evaluate for test case:', testCase.id);\n return;\n }\n\n const fields: FieldEvaluationInput[] = (testCase.expectedOutcome || []).map(\n (field, index) => ({\n index,\n label: field.label,\n type: field.type,\n expectedValue: getFieldExpectedValue(field),\n evaluationParameters: normalizeEvaluationParametersForField(\n field.type,\n field.evaluationParameters,\n ),\n }),\n );\n\n const evaluationRequest: EvaluationRequestV2 = {\n testCaseId: testCase.id,\n question: testCase.question,\n actualResponse: testCase.output,\n fields,\n };\n\n await this.engine.evaluateResponse(\n evaluationRequest,\n (result: EvaluationResult) => {\n console.log('📊 Evaluation result received:', result);\n onResult(result);\n },\n );\n }\n}\n\nfunction getFieldExpectedValue(field: ExpectedOutcomeField): string {\n if (field.type === 'chips-input') {\n return field.value.join(', ');\n }\n return field.value;\n}\n"]}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { describe, it, expect } from "@jest/globals";
|
|
2
|
-
import { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from "
|
|
2
|
+
import { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from "../constants";
|
|
3
3
|
// Using integration tests with actual js-rouge library (no mocks).
|
|
4
4
|
// This approach tests the real ROUGE-1 scoring behavior rather than just orchestration logic.
|
|
5
|
-
import { performRouge1Evaluation } from "./
|
|
5
|
+
import { performRouge1Evaluation } from "./rouge1-evaluator";
|
|
6
6
|
const mockRequest = {
|
|
7
7
|
testCaseId: 'test-000',
|
|
8
8
|
question: 'What is your name?',
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rouge1-evaluator.test.js","sourceRoot":"","sources":["../../../../src/lib/evaluation/evaluators/rouge1-evaluator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAErD,OAAO,EAAE,wBAAwB,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;AAC5E,mEAAmE;AACnE,8FAA8F;AAC9F,OAAO,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAC;AAE7D,MAAM,WAAW,GAAsB;IACrC,UAAU,EAAE,UAAU;IACtB,QAAQ,EAAE,oBAAoB;IAC9B,cAAc,EAAE,6BAA6B;IAC7C,eAAe,EAAE,iBAAiB;IAClC,oBAAoB,EAAE;QACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;QACpC,SAAS,EAAE,GAAG;KACf;CACF,CAAC;AAEF,MAAM,sBAAsB,GAAsB;IAChD,GAAG,WAAW;IACd,oBAAoB,EAAE;QACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;QACpC,SAAS,EAAE,SAAS;KACrB;CACF,CAAC;AAEF,QAAQ,CAAC,yBAAyB,EAAE,GAAG,EAAE;IACvC,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,EAAE,CAAC,0DAA0D,EAAE,KAAK,IAAI,EAAE;YACxE,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,iBAAiB;aACnC,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;YACvB,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QACzB,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;YACtE,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,oDAAoD;gBACpE,eAAe,EAAE,2CAA2C;aAC7D,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YACpB,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;YAC5E,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,oCAAoC;aACtD,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC;YAC9B,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,sBAAsB,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAChD,wBAAwB,CACzB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,yCAAyC;gBACzD,eAAe,EAAE,8BAA8B;gBAC/C,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;oBACpC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7D,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,kBAAkB;gBACnC,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;oBACpC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACxD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;QAC1B,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,MAAM,OAAO,GAAG,EAAE,GAAG,WAAW,EAAE,cAAc,EAAE,EAAE,EAAE,CAAC;YAEvD,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACxE,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC1E,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,OAAO,GAAG,EAAE,GAAG,WAAW,EAAE,eAAe,EAAE,EAAE,EAAE,CAAC;YAExD,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC","sourcesContent":["import { describe, it, expect } from '@jest/globals';\nimport { EvaluationRequest } from '../types';\nimport { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from '../constants';\n// Using integration tests with actual js-rouge library (no mocks).\n// This approach tests the real ROUGE-1 scoring behavior rather than just orchestration logic.\nimport { performRouge1Evaluation } from './rouge1-evaluator';\n\nconst mockRequest: EvaluationRequest = {\n testCaseId: 'test-000',\n question: 'What is your name?',\n actualResponse: 'I am a large language model',\n expectedOutcome: 'model\\nlanguage',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 0.5,\n },\n};\n\nconst mockRequestNoThreshold: EvaluationRequest = {\n ...mockRequest,\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: undefined,\n },\n};\n\ndescribe('performRouge1Evaluation', () => {\n describe('Basic functionality', () => {\n it('should pass when response contains exact keyword matches', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is a language model system',\n expectedOutcome: 'language\\nmodel',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.length).toBe(2);\n expect(result.keywordMatches[0].found).toBe(true);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeGreaterThan(0.5);\n expect(result.keywordMatches[1].found).toBe(true);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeGreaterThan(0.5);\n });\n\n it('should fail when keywords are not sufficiently present', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is completely unrelated content about cooking',\n expectedOutcome: 'machine learning\\nartificial intelligence',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].found).toBe(false);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n expect(result.keywordMatches[1].found).toBe(false);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n });\n\n it('should partially pass when only some keywords meet threshold', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'Machine learning is fascinating',\n expectedOutcome: 'machine learning\\ndatabase systems',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].found).toBe(true);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeGreaterThanOrEqual(0.5);\n expect(result.keywordMatches[1].found).toBe(false);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n });\n });\n\n describe('Threshold handling', () => {\n it('should use default threshold when not provided', async () => {\n const result = await performRouge1Evaluation(mockRequestNoThreshold);\n\n expect(result.evaluationParameters.threshold).toBe(\n DEFAULT_ROUGE_PASS_SCORE,\n );\n });\n\n it('should pass all keywords with threshold 0.0', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'completely unrelated text about cooking',\n expectedOutcome: 'quantum physics\\nmathematics',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 0.0,\n },\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.every(m => m.found)).toBe(true);\n expect(result.evaluationParameters.threshold).toBe(0.0);\n });\n\n it('should fail when threshold is 1.0 and match is not perfect', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is about learning concepts',\n expectedOutcome: 'machine learning',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 1.0,\n },\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(1.0);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(1.0);\n });\n });\n\n describe('Edge cases', () => {\n it('should handle empty actualResponse', async () => {\n const request = { ...mockRequest, actualResponse: '' };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBe(0);\n expect(result.keywordMatches[1].evaluationApproachResult.score).toBe(0);\n });\n\n it('should handle empty expectedOutcome string', async () => {\n const request = { ...mockRequest, expectedOutcome: '' };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.length).toBe(0);\n });\n });\n});\n"]}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { EvaluationApproach, EvaluationApproachValues } from "./constants";
|
|
2
|
+
const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
|
|
3
|
+
export function getAllowedApproachesForFieldType(fieldType) {
|
|
4
|
+
if (fieldType === 'select') {
|
|
5
|
+
return SELECT_ONLY_APPROACHES;
|
|
6
|
+
}
|
|
7
|
+
return EvaluationApproachValues;
|
|
8
|
+
}
|
|
9
|
+
export function isApproachAllowedForFieldType(fieldType, approach) {
|
|
10
|
+
return getAllowedApproachesForFieldType(fieldType).includes(approach);
|
|
11
|
+
}
|
|
12
|
+
export function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
|
|
13
|
+
const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
|
|
14
|
+
const fallbackApproach = allowedApproaches[0];
|
|
15
|
+
const rawApproach = evaluationParameters?.approach;
|
|
16
|
+
const approach = rawApproach && allowedApproaches.includes(rawApproach)
|
|
17
|
+
? rawApproach
|
|
18
|
+
: fallbackApproach;
|
|
19
|
+
return {
|
|
20
|
+
...evaluationParameters,
|
|
21
|
+
approach,
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
//# sourceMappingURL=field-evaluation-approach.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"field-evaluation-approach.js","sourceRoot":"","sources":["../../../src/lib/evaluation/field-evaluation-approach.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;AAK3E,MAAM,sBAAsB,GAAyB,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC;AAEhF,MAAM,UAAU,gCAAgC,CAC9C,SAA8B;IAE9B,IAAI,SAAS,KAAK,QAAQ,EAAE,CAAC;QAC3B,OAAO,sBAAsB,CAAC;IAChC,CAAC;IACD,OAAO,wBAAwB,CAAC;AAClC,CAAC;AAED,MAAM,UAAU,6BAA6B,CAC3C,SAA8B,EAC9B,QAA4B;IAE5B,OAAO,gCAAgC,CAAC,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AACxE,CAAC;AAED,MAAM,UAAU,qCAAqC,CACnD,SAA8B,EAC9B,oBAA2C;IAE3C,MAAM,iBAAiB,GAAG,gCAAgC,CAAC,SAAS,CAAC,CAAC;IACtE,MAAM,gBAAgB,GAAG,iBAAiB,CAAC,CAAC,CAAC,CAAC;IAC9C,MAAM,WAAW,GAAG,oBAAoB,EAAE,QAAQ,CAAC;IACnD,MAAM,QAAQ,GACZ,WAAW,IAAI,iBAAiB,CAAC,QAAQ,CAAC,WAAW,CAAC;QACpD,CAAC,CAAC,WAAW;QACb,CAAC,CAAC,gBAAgB,CAAC;IAEvB,OAAO;QACL,GAAG,oBAAoB;QACvB,QAAQ;KACT,CAAC;AACJ,CAAC","sourcesContent":["import { EvaluationApproach, EvaluationApproachValues } from './constants';\nimport type { EvaluationParameters } from '../../types/evaluation';\n\nexport type EvaluationFieldType = 'text' | 'textarea' | 'chips-input' | 'select';\n\nconst SELECT_ONLY_APPROACHES: EvaluationApproach[] = [EvaluationApproach.EXACT];\n\nexport function getAllowedApproachesForFieldType(\n fieldType: EvaluationFieldType,\n): EvaluationApproach[] {\n if (fieldType === 'select') {\n return SELECT_ONLY_APPROACHES;\n }\n return EvaluationApproachValues;\n}\n\nexport function isApproachAllowedForFieldType(\n fieldType: EvaluationFieldType,\n approach: EvaluationApproach,\n): boolean {\n return getAllowedApproachesForFieldType(fieldType).includes(approach);\n}\n\nexport function normalizeEvaluationParametersForField(\n fieldType: EvaluationFieldType,\n evaluationParameters?: EvaluationParameters,\n): EvaluationParameters {\n const allowedApproaches = getAllowedApproachesForFieldType(fieldType);\n const fallbackApproach = allowedApproaches[0];\n const rawApproach = evaluationParameters?.approach;\n const approach =\n rawApproach && allowedApproaches.includes(rawApproach)\n ? rawApproach\n : fallbackApproach;\n\n return {\n ...evaluationParameters,\n approach,\n };\n}\n\n"]}
|
|
@@ -1,7 +1,3 @@
|
|
|
1
1
|
import { LLMEvaluationEngine } from "./evaluation-engine";
|
|
2
2
|
export { LLMEvaluationEngine };
|
|
3
|
-
export async function evaluateLLMResponse(request, callback) {
|
|
4
|
-
const engine = new LLMEvaluationEngine();
|
|
5
|
-
await engine.evaluateResponse(request, callback);
|
|
6
|
-
}
|
|
7
3
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/lib/evaluation/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAQ1D,OAAO,EAAE,mBAAmB,EAAE,CAAC
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/lib/evaluation/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAQ1D,OAAO,EAAE,mBAAmB,EAAE,CAAC","sourcesContent":["import { LLMEvaluationEngine } from './evaluation-engine';\nimport type {\n EvaluationRequest,\n EvaluationResult,\n KeywordMatch,\n EvaluationCallback,\n} from './types';\n\nexport { LLMEvaluationEngine };\nexport type {\n EvaluationRequest,\n EvaluationResult,\n KeywordMatch,\n EvaluationCallback,\n};"]}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult,\n} from '../../types/evaluation';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedOutcome: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationResult {\n testCaseId: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n timestamp?: string;\n evaluationParameters: EvaluationParameters;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport interface KeywordMatch {\n keyword: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport type EvaluationCallback = (result: EvaluationResult) => void;\n\nexport interface RougeKeywordDetails {\n rouge1: number;\n rougeL: number;\n scoreUsed: string;\n approach: string;\n}\n\nexport interface Rouge1OverallDetails {\n keywordsPassed: number;\n totalKeywords: number;\n passRate: string;\n thresholdUsed: number;\n approach: string;\n}\n"]}
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult,\n} from '../../types/evaluation';\nimport type { ExpectedOutcomeFieldType } from '../../types/llm-test-runner';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedOutcome: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface FieldEvaluationInput {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationRequestV2 {\n testCaseId: string;\n question: string;\n actualResponse: string;\n fields: FieldEvaluationInput[];\n}\n\nexport interface EvaluationResult {\n testCaseId: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n fieldResults?: FieldEvaluationResult[];\n timestamp?: string;\n evaluationParameters?: EvaluationParameters;\n evaluationApproachResult?: EvaluationApproachResult;\n}\n\nexport interface FieldEvaluationResult {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n evaluationParameters: EvaluationParameters;\n evaluationApproachResult: EvaluationApproachResult;\n error?: string;\n}\n\nexport interface KeywordMatch {\n keyword: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport type EvaluationCallback = (result: EvaluationResult) => void;\n\nexport interface RougeKeywordDetails {\n rouge1: number;\n rougeL: number;\n scoreUsed: string;\n approach: string;\n}\n\nexport interface Rouge1OverallDetails {\n keywordsPassed: number;\n totalKeywords: number;\n passRate: string;\n thresholdUsed: number;\n approach: string;\n}\n"]}
|