llm-testrunner-components 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +165 -242
- package/dist/cjs/index.cjs.js +305 -237
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-testrunner.cjs.js +1 -1
- package/dist/cjs/loader.cjs.js +1 -1
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +2 -2
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +27 -49
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +4 -3
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -1
- package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js +0 -4
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-results-csv.js +47 -33
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +20 -2
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/test-case.js +2 -20
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/collection/types/test-case.js.map +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-JPMPoOC8.js +7 -0
- package/dist/components/p-JPMPoOC8.js.map +1 -0
- package/dist/esm/index.js +305 -237
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-testrunner.js +1 -1
- package/dist/esm/loader.js +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +1 -0
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +1 -1
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
- package/dist/types/components.d.ts +9 -0
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
- package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
- package/dist/types/lib/evaluation/index.d.ts +0 -1
- package/dist/types/lib/evaluation/types.d.ts +26 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
- package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
- package/dist/types/schemas/expected-outcome.d.ts +65 -17
- package/dist/types/schemas/test-case.d.ts +51 -95
- package/dist/types/types/llm-test-runner.d.ts +1 -1
- package/dist/types/types/test-case.d.ts +1 -1
- package/package.json +9 -2
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
- package/dist/components/p-BF90yb1z.js +0 -7
- package/dist/components/p-BF90yb1z.js.map +0 -1
- /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css
CHANGED
|
@@ -7,49 +7,88 @@
|
|
|
7
7
|
flex-direction: column;
|
|
8
8
|
}
|
|
9
9
|
|
|
10
|
-
.evaluation-
|
|
10
|
+
.evaluation-summary__field-results {
|
|
11
11
|
display: flex;
|
|
12
12
|
flex-direction: column;
|
|
13
13
|
gap: var(--spacing-2);
|
|
14
|
+
margin-top: var(--spacing-2);
|
|
14
15
|
}
|
|
15
16
|
|
|
16
|
-
.evaluation-
|
|
17
|
+
.evaluation-summary__field-result {
|
|
18
|
+
border: var(--border-width) solid var(--border);
|
|
19
|
+
border-radius: var(--radius-md);
|
|
20
|
+
padding: var(--spacing-2);
|
|
17
21
|
display: flex;
|
|
18
|
-
|
|
19
|
-
|
|
22
|
+
flex-direction: column;
|
|
23
|
+
gap: var(--spacing-1);
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
.evaluation-summary__field-header {
|
|
27
|
+
display: flex;
|
|
28
|
+
flex-direction: column;
|
|
29
|
+
gap: var(--spacing-1);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
.evaluation-summary__field-label {
|
|
33
|
+
font-weight: var(--font-weight-semibold);
|
|
34
|
+
font-size: var(--font-size-xs);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
.evaluation-summary__field-approach {
|
|
20
38
|
color: var(--muted-foreground);
|
|
21
|
-
font-
|
|
22
|
-
flex: 1;
|
|
23
|
-
background: var(--muted);
|
|
24
|
-
border: 2px dashed var(--border);
|
|
25
|
-
border-radius: var(--radius);
|
|
39
|
+
font-size: 11px;
|
|
26
40
|
}
|
|
27
41
|
|
|
28
|
-
|
|
29
|
-
.evaluation-summary__result {
|
|
42
|
+
.evaluation-summary__field-details {
|
|
30
43
|
display: flex;
|
|
31
44
|
flex-direction: column;
|
|
32
|
-
gap: var(--spacing-
|
|
45
|
+
gap: var(--spacing-1);
|
|
46
|
+
font-size: var(--font-size-xs);
|
|
33
47
|
}
|
|
34
48
|
|
|
35
|
-
.evaluation-
|
|
49
|
+
.evaluation-summary__field-status {
|
|
50
|
+
width: fit-content;
|
|
51
|
+
padding: 2px var(--spacing-2);
|
|
52
|
+
border-radius: var(--radius-sm);
|
|
53
|
+
font-size: 11px;
|
|
36
54
|
font-weight: var(--font-weight-semibold);
|
|
37
|
-
|
|
38
|
-
padding: var(--spacing-2) var(--spacing-3);
|
|
39
|
-
border-radius: var(--radius-md);
|
|
40
|
-
text-align: center;
|
|
55
|
+
border: var(--border-width) solid transparent;
|
|
41
56
|
}
|
|
42
57
|
|
|
43
|
-
.evaluation-
|
|
58
|
+
.evaluation-summary__field-status--passed {
|
|
44
59
|
background: var(--success);
|
|
45
60
|
color: var(--success-foreground);
|
|
46
|
-
border: var(--
|
|
61
|
+
border-color: var(--success);
|
|
47
62
|
}
|
|
48
63
|
|
|
49
|
-
.evaluation-
|
|
64
|
+
.evaluation-summary__field-status--failed {
|
|
50
65
|
background: var(--destructive);
|
|
51
66
|
color: var(--destructive-foreground);
|
|
52
|
-
border: var(--
|
|
67
|
+
border-color: var(--destructive);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
.evaluation-summary__error-message {
|
|
71
|
+
color: var(--destructive);
|
|
72
|
+
font-size: var(--font-size-xs);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
.evaluation-summary__placeholder {
|
|
76
|
+
display: flex;
|
|
77
|
+
align-items: center;
|
|
78
|
+
justify-content: center;
|
|
79
|
+
color: var(--muted-foreground);
|
|
80
|
+
font-style: italic;
|
|
81
|
+
flex: 1;
|
|
82
|
+
background: var(--muted);
|
|
83
|
+
border: 2px dashed var(--border);
|
|
84
|
+
border-radius: var(--radius);
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/* Evaluation Result Element */
|
|
88
|
+
.evaluation-summary__result {
|
|
89
|
+
display: flex;
|
|
90
|
+
flex-direction: column;
|
|
91
|
+
gap: var(--spacing-2);
|
|
53
92
|
}
|
|
54
93
|
|
|
55
94
|
/* Responsive Design */
|
package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
2
|
export const EvaluationSummary = ({ result, isRunning, }) => {
|
|
3
|
-
|
|
3
|
+
const fieldResults = result?.fieldResults || [];
|
|
4
|
+
const hasFieldResults = fieldResults.length > 0;
|
|
5
|
+
return (h("div", { class: "evaluation-summary" }, result ? (h("div", { class: "evaluation-summary__result" }, hasFieldResults ? (h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(fieldResult => (h("div", { class: "evaluation-summary__field-result" }, h("div", { class: "evaluation-summary__field-header" }, h("span", { class: "evaluation-summary__field-label" }, fieldResult.label), h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", fieldResult.evaluationParameters.approach)), h("div", { class: "evaluation-summary__field-details" }, h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'), fieldResult.error && (h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)), h("span", null, "Score: ", fieldResult.evaluationApproachResult.score.toFixed(2)), h("span", null, "Matches:", ' ', fieldResult.keywordMatches.filter(match => match.found).length, "/", fieldResult.keywordMatches.length))))))) : null)) : (h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
|
|
4
6
|
};
|
|
5
7
|
//# sourceMappingURL=evaluation-summary.js.map
|
package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluation-summary.js","sourceRoot":"","sources":["../../../../../src/components/llm-test-runner/test-cases/evaluation/evaluation-summary.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAQvD,MAAM,CAAC,MAAM,iBAAiB,GAAgD,CAAC,EAC7E,MAAM,EACN,SAAS,GACV,EAAE,EAAE;IACH,OAAO,CACL,WAAK,KAAK,EAAC,oBAAoB,IAC5B,MAAM,CAAC,CAAC,CAAC,CACR,WAAK,KAAK,EAAC,4BAA4B;
|
|
1
|
+
{"version":3,"file":"evaluation-summary.js","sourceRoot":"","sources":["../../../../../src/components/llm-test-runner/test-cases/evaluation/evaluation-summary.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAQvD,MAAM,CAAC,MAAM,iBAAiB,GAAgD,CAAC,EAC7E,MAAM,EACN,SAAS,GACV,EAAE,EAAE;IACH,MAAM,YAAY,GAAG,MAAM,EAAE,YAAY,IAAI,EAAE,CAAC;IAChD,MAAM,eAAe,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC;IAEhD,OAAO,CACL,WAAK,KAAK,EAAC,oBAAoB,IAC5B,MAAM,CAAC,CAAC,CAAC,CACR,WAAK,KAAK,EAAC,4BAA4B,IACpC,eAAe,CAAC,CAAC,CAAC,CACjB,WAAK,KAAK,EAAC,mCAAmC,IAC3C,YAAY,CAAC,GAAG,CAAC,WAAW,CAAC,EAAE,CAAC,CAC/B,WAAK,KAAK,EAAC,kCAAkC;QAC3C,WAAK,KAAK,EAAC,kCAAkC;YAC3C,YAAM,KAAK,EAAC,iCAAiC,IAC1C,WAAW,CAAC,KAAK,CACb;YACP,YAAM,KAAK,EAAC,oCAAoC;;gBACnC,WAAW,CAAC,oBAAoB,CAAC,QAAQ,CAC/C,CACH;QACN,WAAK,KAAK,EAAC,mCAAmC;YAC5C,YACE,KAAK,EAAE,sEAAsE,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,EAAE,IAEtH,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CACpC;YACN,WAAW,CAAC,KAAK,IAAI,CACpB,YAAM,KAAK,EAAC,mCAAmC,IAC5C,WAAW,CAAC,KAAK,CACb,CACR;YACD;;gBACU,WAAW,CAAC,wBAAwB,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CACxD;YACP;;gBACW,GAAG;gBACX,WAAW,CAAC,cAAc,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM;;gBAC9D,WAAW,CAAC,cAAc,CAAC,MAAM,CAC7B,CACH,CACF,CACP,CAAC,CACE,CACP,CAAC,CAAC,CAAC,IAAI,CACJ,CACP,CAAC,CAAC,CAAC,CACF,WAAK,KAAK,EAAC,iCAAiC,IACzC,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,EAAE,CAC7B,CACP,CACG,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport { EvaluationResult } from '../../../../lib/evaluation/types';\n\nexport interface EvaluationSummaryProps {\n result?: EvaluationResult;\n isRunning: boolean;\n}\n\nexport const EvaluationSummary: FunctionalComponent<EvaluationSummaryProps> = ({\n result,\n isRunning,\n}) => {\n const fieldResults = result?.fieldResults || [];\n const hasFieldResults = fieldResults.length > 0;\n\n return (\n <div class=\"evaluation-summary\">\n {result ? (\n <div class=\"evaluation-summary__result\">\n {hasFieldResults ? (\n <div class=\"evaluation-summary__field-results\">\n {fieldResults.map(fieldResult => (\n <div class=\"evaluation-summary__field-result\">\n <div class=\"evaluation-summary__field-header\">\n <span class=\"evaluation-summary__field-label\">\n {fieldResult.label}\n </span>\n <span class=\"evaluation-summary__field-approach\">\n Strategy: {fieldResult.evaluationParameters.approach}\n </span>\n </div>\n <div class=\"evaluation-summary__field-details\">\n <span\n class={`evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}`}\n >\n {fieldResult.passed ? 'PASSED' : 'FAILED'}\n </span>\n {fieldResult.error && (\n <span class=\"evaluation-summary__error-message\">\n {fieldResult.error}\n </span>\n )}\n <span>\n Score: {fieldResult.evaluationApproachResult.score.toFixed(2)}\n </span>\n <span>\n Matches:{' '}\n {fieldResult.keywordMatches.filter(match => match.found).length}/\n {fieldResult.keywordMatches.length}\n </span>\n </div>\n </div>\n ))}\n </div>\n ) : null}\n </div>\n ) : (\n <div class=\"evaluation-summary__placeholder\">\n {isRunning ? 'Evaluating...' : ''}\n </div>\n )}\n </div>\n );\n};\n"]}
|
|
@@ -1,9 +1,29 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
2
|
import { FormFieldType } from "../../../lib/form/schema";
|
|
3
|
+
import { EvaluationApproach, } from "../../../lib/evaluation/constants";
|
|
4
|
+
import { getAllowedApproachesForFieldType } from "../../../lib/evaluation/field-evaluation-approach";
|
|
3
5
|
export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange, }) => {
|
|
4
6
|
const emit = (detail) => onExpectedOutcomeChange({
|
|
5
7
|
detail,
|
|
6
8
|
});
|
|
9
|
+
const buildEvaluationConfig = (index, optionList) => ({
|
|
10
|
+
name: `expectedOutcomeEvaluation-${index}`,
|
|
11
|
+
fieldType: FormFieldType.SELECT,
|
|
12
|
+
label: 'Evaluation Approach',
|
|
13
|
+
placeholder: 'Select evaluation approach…',
|
|
14
|
+
required: true,
|
|
15
|
+
optionList,
|
|
16
|
+
defaultValue: EvaluationApproach.EXACT,
|
|
17
|
+
});
|
|
18
|
+
const renderEvaluationSelector = (field, index) => {
|
|
19
|
+
const optionList = getAllowedApproachesForFieldType(field.type);
|
|
20
|
+
return (h("app-select", { config: buildEvaluationConfig(index, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
|
|
21
|
+
testCaseId,
|
|
22
|
+
index,
|
|
23
|
+
operation: 'set-evaluation-approach',
|
|
24
|
+
value: e.detail.value,
|
|
25
|
+
}) }));
|
|
26
|
+
};
|
|
7
27
|
return (h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index) => {
|
|
8
28
|
if (field.type === 'textarea') {
|
|
9
29
|
const config = {
|
|
@@ -11,15 +31,15 @@ export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeC
|
|
|
11
31
|
fieldType: FormFieldType.TEXT_AREA,
|
|
12
32
|
label: field.label,
|
|
13
33
|
placeholder: field.placeholder,
|
|
14
|
-
required:
|
|
34
|
+
required: true,
|
|
15
35
|
rows: field.rows || 2,
|
|
16
36
|
};
|
|
17
|
-
return (h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
37
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
18
38
|
testCaseId,
|
|
19
39
|
index,
|
|
20
40
|
operation: 'set-value',
|
|
21
41
|
value: e.detail.value,
|
|
22
|
-
}) }));
|
|
42
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
23
43
|
}
|
|
24
44
|
if (field.type === 'chips-input') {
|
|
25
45
|
const config = {
|
|
@@ -27,9 +47,9 @@ export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeC
|
|
|
27
47
|
fieldType: FormFieldType.CHIPS,
|
|
28
48
|
label: field.label,
|
|
29
49
|
placeholder: field.placeholder,
|
|
30
|
-
required:
|
|
50
|
+
required: true,
|
|
31
51
|
};
|
|
32
|
-
return (h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
52
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
33
53
|
testCaseId,
|
|
34
54
|
index,
|
|
35
55
|
operation: 'add-chip',
|
|
@@ -39,7 +59,7 @@ export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeC
|
|
|
39
59
|
index,
|
|
40
60
|
operation: 'remove-chip',
|
|
41
61
|
value: e.detail.value,
|
|
42
|
-
}) }));
|
|
62
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
43
63
|
}
|
|
44
64
|
if (field.type === 'select') {
|
|
45
65
|
const config = {
|
|
@@ -47,22 +67,22 @@ export const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeC
|
|
|
47
67
|
fieldType: FormFieldType.SELECT,
|
|
48
68
|
label: field.label,
|
|
49
69
|
placeholder: field.placeholder,
|
|
50
|
-
required:
|
|
70
|
+
required: true,
|
|
51
71
|
optionList: field.options,
|
|
52
72
|
};
|
|
53
|
-
return (h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
73
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
54
74
|
testCaseId,
|
|
55
75
|
index,
|
|
56
76
|
operation: 'set-value',
|
|
57
77
|
value: e.detail.value,
|
|
58
|
-
}) }));
|
|
78
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
59
79
|
}
|
|
60
|
-
return (h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
80
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
61
81
|
testCaseId,
|
|
62
82
|
index,
|
|
63
83
|
operation: 'set-value',
|
|
64
84
|
value: e.target.value,
|
|
65
|
-
}) })));
|
|
85
|
+
}) })), renderEvaluationSelector(field, index)));
|
|
66
86
|
})));
|
|
67
87
|
};
|
|
68
88
|
//# sourceMappingURL=expected-outcome-renderer.js.map
|
package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"expected-outcome-renderer.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/expected-outcome-renderer.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAIvD,OAAO,EAAe,aAAa,EAAgC,MAAM,0BAA0B,CAAC;
|
|
1
|
+
{"version":3,"file":"expected-outcome-renderer.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/expected-outcome-renderer.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAIvD,OAAO,EAAe,aAAa,EAAgC,MAAM,0BAA0B,CAAC;AACpG,OAAO,EACL,kBAAkB,GACnB,MAAM,mCAAmC,CAAC;AAC3C,OAAO,EAAE,gCAAgC,EAAE,MAAM,mDAAmD,CAAC;AAerG,MAAM,CAAC,MAAM,uBAAuB,GAAsD,CAAC,EACzF,UAAU,EACV,MAAM,EACN,uBAAuB,GACxB,EAAE,EAAE;IACH,MAAM,IAAI,GAAG,CAAC,MAAmC,EAAE,EAAE,CACnD,uBAAuB,CAAC;QACtB,MAAM;KACqC,CAAC,CAAC;IAEjD,MAAM,qBAAqB,GAAG,CAC5B,KAAa,EACb,UAAoB,EACN,EAAE,CAAC,CAAC;QAClB,IAAI,EAAE,6BAA6B,KAAK,EAAE;QAC1C,SAAS,EAAE,aAAa,CAAC,MAAM;QAC/B,KAAK,EAAE,qBAAqB;QAC5B,WAAW,EAAE,6BAA6B;QAC1C,QAAQ,EAAE,IAAI;QACd,UAAU;QACV,YAAY,EAAE,kBAAkB,CAAC,KAAK;KACvC,CAAC,CAAC;IAEH,MAAM,wBAAwB,GAAG,CAC/B,KAA2B,EAC3B,KAAa,EACb,EAAE;QACF,MAAM,UAAU,GAAG,gCAAgC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEhE,OAAO,CACL,kBACE,MAAM,EAAE,qBAAqB,CAAC,KAAK,EAAE,UAAU,CAAC,EAChD,KAAK,EAAE,KAAK,CAAC,oBAAoB,EAAE,QAAQ,EAC3C,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,IAAI,CAAC;gBACH,UAAU;gBACV,KAAK;gBACL,SAAS,EAAE,yBAAyB;gBACpC,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAA2B;aAC5C,CAAC,GAEJ,CACH,CAAC;IACJ,CAAC,CAAC;IAEF,OAAO,CACL,WAAK,KAAK,EAAC,2BAA2B,IACnC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QACnC,IAAI,KAAK,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;YAC9B,MAAM,MAAM,GAAmB;gBAC7B,IAAI,EAAE,mBAAmB,KAAK,EAAE;gBAChC,SAAS,EAAE,aAAa,CAAC,SAAS;gBAClC,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,WAAW,EAAE,KAAK,CAAC,WAAW;gBAC9B,QAAQ,EAAE,IAAI;gBACd,IAAI,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC;aACtB,CAAC;YACF,OAAO,CACL,WAAK,KAAK,EAAC,kCAAkC;gBAC3C,oBACE,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,KAAK,CAAC,KAAK,EAClB,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,WAAW;wBACtB,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB,CAAC,GAEJ;gBACD,wBAAwB,CAAC,KAAK,EAAE,KAAK,CAAC,CACnC,CACP,CAAC;QACJ,CAAC;QAED,IAAI,KAAK,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;YACjC,MAAM,MAAM,GAAgB;gBAC1B,IAAI,EAAE,mBAAmB,KAAK,EAAE;gBAChC,SAAS,EAAE,aAAa,CAAC,KAAK;gBAC9B,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,WAAW,EAAE,KAAK,CAAC,WAAW;gBAC9B,QAAQ,EAAE,IAAI;aACf,CAAC;YAEF,OAAO,CACL,WAAK,KAAK,EAAC,kCAAkC;gBAC3C,iBACE,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,KAAK,CAAC,KAAK,EAClB,SAAS,EAAE,CAAC,CAAC,EAAE,EAAE,CACf,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,UAAU;wBACrB,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB,CAAC,EAEJ,YAAY,EAAE,CAAC,CAAC,EAAE,EAAE,CAClB,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,aAAa;wBACxB,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB,CAAC,GAEJ;gBACD,wBAAwB,CAAC,KAAK,EAAE,KAAK,CAAC,CACnC,CACP,CAAC;QACJ,CAAC;QAED,IAAI,KAAK,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC5B,MAAM,MAAM,GAAiB;gBAC3B,IAAI,EAAE,mBAAmB,KAAK,EAAE;gBAChC,SAAS,EAAE,aAAa,CAAC,MAAM;gBAC/B,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,WAAW,EAAE,KAAK,CAAC,WAAW;gBAC9B,QAAQ,EAAE,IAAI;gBACd,UAAU,EAAE,KAAK,CAAC,OAAO;aAC1B,CAAC;YAEF,OAAO,CACL,WAAK,KAAK,EAAC,kCAAkC;gBAC3C,kBACE,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,KAAK,CAAC,KAAK,EAClB,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,WAAW;wBACtB,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB,CAAC,GAEJ;gBACD,wBAAwB,CAAC,KAAK,EAAE,KAAK,CAAC,CACnC,CACP,CAAC;QACJ,CAAC;QAED,OAAO,CACL,WAAK,KAAK,EAAC,kCAAkC;YAC3C,WAAK,KAAK,EAAC,iCAAiC;gBAC1C,iBAAQ,KAAK,CAAC,KAAK,CAAS;gBAC5B,aACE,IAAI,EAAC,MAAM,EACX,KAAK,EAAE,KAAK,CAAC,KAAK,EAClB,WAAW,EAAE,KAAK,CAAC,WAAW,EAC9B,OAAO,EAAE,CAAC,CAAC,EAAE,EAAE,CACb,IAAI,CAAC;wBACH,UAAU;wBACV,KAAK;wBACL,SAAS,EAAE,WAAW;wBACtB,KAAK,EAAG,CAAC,CAAC,MAA2B,CAAC,KAAK;qBAC5C,CAAC,GAEJ,CACE;YACL,wBAAwB,CAAC,KAAK,EAAE,KAAK,CAAC,CACnC,CACP,CAAC;IACJ,CAAC,CAAC,CACE,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport {\n ExpectedOutcomeField,\n} from '../../../types/llm-test-runner';\nimport { ChipsConfig, FormFieldType, SelectConfig, TextAreaConfig } from '../../../lib/form/schema';\nimport {\n EvaluationApproach,\n} from '../../../lib/evaluation/constants';\nimport { getAllowedApproachesForFieldType } from '../../../lib/evaluation/field-evaluation-approach';\nimport { ExpectedOutcomeChange } from '../../../lib/test-cases/test-case-mutations';\n\nexport type ExpectedOutcomeChangeDetail = {\n testCaseId: string;\n} & ExpectedOutcomeChange;\n\ninterface ExpectedOutcomeRendererProps {\n testCaseId: string;\n fields: ExpectedOutcomeField[];\n onExpectedOutcomeChange: (\n e: CustomEvent<ExpectedOutcomeChangeDetail>,\n ) => void;\n}\n\nexport const ExpectedOutcomeRenderer: FunctionalComponent<ExpectedOutcomeRendererProps> = ({\n testCaseId,\n fields,\n onExpectedOutcomeChange,\n}) => {\n const emit = (detail: ExpectedOutcomeChangeDetail) =>\n onExpectedOutcomeChange({\n detail,\n } as CustomEvent<ExpectedOutcomeChangeDetail>);\n\n const buildEvaluationConfig = (\n index: number,\n optionList: string[],\n ): SelectConfig => ({\n name: `expectedOutcomeEvaluation-${index}`,\n fieldType: FormFieldType.SELECT,\n label: 'Evaluation Approach',\n placeholder: 'Select evaluation approach…',\n required: true,\n optionList,\n defaultValue: EvaluationApproach.EXACT,\n });\n\n const renderEvaluationSelector = (\n field: ExpectedOutcomeField,\n index: number,\n ) => {\n const optionList = getAllowedApproachesForFieldType(field.type);\n\n return (\n <app-select\n config={buildEvaluationConfig(index, optionList)}\n value={field.evaluationParameters?.approach}\n onValueChange={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'set-evaluation-approach',\n value: e.detail.value as EvaluationApproach,\n })\n }\n />\n );\n };\n\n return (\n <div class=\"expected-outcome-renderer\">\n {(fields || []).map((field, index) => {\n if (field.type === 'textarea') {\n const config: TextAreaConfig = {\n name: `expectedOutcome-${index}`,\n fieldType: FormFieldType.TEXT_AREA,\n label: field.label,\n placeholder: field.placeholder,\n required: true,\n rows: field.rows || 2,\n };\n return (\n <div class=\"expected-outcome-renderer__group\">\n <app-textarea\n config={config}\n value={field.value}\n onValueChange={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'set-value',\n value: e.detail.value,\n })\n }\n />\n {renderEvaluationSelector(field, index)}\n </div>\n );\n }\n\n if (field.type === 'chips-input') {\n const config: ChipsConfig = {\n name: `expectedOutcome-${index}`,\n fieldType: FormFieldType.CHIPS,\n label: field.label,\n placeholder: field.placeholder,\n required: true,\n };\n\n return (\n <div class=\"expected-outcome-renderer__group\">\n <app-chips\n config={config}\n value={field.value}\n onAddChip={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'add-chip',\n value: e.detail.value,\n })\n }\n onRemoveChip={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'remove-chip',\n value: e.detail.value,\n })\n }\n />\n {renderEvaluationSelector(field, index)}\n </div>\n );\n }\n\n if (field.type === 'select') {\n const config: SelectConfig = {\n name: `expectedOutcome-${index}`,\n fieldType: FormFieldType.SELECT,\n label: field.label,\n placeholder: field.placeholder,\n required: true,\n optionList: field.options,\n };\n\n return (\n <div class=\"expected-outcome-renderer__group\">\n <app-select\n config={config}\n value={field.value}\n onValueChange={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'set-value',\n value: e.detail.value,\n })\n }\n />\n {renderEvaluationSelector(field, index)}\n </div>\n );\n }\n\n return (\n <div class=\"expected-outcome-renderer__group\">\n <div class=\"expected-outcome-renderer__text\">\n <label>{field.label}</label>\n <input\n type=\"text\"\n value={field.value}\n placeholder={field.placeholder}\n onInput={(e) =>\n emit({\n testCaseId,\n index,\n operation: 'set-value',\n value: (e.target as HTMLInputElement).value,\n })\n }\n />\n </div>\n {renderEvaluationSelector(field, index)}\n </div>\n );\n })}\n </div>\n );\n};\n"]}
|
|
@@ -18,6 +18,23 @@
|
|
|
18
18
|
border-right: var(--border-width) solid var(--border);
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
+
.expected-outcome-renderer {
|
|
22
|
+
display: flex;
|
|
23
|
+
flex-direction: column;
|
|
24
|
+
gap: var(--spacing-4);
|
|
25
|
+
margin-top: var(--spacing-4);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
.expected-outcome-renderer__group {
|
|
29
|
+
display: flex;
|
|
30
|
+
flex-direction: column;
|
|
31
|
+
gap: var(--spacing-2);
|
|
32
|
+
padding: var(--spacing-3);
|
|
33
|
+
border: var(--border-width) solid var(--border);
|
|
34
|
+
border-radius: var(--radius-md);
|
|
35
|
+
background: var(--background);
|
|
36
|
+
}
|
|
37
|
+
|
|
21
38
|
/* Responsive Design */
|
|
22
39
|
@media (max-width: 1200px) {
|
|
23
40
|
.test-case-row {
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
|
-
import { EvaluationApproach, EvaluationApproachValues, } from "../../../lib/evaluation/constants";
|
|
3
2
|
import { ResponseOutput } from "./output/response-output";
|
|
4
3
|
import { EvaluationSummary } from "./evaluation/evaluation-summary";
|
|
5
4
|
import { RowActions } from "./actions/row-actions";
|
|
6
5
|
import { FormFieldType } from "../../../lib/form/schema";
|
|
7
6
|
import { ExpectedOutcomeRenderer, } from "./expected-outcome-renderer";
|
|
8
|
-
export const LLMTestCaseRow = ({ testCase, onRun, onDelete,
|
|
7
|
+
export const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
9
8
|
const questionConfig = {
|
|
10
9
|
name: 'question',
|
|
11
10
|
fieldType: FormFieldType.TEXT_AREA,
|
|
@@ -15,21 +14,12 @@ export const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, ha
|
|
|
15
14
|
required: true,
|
|
16
15
|
rows: 3,
|
|
17
16
|
};
|
|
18
|
-
const evaluationConfig = {
|
|
19
|
-
name: 'EvaluationApproach',
|
|
20
|
-
fieldType: FormFieldType.SELECT,
|
|
21
|
-
label: 'Evaluation',
|
|
22
|
-
placeholder: 'Select evaluation approach…',
|
|
23
|
-
required: true,
|
|
24
|
-
optionList: EvaluationApproachValues,
|
|
25
|
-
defaultValue: EvaluationApproach.EXACT,
|
|
26
|
-
};
|
|
27
17
|
return (h("div", { class: "test-case-row", key: testCase.id }, h("div", { class: "test-case-row__input-column" }, h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
|
|
28
18
|
detail: {
|
|
29
19
|
testCaseId: testCase.id,
|
|
30
20
|
key: 'question',
|
|
31
21
|
value: e.detail.value,
|
|
32
22
|
},
|
|
33
|
-
}) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })
|
|
23
|
+
}) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
34
24
|
};
|
|
35
25
|
//# sourceMappingURL=llm-test-case-row.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm-test-case-row.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-case-row.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,
|
|
1
|
+
{"version":3,"file":"llm-test-case-row.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-case-row.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,iCAAiC,CAAC;AACpE,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAkB,MAAM,0BAA0B,CAAC;AACzE,OAAO,EAEL,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AAcrC,MAAM,CAAC,MAAM,cAAc,GAA6C,CAAC,EACvE,QAAQ,EACR,KAAK,EACL,QAAQ,EACR,oBAAoB,EACpB,uBAAuB,GACxB,EAAE,EAAE;IACH,MAAM,cAAc,GAAmB;QACrC,IAAI,EAAE,UAAU;QAChB,SAAS,EAAE,aAAa,CAAC,SAAS;QAClC,IAAI,EAAE,MAAM;QACZ,KAAK,EAAE,UAAU;QACjB,WAAW,EAAE,6BAA6B;QAC1C,QAAQ,EAAE,IAAI;QACd,IAAI,EAAE,CAAC;KACR,CAAC;IACF,OAAO,CACL,WAAK,KAAK,EAAC,eAAe,EAAC,GAAG,EAAE,QAAQ,CAAC,EAAE;QACzC,WAAK,KAAK,EAAC,6BAA6B;YACtC,oBACE,MAAM,EAAE,cAAc,EACtB,KAAK,EAAE,QAAQ,CAAC,QAAQ,EACxB,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CACnB,oBAAoB,CAAC;oBACnB,MAAM,EAAE;wBACN,UAAU,EAAE,QAAQ,CAAC,EAAE;wBACvB,GAAG,EAAE,UAAU;wBACf,KAAK,EAAE,CAAC,CAAC,MAAM,CAAC,KAAK;qBACtB;iBACiE,CAAC,GAEvE;YACF,EAAC,uBAAuB,IACtB,UAAU,EAAE,QAAQ,CAAC,EAAE,EACvB,MAAM,EAAE,QAAQ,CAAC,eAAe,IAAI,EAAE,EACtC,uBAAuB,EAAE,uBAAuB,GAChD,CACE;QAEN,EAAC,cAAc,IAAC,MAAM,EAAE,QAAQ,CAAC,MAAM,EAAE,SAAS,EAAE,QAAQ,CAAC,SAAS,GAAI;QAE1E,EAAC,iBAAiB,IAChB,MAAM,EAAE,QAAQ,CAAC,gBAAgB,EACjC,SAAS,EAAE,QAAQ,CAAC,SAAS,GAC7B;QAEF,EAAC,UAAU,IACT,SAAS,EAAE,QAAQ,CAAC,SAAS,EAC7B,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,EAClC,KAAK,EAAE,GAAG,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,EAC5B,QAAQ,EAAE,GAAG,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC,GACrC,CACE,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport { TestCase } from '../../../types/llm-test-runner';\nimport { ResponseOutput } from './output/response-output';\nimport { EvaluationSummary } from './evaluation/evaluation-summary';\nimport { RowActions } from './actions/row-actions';\nimport { FormFieldType, TextAreaConfig } from '../../../lib/form/schema';\nimport {\n ExpectedOutcomeChangeDetail,\n ExpectedOutcomeRenderer,\n} from './expected-outcome-renderer';\n\nexport interface LLMTestCaseRowProps {\n testCase: TestCase;\n onRun: (testCase: TestCase) => void;\n onDelete: (id: string) => void;\n handleTestCaseChange: (\n e: CustomEvent<{ testCaseId: string; key: string; value: string }>,\n ) => void;\n onExpectedOutcomeChange: (\n e: CustomEvent<ExpectedOutcomeChangeDetail>,\n ) => void;\n}\n\nexport const LLMTestCaseRow: FunctionalComponent<LLMTestCaseRowProps> = ({\n testCase,\n onRun,\n onDelete,\n handleTestCaseChange,\n onExpectedOutcomeChange,\n}) => {\n const questionConfig: TextAreaConfig = {\n name: 'question',\n fieldType: FormFieldType.TEXT_AREA,\n type: 'text',\n label: 'Question',\n placeholder: 'Enter your question here...',\n required: true,\n rows: 3,\n };\n return (\n <div class=\"test-case-row\" key={testCase.id}>\n <div class=\"test-case-row__input-column\">\n <app-textarea\n config={questionConfig}\n value={testCase.question}\n onValueChange={(e) =>\n handleTestCaseChange({\n detail: {\n testCaseId: testCase.id,\n key: 'question',\n value: e.detail.value,\n },\n } as CustomEvent<{ testCaseId: string; key: string; value: string }>)\n }\n />\n <ExpectedOutcomeRenderer\n testCaseId={testCase.id}\n fields={testCase.expectedOutcome || []}\n onExpectedOutcomeChange={onExpectedOutcomeChange}\n />\n </div>\n\n <ResponseOutput output={testCase.output} isRunning={testCase.isRunning} />\n\n <EvaluationSummary\n result={testCase.evaluationResult}\n isRunning={testCase.isRunning}\n />\n\n <RowActions\n isRunning={testCase.isRunning}\n canRun={!!testCase.question.trim()}\n onRun={() => onRun(testCase)}\n onDelete={() => onDelete(testCase.id)}\n />\n </div>\n );\n};\n"]}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { h } from "@stencil/core";
|
|
2
2
|
import { LLMTestCaseRow } from "./llm-test-case-row";
|
|
3
3
|
import { Button } from "../../../lib/ui/button/index";
|
|
4
|
-
export const LLMTestCases = ({ testCases, onRun, onDelete,
|
|
5
|
-
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete,
|
|
4
|
+
export const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
5
|
+
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
6
6
|
};
|
|
7
7
|
//# sourceMappingURL=llm-test-cases.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm-test-cases.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-cases.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;
|
|
1
|
+
{"version":3,"file":"llm-test-cases.js","sourceRoot":"","sources":["../../../../src/components/llm-test-runner/test-cases/llm-test-cases.tsx"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAuB,MAAM,eAAe,CAAC;AAEvD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,MAAM,EAAE,MAAM,8BAA8B,CAAC;AAgBtD,MAAM,CAAC,MAAM,YAAY,GAA2C,CAAC,EACnE,SAAS,EACT,KAAK,EACL,QAAQ,EACR,aAAa,EACb,oBAAoB,EACpB,uBAAuB,GACxB,EAAE,EAAE;IACH,OAAO,CACL,WAAK,KAAK,EAAC,YAAY;QACrB,WAAK,KAAK,EAAC,4BAA4B;YACrC,WAAK,KAAK,EAAC,2BAA2B,YAAY;YAClD,WAAK,KAAK,EAAC,2BAA2B,aAAa;YACnD,WAAK,KAAK,EAAC,2BAA2B,iBAAiB;YACvD,WAAK,KAAK,EAAC,2BAA2B,cAAc,CAChD;QAEL,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,CACzB,EAAC,cAAc,IACb,QAAQ,EAAE,QAAQ,EAClB,KAAK,EAAE,KAAK,EACZ,QAAQ,EAAE,QAAQ,EAClB,oBAAoB,EAAE,oBAAoB,EAC1C,uBAAuB,EAAE,uBAAuB,GAChD,CACH,CAAC;QAEF,WAAK,KAAK,EAAC,yBAAyB;YAClC,EAAC,MAAM,IAAC,OAAO,EAAC,SAAS,EAAC,IAAI,EAAC,IAAI,EAAC,OAAO,EAAE,aAAa,qBAEjD,CACL,CACF,CACP,CAAC;AACJ,CAAC,CAAC","sourcesContent":["import { h, FunctionalComponent } from '@stencil/core';\nimport { TestCase } from '../../../types/llm-test-runner';\nimport { LLMTestCaseRow } from './llm-test-case-row';\nimport { Button } from '../../../lib/ui/button/index';\nimport { ExpectedOutcomeChangeDetail } from './expected-outcome-renderer';\n\nexport interface LLMTestCasesProps {\n testCases: TestCase[];\n onRun: (testCase: TestCase) => void;\n onDelete: (id: string) => void;\n onAddTestCase: () => void;\n handleTestCaseChange: (\n e: CustomEvent<{ testCaseId: string; key: string; value: string }>,\n ) => void;\n onExpectedOutcomeChange: (\n e: CustomEvent<ExpectedOutcomeChangeDetail>,\n ) => void;\n}\n\nexport const LLMTestCases: FunctionalComponent<LLMTestCasesProps> = ({\n testCases,\n onRun,\n onDelete,\n onAddTestCase,\n handleTestCaseChange,\n onExpectedOutcomeChange,\n}) => {\n return (\n <div class=\"test-cases\">\n <div class=\"test-cases__column-headers\">\n <div class=\"test-cases__column-header\">Input</div>\n <div class=\"test-cases__column-header\">Output</div>\n <div class=\"test-cases__column-header\">Evaluation</div>\n <div class=\"test-cases__column-header\">Actions</div>\n </div>\n\n {testCases.map(testCase => (\n <LLMTestCaseRow\n testCase={testCase}\n onRun={onRun}\n onDelete={onDelete}\n handleTestCaseChange={handleTestCaseChange}\n onExpectedOutcomeChange={onExpectedOutcomeChange}\n />\n ))}\n\n <div class=\"test-cases__add-section\">\n <Button variant=\"outline\" size=\"md\" onClick={onAddTestCase}>\n + Add Question\n </Button>\n </div>\n </div>\n );\n};\n"]}
|
|
@@ -6,56 +6,77 @@ import { performRougeLEvaluation } from "./evaluators/rougeL-evaluator";
|
|
|
6
6
|
import { performBleuEvaluation } from "./evaluators/bleu/bleu-evaluator";
|
|
7
7
|
export class LLMEvaluationEngine {
|
|
8
8
|
async evaluateResponse(request, callback) {
|
|
9
|
-
|
|
10
|
-
const
|
|
11
|
-
switch (approach) {
|
|
12
|
-
case EvaluationApproach.BLEU: {
|
|
13
|
-
const bleuResult = performBleuEvaluation(request);
|
|
14
|
-
callback(bleuResult);
|
|
15
|
-
break;
|
|
16
|
-
}
|
|
17
|
-
case EvaluationApproach.EXACT: {
|
|
18
|
-
const exactResult = await performEvaluation(request);
|
|
19
|
-
callback(exactResult);
|
|
20
|
-
break;
|
|
21
|
-
}
|
|
22
|
-
case EvaluationApproach.ROUGE_1: {
|
|
23
|
-
const rougeResult = await performRouge1Evaluation(request);
|
|
24
|
-
callback(rougeResult);
|
|
25
|
-
break;
|
|
26
|
-
}
|
|
27
|
-
case EvaluationApproach.ROUGE_L: {
|
|
28
|
-
const rougeLResult = await performRougeLEvaluation(request);
|
|
29
|
-
callback(rougeLResult);
|
|
30
|
-
break;
|
|
31
|
-
}
|
|
32
|
-
case EvaluationApproach.SEMANTIC: {
|
|
33
|
-
const semanticResult = await performSemanticEvaluation(request);
|
|
34
|
-
callback(semanticResult);
|
|
35
|
-
break;
|
|
36
|
-
}
|
|
37
|
-
default: {
|
|
38
|
-
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
39
|
-
const fallbackResult = await performEvaluation(request);
|
|
40
|
-
callback(fallbackResult);
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
}
|
|
44
|
-
catch (error) {
|
|
45
|
-
console.error('Evaluation failed:', error);
|
|
46
|
-
const errorResult = {
|
|
9
|
+
const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
|
|
10
|
+
const fieldRequest = {
|
|
47
11
|
testCaseId: request.testCaseId,
|
|
12
|
+
question: request.question,
|
|
13
|
+
actualResponse: request.actualResponse,
|
|
14
|
+
expectedOutcome: field.expectedValue,
|
|
15
|
+
evaluationParameters: field.evaluationParameters,
|
|
16
|
+
};
|
|
17
|
+
const result = await this.evaluateField(fieldRequest);
|
|
18
|
+
const fieldResult = {
|
|
19
|
+
index: field.index,
|
|
20
|
+
label: field.label,
|
|
21
|
+
type: field.type,
|
|
22
|
+
expectedValue: field.expectedValue,
|
|
23
|
+
passed: result.passed,
|
|
24
|
+
keywordMatches: result.keywordMatches,
|
|
25
|
+
evaluationParameters: result.evaluationParameters,
|
|
26
|
+
evaluationApproachResult: result.evaluationApproachResult,
|
|
27
|
+
};
|
|
28
|
+
return fieldResult;
|
|
29
|
+
}));
|
|
30
|
+
const fieldResults = settledResults.map((settledResult, index) => {
|
|
31
|
+
const field = request.fields[index];
|
|
32
|
+
if (settledResult.status === 'fulfilled') {
|
|
33
|
+
return settledResult.value;
|
|
34
|
+
}
|
|
35
|
+
return {
|
|
36
|
+
index: field.index,
|
|
37
|
+
label: field.label,
|
|
38
|
+
type: field.type,
|
|
39
|
+
expectedValue: field.expectedValue,
|
|
48
40
|
passed: false,
|
|
49
41
|
keywordMatches: [],
|
|
50
|
-
|
|
51
|
-
evaluationParameters: request.evaluationParameters,
|
|
42
|
+
evaluationParameters: field.evaluationParameters,
|
|
52
43
|
evaluationApproachResult: {
|
|
53
44
|
score: 0,
|
|
54
|
-
approachUsed:
|
|
45
|
+
approachUsed: field.evaluationParameters.approach,
|
|
55
46
|
},
|
|
47
|
+
error: this.getSafeErrorMessage(settledResult.reason),
|
|
56
48
|
};
|
|
57
|
-
|
|
49
|
+
});
|
|
50
|
+
const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
|
|
51
|
+
const passed = fieldResults.every(field => field.passed && !field.error);
|
|
52
|
+
callback({
|
|
53
|
+
testCaseId: request.testCaseId,
|
|
54
|
+
passed,
|
|
55
|
+
keywordMatches,
|
|
56
|
+
fieldResults,
|
|
57
|
+
timestamp: new Date().toISOString(),
|
|
58
|
+
});
|
|
59
|
+
}
|
|
60
|
+
async evaluateField(request) {
|
|
61
|
+
const approach = request.evaluationParameters.approach;
|
|
62
|
+
switch (approach) {
|
|
63
|
+
case EvaluationApproach.BLEU:
|
|
64
|
+
return performBleuEvaluation(request);
|
|
65
|
+
case EvaluationApproach.EXACT:
|
|
66
|
+
return performEvaluation(request);
|
|
67
|
+
case EvaluationApproach.ROUGE_1:
|
|
68
|
+
return performRouge1Evaluation(request);
|
|
69
|
+
case EvaluationApproach.ROUGE_L:
|
|
70
|
+
return performRougeLEvaluation(request);
|
|
71
|
+
case EvaluationApproach.SEMANTIC:
|
|
72
|
+
return performSemanticEvaluation(request);
|
|
73
|
+
default:
|
|
74
|
+
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
75
|
+
return performEvaluation(request);
|
|
58
76
|
}
|
|
59
77
|
}
|
|
78
|
+
getSafeErrorMessage(error) {
|
|
79
|
+
return error instanceof Error ? error.message : 'Field evaluation failed.';
|
|
80
|
+
}
|
|
60
81
|
}
|
|
61
82
|
//# sourceMappingURL=evaluation-engine.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluation-engine.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-engine.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"evaluation-engine.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-engine.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,yBAAyB,EAAE,MAAM,6BAA6B,CAAC;AACxE,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AAEzE,MAAM,OAAO,mBAAmB;IAC9B,KAAK,CAAC,gBAAgB,CACpB,OAA4B,EAC5B,QAA4B;QAE5B,MAAM,cAAc,GAAG,MAAM,OAAO,CAAC,UAAU,CAC7C,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,EAAC,KAAK,EAAC,EAAE;YAC/B,MAAM,YAAY,GAAsB;gBACtC,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,QAAQ,EAAE,OAAO,CAAC,QAAQ;gBAC1B,cAAc,EAAE,OAAO,CAAC,cAAc;gBACtC,eAAe,EAAE,KAAK,CAAC,aAAa;gBACpC,oBAAoB,EAAE,KAAK,CAAC,oBAAoB;aACjD,CAAC;YACF,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,YAAY,CAAC,CAAC;YAEtD,MAAM,WAAW,GAA0B;gBACzC,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,aAAa,EAAE,KAAK,CAAC,aAAa;gBAClC,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,cAAc,EAAE,MAAM,CAAC,cAAc;gBACrC,oBAAoB,EAAE,MAAM,CAAC,oBAAqB;gBAClD,wBAAwB,EAAE,MAAM,CAAC,wBAAwB;aAC1D,CAAC;YACF,OAAO,WAAW,CAAC;QACrB,CAAC,CAAC,CACH,CAAC;QAEF,MAAM,YAAY,GAA4B,cAAc,CAAC,GAAG,CAC9D,CAAC,aAAa,EAAE,KAAK,EAAE,EAAE;YACvB,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACpC,IAAI,aAAa,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;gBACzC,OAAO,aAAa,CAAC,KAAK,CAAC;YAC7B,CAAC;YAED,OAAO;gBACL,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,KAAK,EAAE,KAAK,CAAC,KAAK;gBAClB,IAAI,EAAE,KAAK,CAAC,IAAI;gBAChB,aAAa,EAAE,KAAK,CAAC,aAAa;gBAClC,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,oBAAoB,EAAE,KAAK,CAAC,oBAAoB;gBAChD,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,KAAK,CAAC,oBAAoB,CAAC,QAAQ;iBAClD;gBACD,KAAK,EAAE,IAAI,CAAC,mBAAmB,CAAC,aAAa,CAAC,MAAM,CAAC;aACtD,CAAC;QACJ,CAAC,CACF,CAAC;QAEF,MAAM,cAAc,GAAG,YAAY,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC3E,MAAM,MAAM,GAAG,YAAY,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QAEzE,QAAQ,CAAC;YACP,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,MAAM;YACN,cAAc;YACd,YAAY;YACZ,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAC,CAAC;IACL,CAAC;IAEO,KAAK,CAAC,aAAa,CAAC,OAA0B;QACpD,MAAM,QAAQ,GAAuB,OAAO,CAAC,oBAAoB,CAAC,QAAQ,CAAC;QAC3E,QAAQ,QAAQ,EAAE,CAAC;YACjB,KAAK,kBAAkB,CAAC,IAAI;gBAC1B,OAAO,qBAAqB,CAAC,OAAO,CAAC,CAAC;YACxC,KAAK,kBAAkB,CAAC,KAAK;gBAC3B,OAAO,iBAAiB,CAAC,OAAO,CAAC,CAAC;YACpC,KAAK,kBAAkB,CAAC,OAAO;gBAC7B,OAAO,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAC1C,KAAK,kBAAkB,CAAC,OAAO;gBAC7B,OAAO,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAC1C,KAAK,kBAAkB,CAAC,QAAQ;gBAC9B,OAAO,yBAAyB,CAAC,OAAO,CAAC,CAAC;YAC5C;gBACE,OAAO,CAAC,IAAI,CACV,8BAA8B,OAAO,CAAC,oBAAoB,CAAC,QAAQ,kCAAkC,CACtG,CAAC;gBACF,OAAO,iBAAiB,CAAC,OAAO,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAEO,mBAAmB,CAAC,KAAc;QACxC,OAAO,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,0BAA0B,CAAC;IAC7E,CAAC;CACF","sourcesContent":["import {\n EvaluationRequest,\n EvaluationResult,\n EvaluationCallback,\n FieldEvaluationResult,\n EvaluationRequestV2,\n} from './types';\nimport { performEvaluation } from './evaluators/exact/exact';\nimport { EvaluationApproach } from './constants';\nimport { performRouge1Evaluation } from './evaluators/rouge1-evaluator';\nimport { performSemanticEvaluation } from './evaluators/semantic/index';\nimport { performRougeLEvaluation } from './evaluators/rougeL-evaluator';\nimport { performBleuEvaluation } from './evaluators/bleu/bleu-evaluator';\n\nexport class LLMEvaluationEngine {\n async evaluateResponse(\n request: EvaluationRequestV2,\n callback: EvaluationCallback,\n ): Promise<void> {\n const settledResults = await Promise.allSettled(\n request.fields.map(async field => {\n const fieldRequest: EvaluationRequest = {\n testCaseId: request.testCaseId,\n question: request.question,\n actualResponse: request.actualResponse,\n expectedOutcome: field.expectedValue,\n evaluationParameters: field.evaluationParameters,\n };\n const result = await this.evaluateField(fieldRequest);\n\n const fieldResult: FieldEvaluationResult = {\n index: field.index,\n label: field.label,\n type: field.type,\n expectedValue: field.expectedValue,\n passed: result.passed,\n keywordMatches: result.keywordMatches,\n evaluationParameters: result.evaluationParameters!,\n evaluationApproachResult: result.evaluationApproachResult,\n };\n return fieldResult;\n }),\n );\n\n const fieldResults: FieldEvaluationResult[] = settledResults.map(\n (settledResult, index) => {\n const field = request.fields[index];\n if (settledResult.status === 'fulfilled') {\n return settledResult.value;\n }\n\n return {\n index: field.index,\n label: field.label,\n type: field.type,\n expectedValue: field.expectedValue,\n passed: false,\n keywordMatches: [],\n evaluationParameters: field.evaluationParameters,\n evaluationApproachResult: {\n score: 0,\n approachUsed: field.evaluationParameters.approach,\n },\n error: this.getSafeErrorMessage(settledResult.reason),\n };\n },\n );\n\n const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);\n const passed = fieldResults.every(field => field.passed && !field.error);\n\n callback({\n testCaseId: request.testCaseId,\n passed,\n keywordMatches,\n fieldResults,\n timestamp: new Date().toISOString(),\n });\n }\n\n private async evaluateField(request: EvaluationRequest): Promise<EvaluationResult> {\n const approach: EvaluationApproach = request.evaluationParameters.approach;\n switch (approach) {\n case EvaluationApproach.BLEU:\n return performBleuEvaluation(request);\n case EvaluationApproach.EXACT:\n return performEvaluation(request);\n case EvaluationApproach.ROUGE_1:\n return performRouge1Evaluation(request);\n case EvaluationApproach.ROUGE_L:\n return performRougeLEvaluation(request);\n case EvaluationApproach.SEMANTIC:\n return performSemanticEvaluation(request);\n default:\n console.warn(\n `Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`,\n );\n return performEvaluation(request);\n }\n }\n\n private getSafeErrorMessage(error: unknown): string {\n return error instanceof Error ? error.message : 'Field evaluation failed.';\n }\n}\n"]}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { LLMEvaluationEngine } from "./evaluation-engine";
|
|
2
|
-
import {
|
|
2
|
+
import { normalizeEvaluationParametersForField } from "./field-evaluation-approach";
|
|
3
3
|
/**
|
|
4
4
|
* Service for evaluating test case responses
|
|
5
5
|
*/
|
|
@@ -18,12 +18,18 @@ export class EvaluationService {
|
|
|
18
18
|
console.warn('⚠️ No output to evaluate for test case:', testCase.id);
|
|
19
19
|
return;
|
|
20
20
|
}
|
|
21
|
+
const fields = (testCase.expectedOutcome || []).map((field, index) => ({
|
|
22
|
+
index,
|
|
23
|
+
label: field.label,
|
|
24
|
+
type: field.type,
|
|
25
|
+
expectedValue: getFieldExpectedValue(field),
|
|
26
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
27
|
+
}));
|
|
21
28
|
const evaluationRequest = {
|
|
22
29
|
testCaseId: testCase.id,
|
|
23
30
|
question: testCase.question,
|
|
24
|
-
expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
|
|
25
31
|
actualResponse: testCase.output,
|
|
26
|
-
|
|
32
|
+
fields,
|
|
27
33
|
};
|
|
28
34
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
29
35
|
console.log('📊 Evaluation result received:', result);
|
|
@@ -31,4 +37,10 @@ export class EvaluationService {
|
|
|
31
37
|
});
|
|
32
38
|
}
|
|
33
39
|
}
|
|
40
|
+
function getFieldExpectedValue(field) {
|
|
41
|
+
if (field.type === 'chips-input') {
|
|
42
|
+
return field.value.join(', ');
|
|
43
|
+
}
|
|
44
|
+
return field.value;
|
|
45
|
+
}
|
|
34
46
|
//# sourceMappingURL=evaluation-service.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluation-service.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;
|
|
1
|
+
{"version":3,"file":"evaluation-service.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAO1D,OAAO,EAAE,qCAAqC,EAAE,MAAM,6BAA6B,CAAC;AAEpF;;GAEG;AACH,MAAM,OAAO,iBAAiB;IACpB,MAAM,CAAsB;IAEpC;QACE,IAAI,CAAC,MAAM,GAAG,IAAI,mBAAmB,EAAE,CAAC;IAC1C,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,gBAAgB,CACpB,QAAkB,EAClB,QAA4C;QAE5C,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC;YACrB,OAAO,CAAC,IAAI,CAAC,yCAAyC,EAAE,QAAQ,CAAC,EAAE,CAAC,CAAC;YACrE,OAAO;QACT,CAAC;QAED,MAAM,MAAM,GAA2B,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,GAAG,CACzE,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE,CAAC,CAAC;YACjB,KAAK;YACL,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,IAAI,EAAE,KAAK,CAAC,IAAI;YAChB,aAAa,EAAE,qBAAqB,CAAC,KAAK,CAAC;YAC3C,oBAAoB,EAAE,qCAAqC,CACzD,KAAK,CAAC,IAAI,EACV,KAAK,CAAC,oBAAoB,CAC3B;SACF,CAAC,CACH,CAAC;QAEF,MAAM,iBAAiB,GAAwB;YAC7C,UAAU,EAAE,QAAQ,CAAC,EAAE;YACvB,QAAQ,EAAE,QAAQ,CAAC,QAAQ;YAC3B,cAAc,EAAE,QAAQ,CAAC,MAAM;YAC/B,MAAM;SACP,CAAC;QAEF,MAAM,IAAI,CAAC,MAAM,CAAC,gBAAgB,CAChC,iBAAiB,EACjB,CAAC,MAAwB,EAAE,EAAE;YAC3B,OAAO,CAAC,GAAG,CAAC,gCAAgC,EAAE,MAAM,CAAC,CAAC;YACtD,QAAQ,CAAC,MAAM,CAAC,CAAC;QACnB,CAAC,CACF,CAAC;IACJ,CAAC;CACF;AAED,SAAS,qBAAqB,CAAC,KAA2B;IACxD,IAAI,KAAK,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;QACjC,OAAO,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAChC,CAAC;IACD,OAAO,KAAK,CAAC,KAAK,CAAC;AACrB,CAAC","sourcesContent":["import { LLMEvaluationEngine } from './evaluation-engine';\nimport {\n EvaluationResult,\n FieldEvaluationInput,\n EvaluationRequestV2,\n} from './types';\nimport { TestCase, ExpectedOutcomeField } from '../../types/llm-test-runner';\nimport { normalizeEvaluationParametersForField } from './field-evaluation-approach';\n\n/**\n * Service for evaluating test case responses\n */\nexport class EvaluationService {\n private engine: LLMEvaluationEngine;\n\n constructor() {\n this.engine = new LLMEvaluationEngine();\n }\n\n /**\n * Evaluates a test case response\n * @param testCase - The test case to evaluate\n * @param onResult - Callback to handle the evaluation result\n */\n async evaluateTestCase(\n testCase: TestCase,\n onResult: (result: EvaluationResult) => void,\n ): Promise<void> {\n if (!testCase.output) {\n console.warn('⚠️ No output to evaluate for test case:', testCase.id);\n return;\n }\n\n const fields: FieldEvaluationInput[] = (testCase.expectedOutcome || []).map(\n (field, index) => ({\n index,\n label: field.label,\n type: field.type,\n expectedValue: getFieldExpectedValue(field),\n evaluationParameters: normalizeEvaluationParametersForField(\n field.type,\n field.evaluationParameters,\n ),\n }),\n );\n\n const evaluationRequest: EvaluationRequestV2 = {\n testCaseId: testCase.id,\n question: testCase.question,\n actualResponse: testCase.output,\n fields,\n };\n\n await this.engine.evaluateResponse(\n evaluationRequest,\n (result: EvaluationResult) => {\n console.log('📊 Evaluation result received:', result);\n onResult(result);\n },\n );\n }\n}\n\nfunction getFieldExpectedValue(field: ExpectedOutcomeField): string {\n if (field.type === 'chips-input') {\n return field.value.join(', ');\n }\n return field.value;\n}\n"]}
|