llm-testrunner-components 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +165 -242
- package/dist/cjs/index.cjs.js +298 -232
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +6 -49
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js +0 -4
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-results-csv.js +47 -33
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +20 -2
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/test-case.js +2 -20
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/collection/types/test-case.js.map +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-Bb89MYYu.js +7 -0
- package/dist/components/p-Bb89MYYu.js.map +1 -0
- package/dist/esm/index.js +298 -232
- package/dist/esm/index.js.map +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +0 -1
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
- package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
- package/dist/types/lib/evaluation/index.d.ts +0 -1
- package/dist/types/lib/evaluation/types.d.ts +26 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
- package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
- package/dist/types/schemas/expected-outcome.d.ts +65 -17
- package/dist/types/schemas/test-case.d.ts +51 -95
- package/dist/types/types/llm-test-runner.d.ts +1 -1
- package/dist/types/types/test-case.d.ts +1 -1
- package/package.json +9 -2
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
- package/dist/components/p-BF90yb1z.js +0 -7
- package/dist/components/p-BF90yb1z.js.map +0 -1
- /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
package/dist/esm/index.js
CHANGED
|
@@ -61,20 +61,6 @@ class RateLimitedFetcher {
|
|
|
61
61
|
}
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
-
var EvaluationApproach;
|
|
65
|
-
(function (EvaluationApproach) {
|
|
66
|
-
EvaluationApproach["EXACT"] = "exact";
|
|
67
|
-
EvaluationApproach["SEMANTIC"] = "semantic";
|
|
68
|
-
EvaluationApproach["ROUGE_1"] = "rouge-1";
|
|
69
|
-
EvaluationApproach["ROUGE_L"] = "rouge-L";
|
|
70
|
-
EvaluationApproach["BLEU"] = "bleu";
|
|
71
|
-
})(EvaluationApproach || (EvaluationApproach = {}));
|
|
72
|
-
// Array of all evaluation approach values for UI components
|
|
73
|
-
const EvaluationApproachValues = Object.values(EvaluationApproach);
|
|
74
|
-
const DEFAULT_ROUGE_PASS_SCORE = 0.7;
|
|
75
|
-
const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
|
|
76
|
-
const DEFAULT_BLEU_PASS_SCORE = 0.7;
|
|
77
|
-
|
|
78
64
|
/**
|
|
79
65
|
* Reads a file asynchronously and returns its content as a string
|
|
80
66
|
* @param file - The File object to read
|
|
@@ -117,23 +103,10 @@ function formatTestSuiteAsJson(testCases) {
|
|
|
117
103
|
id: testCase.id,
|
|
118
104
|
question: testCase.question,
|
|
119
105
|
expectedOutcome: testCase.expectedOutcome,
|
|
120
|
-
evaluationParameters: testCase.evaluationParameters,
|
|
121
106
|
}));
|
|
122
107
|
return JSON.stringify(exportData, null, 2);
|
|
123
108
|
}
|
|
124
109
|
|
|
125
|
-
function serializeExpectedOutcome(expectedOutcome, joinWith = '\n') {
|
|
126
|
-
return (expectedOutcome || [])
|
|
127
|
-
.map(field => {
|
|
128
|
-
if (field.type === 'chips-input') {
|
|
129
|
-
return field.value.join(', ');
|
|
130
|
-
}
|
|
131
|
-
return field.value;
|
|
132
|
-
})
|
|
133
|
-
.join(joinWith)
|
|
134
|
-
.trim();
|
|
135
|
-
}
|
|
136
|
-
|
|
137
110
|
/**
|
|
138
111
|
* Escapes a CSV field by wrapping it in quotes if it contains special characters
|
|
139
112
|
* @param field - The field to escape
|
|
@@ -152,48 +125,63 @@ function escapeCsvField(field) {
|
|
|
152
125
|
*/
|
|
153
126
|
function exportTestResultsToCsv(testCases) {
|
|
154
127
|
const csvRows = [];
|
|
128
|
+
const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
|
|
155
129
|
// Add header row
|
|
156
130
|
const headers = [
|
|
157
131
|
'Question',
|
|
158
|
-
'Expected Keywords',
|
|
159
|
-
'Generated Keywords',
|
|
160
|
-
'Keywords Match',
|
|
161
132
|
'Response Time (s)',
|
|
162
|
-
'Evaluation Approach',
|
|
163
|
-
'Evaluation Score',
|
|
164
133
|
];
|
|
134
|
+
for (let i = 1; i <= maxFieldCount; i++) {
|
|
135
|
+
headers.push('Field Name');
|
|
136
|
+
headers.push('Expected Keywords');
|
|
137
|
+
headers.push('Generated Keywords');
|
|
138
|
+
headers.push('Evaluation Strategy');
|
|
139
|
+
headers.push('Passed Evaluation');
|
|
140
|
+
headers.push('Keyword Match');
|
|
141
|
+
headers.push('Score');
|
|
142
|
+
if (i < maxFieldCount) {
|
|
143
|
+
headers.push('');
|
|
144
|
+
}
|
|
145
|
+
}
|
|
165
146
|
csvRows.push(headers.join(','));
|
|
166
|
-
// Add data rows
|
|
147
|
+
// Add data rows (one row per test case)
|
|
167
148
|
testCases.forEach(testCase => {
|
|
168
|
-
const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
|
|
169
|
-
const evaluationApproach = testCase.evaluationParameters?.approach || '';
|
|
170
|
-
const score = testCase.evaluationResult?.evaluationApproachResult?.score;
|
|
171
|
-
const evaluationScore = score !== undefined ? score.toString() : '';
|
|
172
|
-
let generatedKeywords = '';
|
|
173
|
-
let keywordsMatch = '';
|
|
174
|
-
if (testCase.evaluationResult) {
|
|
175
|
-
const foundKeywords = testCase.evaluationResult.keywordMatches
|
|
176
|
-
.filter(match => match.found)
|
|
177
|
-
.map(match => match.keyword);
|
|
178
|
-
generatedKeywords = foundKeywords.join('; ');
|
|
179
|
-
// Calculate match percentages
|
|
180
|
-
const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
|
|
181
|
-
const totalKeywords = testCase.evaluationResult.keywordMatches.length;
|
|
182
|
-
keywordsMatch =
|
|
183
|
-
totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
|
|
184
|
-
}
|
|
185
149
|
const responseTime = testCase.responseTime
|
|
186
150
|
? (testCase.responseTime / 1000).toFixed(3)
|
|
187
151
|
: 'N/A';
|
|
188
|
-
const row = [
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
152
|
+
const row = [escapeCsvField(testCase.question), responseTime];
|
|
153
|
+
for (let i = 0; i < maxFieldCount; i++) {
|
|
154
|
+
const field = testCase.expectedOutcome?.[i];
|
|
155
|
+
const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
|
|
156
|
+
const expectedKeywords = fieldResult?.expectedValue ??
|
|
157
|
+
(field
|
|
158
|
+
? field.type === 'chips-input'
|
|
159
|
+
? field.value.join(', ')
|
|
160
|
+
: field.value
|
|
161
|
+
: '');
|
|
162
|
+
const generatedKeywords = (fieldResult?.keywordMatches || [])
|
|
163
|
+
.filter(match => match.found)
|
|
164
|
+
.map(match => match.keyword)
|
|
165
|
+
.join('; ');
|
|
166
|
+
const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
|
|
167
|
+
const totalMatches = fieldResult?.keywordMatches?.length || 0;
|
|
168
|
+
const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
|
|
169
|
+
const score = fieldResult?.evaluationApproachResult?.score !== undefined
|
|
170
|
+
? fieldResult.evaluationApproachResult.score.toFixed(2)
|
|
171
|
+
: '';
|
|
172
|
+
row.push(escapeCsvField(field?.label || ''));
|
|
173
|
+
row.push(escapeCsvField(expectedKeywords || ''));
|
|
174
|
+
row.push(escapeCsvField(generatedKeywords));
|
|
175
|
+
row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
|
|
176
|
+
field?.evaluationParameters?.approach ||
|
|
177
|
+
''));
|
|
178
|
+
row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
|
|
179
|
+
row.push(keywordMatch);
|
|
180
|
+
row.push(score);
|
|
181
|
+
if (i < maxFieldCount - 1) {
|
|
182
|
+
row.push('');
|
|
183
|
+
}
|
|
184
|
+
}
|
|
197
185
|
csvRows.push(row.join(','));
|
|
198
186
|
});
|
|
199
187
|
return csvRows.join('\n');
|
|
@@ -252,6 +240,43 @@ function v4(options, buf, offset) {
|
|
|
252
240
|
return unsafeStringify(rnds);
|
|
253
241
|
}
|
|
254
242
|
|
|
243
|
+
var EvaluationApproach;
|
|
244
|
+
(function (EvaluationApproach) {
|
|
245
|
+
EvaluationApproach["EXACT"] = "exact";
|
|
246
|
+
EvaluationApproach["SEMANTIC"] = "semantic";
|
|
247
|
+
EvaluationApproach["ROUGE_1"] = "rouge-1";
|
|
248
|
+
EvaluationApproach["ROUGE_L"] = "rouge-L";
|
|
249
|
+
EvaluationApproach["BLEU"] = "bleu";
|
|
250
|
+
})(EvaluationApproach || (EvaluationApproach = {}));
|
|
251
|
+
// Array of all evaluation approach values for UI components
|
|
252
|
+
const EvaluationApproachValues = Object.values(EvaluationApproach);
|
|
253
|
+
const DEFAULT_ROUGE_PASS_SCORE = 0.7;
|
|
254
|
+
const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
|
|
255
|
+
const DEFAULT_BLEU_PASS_SCORE = 0.7;
|
|
256
|
+
|
|
257
|
+
const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
|
|
258
|
+
function getAllowedApproachesForFieldType(fieldType) {
|
|
259
|
+
if (fieldType === 'select') {
|
|
260
|
+
return SELECT_ONLY_APPROACHES;
|
|
261
|
+
}
|
|
262
|
+
return EvaluationApproachValues;
|
|
263
|
+
}
|
|
264
|
+
function isApproachAllowedForFieldType(fieldType, approach) {
|
|
265
|
+
return getAllowedApproachesForFieldType(fieldType).includes(approach);
|
|
266
|
+
}
|
|
267
|
+
function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
|
|
268
|
+
const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
|
|
269
|
+
const fallbackApproach = allowedApproaches[0];
|
|
270
|
+
const rawApproach = evaluationParameters?.approach;
|
|
271
|
+
const approach = rawApproach && allowedApproaches.includes(rawApproach)
|
|
272
|
+
? rawApproach
|
|
273
|
+
: fallbackApproach;
|
|
274
|
+
return {
|
|
275
|
+
...evaluationParameters,
|
|
276
|
+
approach,
|
|
277
|
+
};
|
|
278
|
+
}
|
|
279
|
+
|
|
255
280
|
const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
256
281
|
{
|
|
257
282
|
type: 'textarea',
|
|
@@ -260,6 +285,12 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
|
260
285
|
rows: 2,
|
|
261
286
|
},
|
|
262
287
|
];
|
|
288
|
+
function normalizeExpectedOutcomeField(field) {
|
|
289
|
+
return {
|
|
290
|
+
...field,
|
|
291
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
292
|
+
};
|
|
293
|
+
}
|
|
263
294
|
/**
|
|
264
295
|
* Creates a new test case with default values
|
|
265
296
|
* @returns A new TestCase object with a unique ID
|
|
@@ -269,9 +300,6 @@ function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA)
|
|
|
269
300
|
id: v4(),
|
|
270
301
|
question: '',
|
|
271
302
|
expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
|
|
272
|
-
evaluationParameters: {
|
|
273
|
-
approach: EvaluationApproach.EXACT,
|
|
274
|
-
},
|
|
275
303
|
isRunning: false,
|
|
276
304
|
};
|
|
277
305
|
}
|
|
@@ -281,35 +309,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
281
309
|
return {
|
|
282
310
|
type: 'text',
|
|
283
311
|
label: schemaField.label,
|
|
284
|
-
required: schemaField.required,
|
|
285
312
|
placeholder: schemaField.placeholder,
|
|
286
313
|
value: '',
|
|
314
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
287
315
|
};
|
|
288
316
|
case 'textarea':
|
|
289
317
|
return {
|
|
290
318
|
type: 'textarea',
|
|
291
319
|
label: schemaField.label,
|
|
292
|
-
required: schemaField.required,
|
|
293
320
|
placeholder: schemaField.placeholder,
|
|
294
321
|
rows: schemaField.rows,
|
|
295
322
|
value: '',
|
|
323
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
296
324
|
};
|
|
297
325
|
case 'chips-input':
|
|
298
326
|
return {
|
|
299
327
|
type: 'chips-input',
|
|
300
328
|
label: schemaField.label,
|
|
301
|
-
required: schemaField.required,
|
|
302
329
|
placeholder: schemaField.placeholder,
|
|
303
330
|
value: [],
|
|
331
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
304
332
|
};
|
|
305
333
|
case 'select':
|
|
306
334
|
return {
|
|
307
335
|
type: 'select',
|
|
308
336
|
label: schemaField.label,
|
|
309
|
-
required: schemaField.required,
|
|
310
337
|
placeholder: schemaField.placeholder,
|
|
311
338
|
value: '',
|
|
312
339
|
options: schemaField.options,
|
|
340
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
313
341
|
};
|
|
314
342
|
default: {
|
|
315
343
|
const _exhaustiveCheck = schemaField;
|
|
@@ -320,32 +348,19 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
320
348
|
function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
|
|
321
349
|
return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
|
|
322
350
|
}
|
|
323
|
-
function migrateLegacyExpectedOutcomeString(value) {
|
|
324
|
-
return [
|
|
325
|
-
{
|
|
326
|
-
type: 'textarea',
|
|
327
|
-
label: 'Expected Outcome',
|
|
328
|
-
value,
|
|
329
|
-
},
|
|
330
|
-
];
|
|
331
|
-
}
|
|
332
351
|
/**
|
|
333
352
|
* Creates a runtime test case from validated input data.
|
|
334
|
-
* The input is expected to already satisfy `TestCaseInput
|
|
335
|
-
* and this function only performs normalization/defaulting
|
|
353
|
+
* The input is expected to already satisfy `TestCaseInput`,
|
|
354
|
+
* and this function only performs normalization/defaulting.
|
|
336
355
|
*
|
|
337
356
|
* @param data - Validated test case input
|
|
338
357
|
* @returns A normalized TestCase object with runtime defaults applied
|
|
339
358
|
*/
|
|
340
359
|
function createTestCaseFromInput(data) {
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
expectedOutcome
|
|
344
|
-
}
|
|
345
|
-
else {
|
|
346
|
-
expectedOutcome = data.expectedOutcome;
|
|
347
|
-
}
|
|
348
|
-
return { ...data, expectedOutcome };
|
|
360
|
+
return {
|
|
361
|
+
...data,
|
|
362
|
+
expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
|
|
363
|
+
};
|
|
349
364
|
}
|
|
350
365
|
|
|
351
366
|
/** A special constant with type `never` */
|
|
@@ -4935,27 +4950,43 @@ function superRefine(fn) {
|
|
|
4935
4950
|
const nonEmptyString = string().trim().min(1);
|
|
4936
4951
|
const optionalPositiveInt = number().int().positive().optional();
|
|
4937
4952
|
const optionalString = string().optional();
|
|
4938
|
-
const optionalBoolean = boolean().optional();
|
|
4939
4953
|
const selectOptionsSchema = array(nonEmptyString).min(1);
|
|
4954
|
+
const optionalNumber = number().optional();
|
|
4955
|
+
const evaluationParametersSchema = object({
|
|
4956
|
+
approach: _enum(EvaluationApproach),
|
|
4957
|
+
threshold: optionalNumber,
|
|
4958
|
+
});
|
|
4959
|
+
const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
|
|
4960
|
+
if (!isApproachAllowedForFieldType('select', parameters.approach)) {
|
|
4961
|
+
ctx.addIssue({
|
|
4962
|
+
code: 'custom',
|
|
4963
|
+
path: ['approach'],
|
|
4964
|
+
message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
|
|
4965
|
+
});
|
|
4966
|
+
}
|
|
4967
|
+
});
|
|
4940
4968
|
const defaultExpectedOutcomeBaseSchema = object({
|
|
4941
4969
|
label: nonEmptyString,
|
|
4942
|
-
required: optionalBoolean,
|
|
4943
4970
|
placeholder: optionalString,
|
|
4944
4971
|
});
|
|
4945
4972
|
const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
|
|
4946
4973
|
text: baseSchema.extend({
|
|
4947
4974
|
type: literal('text'),
|
|
4975
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4948
4976
|
}),
|
|
4949
4977
|
textarea: baseSchema.extend({
|
|
4950
4978
|
type: literal('textarea'),
|
|
4951
4979
|
rows: optionalPositiveInt,
|
|
4980
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4952
4981
|
}),
|
|
4953
4982
|
chipsInput: baseSchema.extend({
|
|
4954
4983
|
type: literal('chips-input'),
|
|
4984
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4955
4985
|
}),
|
|
4956
4986
|
select: baseSchema.extend({
|
|
4957
4987
|
type: literal('select'),
|
|
4958
4988
|
options: selectOptionsSchema,
|
|
4989
|
+
evaluationParameters: selectEvaluationParametersSchema.optional(),
|
|
4959
4990
|
}),
|
|
4960
4991
|
});
|
|
4961
4992
|
function hasDuplicateChips(values) {
|
|
@@ -5017,33 +5048,16 @@ function validateExpectedOutcomeSchema(schema) {
|
|
|
5017
5048
|
}
|
|
5018
5049
|
}
|
|
5019
5050
|
|
|
5020
|
-
const
|
|
5021
|
-
approach: _enum(EvaluationApproach),
|
|
5022
|
-
threshold: number().optional(),
|
|
5023
|
-
});
|
|
5024
|
-
const baseTestCaseInputSchema = object({
|
|
5051
|
+
const testCaseInputSchema = object({
|
|
5025
5052
|
id: string(),
|
|
5026
5053
|
question: string(),
|
|
5027
|
-
evaluationParameters: evaluationParametersSchema.optional(),
|
|
5028
|
-
});
|
|
5029
|
-
const legacyTestCaseInputSchema = baseTestCaseInputSchema.extend({
|
|
5030
|
-
expectedOutcome: string(),
|
|
5031
|
-
});
|
|
5032
|
-
const v2TestCaseInputSchema = baseTestCaseInputSchema.extend({
|
|
5033
5054
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5034
5055
|
});
|
|
5035
|
-
const
|
|
5036
|
-
legacyTestCaseInputSchema,
|
|
5037
|
-
v2TestCaseInputSchema,
|
|
5038
|
-
]);
|
|
5039
|
-
const testCaseInputArraySchema = array(testCaseInputSchema).min(1, {
|
|
5040
|
-
message: 'The test suite is empty. Please provide at least one test case.',
|
|
5041
|
-
});
|
|
5056
|
+
const testCaseInputArraySchema = array(testCaseInputSchema);
|
|
5042
5057
|
object({
|
|
5043
5058
|
id: string(),
|
|
5044
5059
|
question: string(),
|
|
5045
5060
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5046
|
-
evaluationParameters: evaluationParametersSchema.optional(),
|
|
5047
5061
|
output: string().optional(),
|
|
5048
5062
|
isRunning: boolean().optional(),
|
|
5049
5063
|
error: string().optional(),
|
|
@@ -5094,19 +5108,69 @@ function importTestSuite(jsonContent) {
|
|
|
5094
5108
|
}
|
|
5095
5109
|
}
|
|
5096
5110
|
|
|
5111
|
+
function applyExpectedOutcomeChange(testCase, change) {
|
|
5112
|
+
const { index } = change;
|
|
5113
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5114
|
+
const target = expectedOutcome[index];
|
|
5115
|
+
if (!target) {
|
|
5116
|
+
return testCase;
|
|
5117
|
+
}
|
|
5118
|
+
switch (change.operation) {
|
|
5119
|
+
case 'set-value': {
|
|
5120
|
+
if (target.type === 'chips-input') {
|
|
5121
|
+
return testCase;
|
|
5122
|
+
}
|
|
5123
|
+
expectedOutcome[index] = {
|
|
5124
|
+
...target,
|
|
5125
|
+
value: change.value,
|
|
5126
|
+
};
|
|
5127
|
+
return { ...testCase, expectedOutcome };
|
|
5128
|
+
}
|
|
5129
|
+
case 'add-chip': {
|
|
5130
|
+
if (target.type !== 'chips-input') {
|
|
5131
|
+
return testCase;
|
|
5132
|
+
}
|
|
5133
|
+
expectedOutcome[index] = {
|
|
5134
|
+
...target,
|
|
5135
|
+
value: [...target.value, change.value],
|
|
5136
|
+
};
|
|
5137
|
+
return { ...testCase, expectedOutcome };
|
|
5138
|
+
}
|
|
5139
|
+
case 'remove-chip': {
|
|
5140
|
+
if (target.type !== 'chips-input') {
|
|
5141
|
+
return testCase;
|
|
5142
|
+
}
|
|
5143
|
+
expectedOutcome[index] = {
|
|
5144
|
+
...target,
|
|
5145
|
+
value: target.value.filter(chip => chip !== change.value),
|
|
5146
|
+
};
|
|
5147
|
+
return { ...testCase, expectedOutcome };
|
|
5148
|
+
}
|
|
5149
|
+
case 'set-evaluation-approach':
|
|
5150
|
+
return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
|
|
5151
|
+
}
|
|
5152
|
+
}
|
|
5097
5153
|
/**
|
|
5098
|
-
* Updates the evaluation approach for a
|
|
5099
|
-
*
|
|
5100
|
-
* @param approach - The new evaluation approach
|
|
5101
|
-
* @returns Updated test case with the new evaluation approach
|
|
5154
|
+
* Updates the evaluation approach for a specific expected outcome field.
|
|
5155
|
+
* Select fields always use exact matching.
|
|
5102
5156
|
*/
|
|
5103
|
-
function
|
|
5157
|
+
function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
|
|
5158
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5159
|
+
const target = expectedOutcome[fieldIndex];
|
|
5160
|
+
if (!target) {
|
|
5161
|
+
return testCase;
|
|
5162
|
+
}
|
|
5163
|
+
const currentEvaluationParameters = target.evaluationParameters;
|
|
5164
|
+
expectedOutcome[fieldIndex] = {
|
|
5165
|
+
...target,
|
|
5166
|
+
evaluationParameters: normalizeEvaluationParametersForField(target.type, {
|
|
5167
|
+
...currentEvaluationParameters,
|
|
5168
|
+
approach,
|
|
5169
|
+
}),
|
|
5170
|
+
};
|
|
5104
5171
|
return {
|
|
5105
5172
|
...testCase,
|
|
5106
|
-
|
|
5107
|
-
...testCase.evaluationParameters,
|
|
5108
|
-
approach: approach,
|
|
5109
|
-
},
|
|
5173
|
+
expectedOutcome,
|
|
5110
5174
|
};
|
|
5111
5175
|
}
|
|
5112
5176
|
|
|
@@ -29858,57 +29922,78 @@ function performBleuEvaluation(request) {
|
|
|
29858
29922
|
|
|
29859
29923
|
class LLMEvaluationEngine {
|
|
29860
29924
|
async evaluateResponse(request, callback) {
|
|
29861
|
-
|
|
29862
|
-
const
|
|
29863
|
-
switch (approach) {
|
|
29864
|
-
case EvaluationApproach.BLEU: {
|
|
29865
|
-
const bleuResult = performBleuEvaluation(request);
|
|
29866
|
-
callback(bleuResult);
|
|
29867
|
-
break;
|
|
29868
|
-
}
|
|
29869
|
-
case EvaluationApproach.EXACT: {
|
|
29870
|
-
const exactResult = await performEvaluation(request);
|
|
29871
|
-
callback(exactResult);
|
|
29872
|
-
break;
|
|
29873
|
-
}
|
|
29874
|
-
case EvaluationApproach.ROUGE_1: {
|
|
29875
|
-
const rougeResult = await performRouge1Evaluation(request);
|
|
29876
|
-
callback(rougeResult);
|
|
29877
|
-
break;
|
|
29878
|
-
}
|
|
29879
|
-
case EvaluationApproach.ROUGE_L: {
|
|
29880
|
-
const rougeLResult = await performRougeLEvaluation(request);
|
|
29881
|
-
callback(rougeLResult);
|
|
29882
|
-
break;
|
|
29883
|
-
}
|
|
29884
|
-
case EvaluationApproach.SEMANTIC: {
|
|
29885
|
-
const semanticResult = await performSemanticEvaluation(request);
|
|
29886
|
-
callback(semanticResult);
|
|
29887
|
-
break;
|
|
29888
|
-
}
|
|
29889
|
-
default: {
|
|
29890
|
-
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
29891
|
-
const fallbackResult = await performEvaluation(request);
|
|
29892
|
-
callback(fallbackResult);
|
|
29893
|
-
}
|
|
29894
|
-
}
|
|
29895
|
-
}
|
|
29896
|
-
catch (error) {
|
|
29897
|
-
console.error('Evaluation failed:', error);
|
|
29898
|
-
const errorResult = {
|
|
29925
|
+
const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
|
|
29926
|
+
const fieldRequest = {
|
|
29899
29927
|
testCaseId: request.testCaseId,
|
|
29928
|
+
question: request.question,
|
|
29929
|
+
actualResponse: request.actualResponse,
|
|
29930
|
+
expectedOutcome: field.expectedValue,
|
|
29931
|
+
evaluationParameters: field.evaluationParameters,
|
|
29932
|
+
};
|
|
29933
|
+
const result = await this.evaluateField(fieldRequest);
|
|
29934
|
+
const fieldResult = {
|
|
29935
|
+
index: field.index,
|
|
29936
|
+
label: field.label,
|
|
29937
|
+
type: field.type,
|
|
29938
|
+
expectedValue: field.expectedValue,
|
|
29939
|
+
passed: result.passed,
|
|
29940
|
+
keywordMatches: result.keywordMatches,
|
|
29941
|
+
evaluationParameters: result.evaluationParameters,
|
|
29942
|
+
evaluationApproachResult: result.evaluationApproachResult,
|
|
29943
|
+
};
|
|
29944
|
+
return fieldResult;
|
|
29945
|
+
}));
|
|
29946
|
+
const fieldResults = settledResults.map((settledResult, index) => {
|
|
29947
|
+
const field = request.fields[index];
|
|
29948
|
+
if (settledResult.status === 'fulfilled') {
|
|
29949
|
+
return settledResult.value;
|
|
29950
|
+
}
|
|
29951
|
+
return {
|
|
29952
|
+
index: field.index,
|
|
29953
|
+
label: field.label,
|
|
29954
|
+
type: field.type,
|
|
29955
|
+
expectedValue: field.expectedValue,
|
|
29900
29956
|
passed: false,
|
|
29901
29957
|
keywordMatches: [],
|
|
29902
|
-
|
|
29903
|
-
evaluationParameters: request.evaluationParameters,
|
|
29958
|
+
evaluationParameters: field.evaluationParameters,
|
|
29904
29959
|
evaluationApproachResult: {
|
|
29905
29960
|
score: 0,
|
|
29906
|
-
approachUsed:
|
|
29961
|
+
approachUsed: field.evaluationParameters.approach,
|
|
29907
29962
|
},
|
|
29963
|
+
error: this.getSafeErrorMessage(settledResult.reason),
|
|
29908
29964
|
};
|
|
29909
|
-
|
|
29965
|
+
});
|
|
29966
|
+
const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
|
|
29967
|
+
const passed = fieldResults.every(field => field.passed && !field.error);
|
|
29968
|
+
callback({
|
|
29969
|
+
testCaseId: request.testCaseId,
|
|
29970
|
+
passed,
|
|
29971
|
+
keywordMatches,
|
|
29972
|
+
fieldResults,
|
|
29973
|
+
timestamp: new Date().toISOString(),
|
|
29974
|
+
});
|
|
29975
|
+
}
|
|
29976
|
+
async evaluateField(request) {
|
|
29977
|
+
const approach = request.evaluationParameters.approach;
|
|
29978
|
+
switch (approach) {
|
|
29979
|
+
case EvaluationApproach.BLEU:
|
|
29980
|
+
return performBleuEvaluation(request);
|
|
29981
|
+
case EvaluationApproach.EXACT:
|
|
29982
|
+
return performEvaluation(request);
|
|
29983
|
+
case EvaluationApproach.ROUGE_1:
|
|
29984
|
+
return performRouge1Evaluation(request);
|
|
29985
|
+
case EvaluationApproach.ROUGE_L:
|
|
29986
|
+
return performRougeLEvaluation(request);
|
|
29987
|
+
case EvaluationApproach.SEMANTIC:
|
|
29988
|
+
return performSemanticEvaluation(request);
|
|
29989
|
+
default:
|
|
29990
|
+
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
29991
|
+
return performEvaluation(request);
|
|
29910
29992
|
}
|
|
29911
29993
|
}
|
|
29994
|
+
getSafeErrorMessage(error) {
|
|
29995
|
+
return error instanceof Error ? error.message : 'Field evaluation failed.';
|
|
29996
|
+
}
|
|
29912
29997
|
}
|
|
29913
29998
|
|
|
29914
29999
|
/**
|
|
@@ -29929,12 +30014,18 @@ class EvaluationService {
|
|
|
29929
30014
|
console.warn('⚠️ No output to evaluate for test case:', testCase.id);
|
|
29930
30015
|
return;
|
|
29931
30016
|
}
|
|
30017
|
+
const fields = (testCase.expectedOutcome || []).map((field, index) => ({
|
|
30018
|
+
index,
|
|
30019
|
+
label: field.label,
|
|
30020
|
+
type: field.type,
|
|
30021
|
+
expectedValue: getFieldExpectedValue(field),
|
|
30022
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
30023
|
+
}));
|
|
29932
30024
|
const evaluationRequest = {
|
|
29933
30025
|
testCaseId: testCase.id,
|
|
29934
30026
|
question: testCase.question,
|
|
29935
|
-
expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
|
|
29936
30027
|
actualResponse: testCase.output,
|
|
29937
|
-
|
|
30028
|
+
fields,
|
|
29938
30029
|
};
|
|
29939
30030
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
29940
30031
|
console.log('📊 Evaluation result received:', result);
|
|
@@ -29942,6 +30033,12 @@ class EvaluationService {
|
|
|
29942
30033
|
});
|
|
29943
30034
|
}
|
|
29944
30035
|
}
|
|
30036
|
+
function getFieldExpectedValue(field) {
|
|
30037
|
+
if (field.type === 'chips-input') {
|
|
30038
|
+
return field.value.join(', ');
|
|
30039
|
+
}
|
|
30040
|
+
return field.value;
|
|
30041
|
+
}
|
|
29945
30042
|
|
|
29946
30043
|
const Button = (props, children) => {
|
|
29947
30044
|
const { variant = 'primary', size = 'md', disabled = false, loading = false, onClick, type = 'button', 'class': className = '', icon, 'aria-label': ariaLabel, } = props;
|
|
@@ -29984,7 +30081,9 @@ const ResponseOutput = ({ output, isRunning, }) => {
|
|
|
29984
30081
|
};
|
|
29985
30082
|
|
|
29986
30083
|
/**
 * Functional component for the evaluation column of a test-case row.
 *
 * - Without a `result`, a placeholder is rendered; it reads "Evaluating..."
 *   while `isRunning` is truthy and is empty otherwise.
 * - With a `result` that carries field results, one card is rendered per
 *   entry: label, strategy, pass/fail badge, optional error message, score
 *   and the number of keyword matches found.
 * - With a `result` that has no field results, an empty result container is
 *   rendered.
 *
 * @param {{ result?: object, isRunning?: boolean }} props - Component props.
 * @returns {*} The vnode produced by `h`.
 */
const EvaluationSummary = ({ result, isRunning, }) => {
    const fieldResults = result?.fieldResults || [];
    const hasFieldResults = fieldResults.length > 0;
    // Renders a single per-field card. Each nested part is read defensively so
    // that an entry carrying only an `error` cannot throw and take the whole
    // render down with it.
    const renderFieldResult = (fieldResult) => {
        const approach = fieldResult.evaluationParameters?.approach ?? 'n/a';
        const rawScore = fieldResult.evaluationApproachResult?.score;
        const score = typeof rawScore === 'number' ? rawScore.toFixed(2) : 'n/a';
        const keywordMatches = Array.isArray(fieldResult.keywordMatches)
            ? fieldResult.keywordMatches
            : [];
        const foundCount = keywordMatches.filter(match => match.found).length;
        return (h("div", { class: "evaluation-summary__field-result" },
            h("div", { class: "evaluation-summary__field-header" },
                h("span", { class: "evaluation-summary__field-label" }, fieldResult.label),
                h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", approach)),
            h("div", { class: "evaluation-summary__field-details" },
                h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'),
                fieldResult.error && (h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)),
                h("span", null, "Score: ", score),
                h("span", null, "Matches:", ' ', foundCount, "/", keywordMatches.length))));
    };
    if (!result) {
        return (h("div", { class: "evaluation-summary" },
            h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : '')));
    }
    return (h("div", { class: "evaluation-summary" },
        h("div", { class: "evaluation-summary__result" }, hasFieldResults
            ? (h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(renderFieldResult)))
            : null)));
};
|
|
29989
30088
|
|
|
29990
30089
|
const IconButton = (props, children) => {
|
|
@@ -30020,6 +30119,24 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30020
30119
|
const emit = (detail) => onExpectedOutcomeChange({
|
|
30021
30120
|
detail,
|
|
30022
30121
|
});
|
|
30122
|
+
const buildEvaluationConfig = (index, optionList) => ({
|
|
30123
|
+
name: `expectedOutcomeEvaluation-${index}`,
|
|
30124
|
+
fieldType: FormFieldType.SELECT,
|
|
30125
|
+
label: 'Evaluation Approach',
|
|
30126
|
+
placeholder: 'Select evaluation approach…',
|
|
30127
|
+
required: true,
|
|
30128
|
+
optionList,
|
|
30129
|
+
defaultValue: EvaluationApproach.EXACT,
|
|
30130
|
+
});
|
|
30131
|
+
const renderEvaluationSelector = (field, index) => {
|
|
30132
|
+
const optionList = getAllowedApproachesForFieldType(field.type);
|
|
30133
|
+
return (h("app-select", { config: buildEvaluationConfig(index, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
|
|
30134
|
+
testCaseId,
|
|
30135
|
+
index,
|
|
30136
|
+
operation: 'set-evaluation-approach',
|
|
30137
|
+
value: e.detail.value,
|
|
30138
|
+
}) }));
|
|
30139
|
+
};
|
|
30023
30140
|
return (h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index) => {
|
|
30024
30141
|
if (field.type === 'textarea') {
|
|
30025
30142
|
const config = {
|
|
@@ -30027,15 +30144,15 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30027
30144
|
fieldType: FormFieldType.TEXT_AREA,
|
|
30028
30145
|
label: field.label,
|
|
30029
30146
|
placeholder: field.placeholder,
|
|
30030
|
-
required:
|
|
30147
|
+
required: true,
|
|
30031
30148
|
rows: field.rows || 2,
|
|
30032
30149
|
};
|
|
30033
|
-
return (h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30150
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30034
30151
|
testCaseId,
|
|
30035
30152
|
index,
|
|
30036
30153
|
operation: 'set-value',
|
|
30037
30154
|
value: e.detail.value,
|
|
30038
|
-
}) }));
|
|
30155
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
30039
30156
|
}
|
|
30040
30157
|
if (field.type === 'chips-input') {
|
|
30041
30158
|
const config = {
|
|
@@ -30043,9 +30160,9 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30043
30160
|
fieldType: FormFieldType.CHIPS,
|
|
30044
30161
|
label: field.label,
|
|
30045
30162
|
placeholder: field.placeholder,
|
|
30046
|
-
required:
|
|
30163
|
+
required: true,
|
|
30047
30164
|
};
|
|
30048
|
-
return (h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
30165
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
30049
30166
|
testCaseId,
|
|
30050
30167
|
index,
|
|
30051
30168
|
operation: 'add-chip',
|
|
@@ -30055,7 +30172,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30055
30172
|
index,
|
|
30056
30173
|
operation: 'remove-chip',
|
|
30057
30174
|
value: e.detail.value,
|
|
30058
|
-
}) }));
|
|
30175
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
30059
30176
|
}
|
|
30060
30177
|
if (field.type === 'select') {
|
|
30061
30178
|
const config = {
|
|
@@ -30063,26 +30180,26 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30063
30180
|
fieldType: FormFieldType.SELECT,
|
|
30064
30181
|
label: field.label,
|
|
30065
30182
|
placeholder: field.placeholder,
|
|
30066
|
-
required:
|
|
30183
|
+
required: true,
|
|
30067
30184
|
optionList: field.options,
|
|
30068
30185
|
};
|
|
30069
|
-
return (h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30186
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30070
30187
|
testCaseId,
|
|
30071
30188
|
index,
|
|
30072
30189
|
operation: 'set-value',
|
|
30073
30190
|
value: e.detail.value,
|
|
30074
|
-
}) }));
|
|
30191
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
30075
30192
|
}
|
|
30076
|
-
return (h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30193
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30077
30194
|
testCaseId,
|
|
30078
30195
|
index,
|
|
30079
30196
|
operation: 'set-value',
|
|
30080
30197
|
value: e.target.value,
|
|
30081
|
-
}) })));
|
|
30198
|
+
}) })), renderEvaluationSelector(field, index)));
|
|
30082
30199
|
})));
|
|
30083
30200
|
};
|
|
30084
30201
|
|
|
30085
|
-
const LLMTestCaseRow = ({ testCase, onRun, onDelete,
|
|
30202
|
+
const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
30086
30203
|
const questionConfig = {
|
|
30087
30204
|
name: 'question',
|
|
30088
30205
|
fieldType: FormFieldType.TEXT_AREA,
|
|
@@ -30092,26 +30209,17 @@ const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTes
|
|
|
30092
30209
|
required: true,
|
|
30093
30210
|
rows: 3,
|
|
30094
30211
|
};
|
|
30095
|
-
const evaluationConfig = {
|
|
30096
|
-
name: 'EvaluationApproach',
|
|
30097
|
-
fieldType: FormFieldType.SELECT,
|
|
30098
|
-
label: 'Evaluation',
|
|
30099
|
-
placeholder: 'Select evaluation approach…',
|
|
30100
|
-
required: true,
|
|
30101
|
-
optionList: EvaluationApproachValues,
|
|
30102
|
-
defaultValue: EvaluationApproach.EXACT,
|
|
30103
|
-
};
|
|
30104
30212
|
return (h("div", { class: "test-case-row", key: testCase.id }, h("div", { class: "test-case-row__input-column" }, h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
|
|
30105
30213
|
detail: {
|
|
30106
30214
|
testCaseId: testCase.id,
|
|
30107
30215
|
key: 'question',
|
|
30108
30216
|
value: e.detail.value,
|
|
30109
30217
|
},
|
|
30110
|
-
}) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })
|
|
30218
|
+
}) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
30111
30219
|
};
|
|
30112
30220
|
|
|
30113
|
-
const LLMTestCases = ({ testCases, onRun, onDelete,
|
|
30114
|
-
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete,
|
|
30221
|
+
/**
 * Functional component that renders the test-case table: a header row, one
 * `LLMTestCaseRow` per test case, and the "+ Add Question" button.
 *
 * @param {object} props - Component props.
 * @param {Array<object>} [props.testCases] - Test cases to list; a missing
 *   list is rendered as an empty table.
 * @param {Function} props.onRun - Forwarded to every row.
 * @param {Function} props.onDelete - Forwarded to every row.
 * @param {Function} props.onAddTestCase - Click handler of the add button.
 * @param {Function} props.handleTestCaseChange - Forwarded to every row.
 * @param {Function} props.onExpectedOutcomeChange - Forwarded to every row.
 * @returns {*} The vnode produced by `h`.
 */
const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
    const columnTitles = ['Input', 'Output', 'Evaluation', 'Actions'];
    // Same `|| []` guard the expected-outcome renderer applies to its field
    // list, so a missing list renders an empty table instead of throwing.
    const rows = (testCases || []).map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange })));
    return (h("div", { class: "test-cases" },
        h("div", { class: "test-cases__column-headers" }, ...columnTitles.map(title => h("div", { class: "test-cases__column-header" }, title))),
        rows,
        h("div", { class: "test-cases__add-section" },
            h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
};
|
|
30116
30224
|
|
|
30117
30225
|
/**
 * Returns the design-token stylesheet: the custom properties declared on
 * `:host` (spacing, radii, typography, shadows, z-indexes, sizes, colours)
 * and the colour overrides for `:host([data-theme='dark'])`.
 *
 * @returns {string} CSS text.
 */
const tokensCss = () => {
    return `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 
0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
};
|
|
@@ -30122,11 +30230,11 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
|
|
|
30122
30230
|
|
|
30123
30231
|
/**
 * Returns the stylesheet for the test-case table: container, column-header
 * grid, the add-question section, and the rule hiding the column headers at
 * widths of 1200px and below.
 *
 * @returns {string} CSS text.
 */
const llmTestCasesCss = () => {
    return `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
};
|
|
30124
30232
|
|
|
30125
|
-
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30233
|
+
/**
 * Returns the stylesheet for a single test-case row: the four-column grid,
 * the input column, the expected-outcome renderer and its per-field group
 * boxes, plus the overrides for widths of 1200px and 768px and below.
 *
 * @returns {string} CSS text.
 */
const llmTestCaseRowCss = () => {
    return `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
};
|
|
30126
30234
|
|
|
30127
30235
|
/**
 * Returns the stylesheet for the actions column of a test-case row,
 * including the overrides for widths of 1200px and 768px and below.
 *
 * @returns {string} CSS text.
 */
const rowActionsCss = () => {
    return `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
};
|
|
30128
30236
|
|
|
30129
|
-
const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-
|
|
30237
|
+
/**
 * Returns the stylesheet for the evaluation column: the container, the
 * per-field result cards (header, label, approach, details, passed/failed
 * status badges, error message), the placeholder, the result wrapper, and
 * the overrides for widths of 1200px and 768px and below.
 *
 * @returns {string} CSS text.
 */
const evaluationSummaryCss = () => {
    return `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__field-results{display:flex;flex-direction:column;gap:var(--spacing-2);margin-top:var(--spacing-2)}.evaluation-summary__field-result{border:var(--border-width) solid var(--border);border-radius:var(--radius-md);padding:var(--spacing-2);display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-header{display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-label{font-weight:var(--font-weight-semibold);font-size:var(--font-size-xs)}.evaluation-summary__field-approach{color:var(--muted-foreground);font-size:11px}.evaluation-summary__field-details{display:flex;flex-direction:column;gap:var(--spacing-1);font-size:var(--font-size-xs)}.evaluation-summary__field-status{width:fit-content;padding:2px var(--spacing-2);border-radius:var(--radius-sm);font-size:11px;font-weight:var(--font-weight-semibold);border:var(--border-width) solid transparent}.evaluation-summary__field-status--passed{background:var(--success);color:var(--success-foreground);border-color:var(--success)}.evaluation-summary__field-status--failed{background:var(--destructive);color:var(--destructive-foreground);border-color:var(--destructive)}.evaluation-summary__error-message{color:var(--destructive);font-size:var(--font-size-xs)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
};
|
|
30130
30238
|
|
|
30131
30239
|
/**
 * Returns the stylesheet for the response-output column: the container, the
 * scrollable content box, the placeholder, and the overrides for widths of
 * 1200px and 768px and below.
 *
 * @returns {string} CSS text.
 */
const responseOutputCss = () => {
    return `.response-output{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.response-output__content{background:var(--muted);border:var(--border-width) solid var(--border);border-radius:var(--radius);padding:var(--spacing-4);font-size:var(--font-size-sm);line-height:var(--line-height-relaxed);color:var(--foreground);white-space:pre-wrap;word-wrap:break-word;flex:1;overflow-y:auto;max-height:250px;overflow-x:scroll}.response-output__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}@media (max-width: 1200px){.response-output{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.response-output{padding:var(--spacing-4)}}`;
};
|
|
30132
30240
|
|
|
@@ -30159,9 +30267,6 @@ const LLMTestRunner = class {
|
|
|
30159
30267
|
value: '',
|
|
30160
30268
|
},
|
|
30161
30269
|
],
|
|
30162
|
-
evaluationParameters: {
|
|
30163
|
-
approach: EvaluationApproach.EXACT,
|
|
30164
|
-
},
|
|
30165
30270
|
isRunning: false,
|
|
30166
30271
|
},
|
|
30167
30272
|
];
|
|
@@ -30268,52 +30373,13 @@ const LLMTestRunner = class {
|
|
|
30268
30373
|
deleteTestCase(id) {
|
|
30269
30374
|
this.testCases = this.testCases.filter(tc => tc.id !== id);
|
|
30270
30375
|
}
|
|
30271
|
-
updateApproach(testCase, approach) {
|
|
30272
|
-
if (testCase) {
|
|
30273
|
-
const updated = updateApproach(testCase, approach);
|
|
30274
|
-
this.updateTestCase(testCase.id, {
|
|
30275
|
-
evaluationParameters: updated.evaluationParameters,
|
|
30276
|
-
});
|
|
30277
|
-
}
|
|
30278
|
-
}
|
|
30279
30376
|
/**
 * Handles an expected-outcome change event from a test-case row.
 *
 * `event.detail` carries the target `testCaseId`; the remaining detail
 * properties are passed as the change to `applyExpectedOutcomeChange`, which
 * is defined outside this block. Only the matching test case is replaced;
 * every other entry keeps its identity.
 *
 * @param {{ detail: { testCaseId: * } }} event - Change event.
 */
handleExpectedOutcomeChange = (event) => {
    const { testCaseId, ...change } = event.detail;
    const applyIfTarget = (tc) => (tc.id === testCaseId
        ? applyExpectedOutcomeChange(tc, change)
        : tc);
    this.testCases = this.testCases.map(tc => applyIfTarget(tc));
};
|
|
30319
30385
|
async evaluateResponse(testCase) {
|
|
@@ -30413,7 +30479,7 @@ const LLMTestRunner = class {
|
|
|
30413
30479
|
}
|
|
30414
30480
|
}
|
|
30415
30481
|
render() {
|
|
30416
|
-
return (h("div", { key: '
|
|
30482
|
+
return (h("div", { key: 'e3d007b453f770fcb59c29f8ee83bd8a35e82a34', class: "test-runner-container" }, h(LLMTestRunnerHeader, { key: 'b7c44bf4807fe8d9e5de514818420d67d2e0dbfb', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), h(ErrorMessage, { key: '697237ec0f8d2e704609fd0b240629f22c2a3ef6', message: this.error, onClear: () => (this.error = '') }), h("div", { key: '64a623f897dfb96d922ddc0cbdfcf529c52bef76', class: "test-runner-container__content" }, h(LLMTestCases, { key: '017da41567c5c13933d9cf31d1a972743bd9b100', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
|
|
30417
30483
|
}
|
|
30418
30484
|
};
|
|
30419
30485
|
// Component stylesheet: the output of every style factory, concatenated in
// the listed order. The factories are called first to last.
LLMTestRunner.style = [
    tokensCss(),
    llmTestRunnerCss(),
    llmTestRunnerHeaderCss(),
    llmTestCasesCss(),
    llmTestCaseRowCss(),
    rowActionsCss(),
    evaluationSummaryCss(),
    responseOutputCss(),
    errorMessageCss(),
    buttonCss(),
    iconButtonCss(),
].join('');
|