llm-testrunner-components 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +165 -242
- package/dist/cjs/index.cjs.js +298 -232
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +6 -49
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js +0 -4
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-results-csv.js +47 -33
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +20 -2
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/test-case.js +2 -20
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/collection/types/test-case.js.map +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-Bb89MYYu.js +7 -0
- package/dist/components/p-Bb89MYYu.js.map +1 -0
- package/dist/esm/index.js +298 -232
- package/dist/esm/index.js.map +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +0 -1
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
- package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
- package/dist/types/lib/evaluation/index.d.ts +0 -1
- package/dist/types/lib/evaluation/types.d.ts +26 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
- package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
- package/dist/types/schemas/expected-outcome.d.ts +65 -17
- package/dist/types/schemas/test-case.d.ts +51 -95
- package/dist/types/types/llm-test-runner.d.ts +1 -1
- package/dist/types/types/test-case.d.ts +1 -1
- package/package.json +9 -2
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
- package/dist/components/p-BF90yb1z.js +0 -7
- package/dist/components/p-BF90yb1z.js.map +0 -1
- /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
package/dist/cjs/index.cjs.js
CHANGED
|
@@ -64,20 +64,6 @@ class RateLimitedFetcher {
|
|
|
64
64
|
}
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
var EvaluationApproach;
|
|
68
|
-
(function (EvaluationApproach) {
|
|
69
|
-
EvaluationApproach["EXACT"] = "exact";
|
|
70
|
-
EvaluationApproach["SEMANTIC"] = "semantic";
|
|
71
|
-
EvaluationApproach["ROUGE_1"] = "rouge-1";
|
|
72
|
-
EvaluationApproach["ROUGE_L"] = "rouge-L";
|
|
73
|
-
EvaluationApproach["BLEU"] = "bleu";
|
|
74
|
-
})(EvaluationApproach || (EvaluationApproach = {}));
|
|
75
|
-
// Array of all evaluation approach values for UI components
|
|
76
|
-
const EvaluationApproachValues = Object.values(EvaluationApproach);
|
|
77
|
-
const DEFAULT_ROUGE_PASS_SCORE = 0.7;
|
|
78
|
-
const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
|
|
79
|
-
const DEFAULT_BLEU_PASS_SCORE = 0.7;
|
|
80
|
-
|
|
81
67
|
/**
|
|
82
68
|
* Reads a file asynchronously and returns its content as a string
|
|
83
69
|
* @param file - The File object to read
|
|
@@ -120,23 +106,10 @@ function formatTestSuiteAsJson(testCases) {
|
|
|
120
106
|
id: testCase.id,
|
|
121
107
|
question: testCase.question,
|
|
122
108
|
expectedOutcome: testCase.expectedOutcome,
|
|
123
|
-
evaluationParameters: testCase.evaluationParameters,
|
|
124
109
|
}));
|
|
125
110
|
return JSON.stringify(exportData, null, 2);
|
|
126
111
|
}
|
|
127
112
|
|
|
128
|
-
function serializeExpectedOutcome(expectedOutcome, joinWith = '\n') {
|
|
129
|
-
return (expectedOutcome || [])
|
|
130
|
-
.map(field => {
|
|
131
|
-
if (field.type === 'chips-input') {
|
|
132
|
-
return field.value.join(', ');
|
|
133
|
-
}
|
|
134
|
-
return field.value;
|
|
135
|
-
})
|
|
136
|
-
.join(joinWith)
|
|
137
|
-
.trim();
|
|
138
|
-
}
|
|
139
|
-
|
|
140
113
|
/**
|
|
141
114
|
* Escapes a CSV field by wrapping it in quotes if it contains special characters
|
|
142
115
|
* @param field - The field to escape
|
|
@@ -155,48 +128,63 @@ function escapeCsvField(field) {
|
|
|
155
128
|
*/
|
|
156
129
|
function exportTestResultsToCsv(testCases) {
|
|
157
130
|
const csvRows = [];
|
|
131
|
+
const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
|
|
158
132
|
// Add header row
|
|
159
133
|
const headers = [
|
|
160
134
|
'Question',
|
|
161
|
-
'Expected Keywords',
|
|
162
|
-
'Generated Keywords',
|
|
163
|
-
'Keywords Match',
|
|
164
135
|
'Response Time (s)',
|
|
165
|
-
'Evaluation Approach',
|
|
166
|
-
'Evaluation Score',
|
|
167
136
|
];
|
|
137
|
+
for (let i = 1; i <= maxFieldCount; i++) {
|
|
138
|
+
headers.push('Field Name');
|
|
139
|
+
headers.push('Expected Keywords');
|
|
140
|
+
headers.push('Generated Keywords');
|
|
141
|
+
headers.push('Evaluation Strategy');
|
|
142
|
+
headers.push('Passed Evaluation');
|
|
143
|
+
headers.push('Keyword Match');
|
|
144
|
+
headers.push('Score');
|
|
145
|
+
if (i < maxFieldCount) {
|
|
146
|
+
headers.push('');
|
|
147
|
+
}
|
|
148
|
+
}
|
|
168
149
|
csvRows.push(headers.join(','));
|
|
169
|
-
// Add data rows
|
|
150
|
+
// Add data rows (one row per test case)
|
|
170
151
|
testCases.forEach(testCase => {
|
|
171
|
-
const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
|
|
172
|
-
const evaluationApproach = testCase.evaluationParameters?.approach || '';
|
|
173
|
-
const score = testCase.evaluationResult?.evaluationApproachResult?.score;
|
|
174
|
-
const evaluationScore = score !== undefined ? score.toString() : '';
|
|
175
|
-
let generatedKeywords = '';
|
|
176
|
-
let keywordsMatch = '';
|
|
177
|
-
if (testCase.evaluationResult) {
|
|
178
|
-
const foundKeywords = testCase.evaluationResult.keywordMatches
|
|
179
|
-
.filter(match => match.found)
|
|
180
|
-
.map(match => match.keyword);
|
|
181
|
-
generatedKeywords = foundKeywords.join('; ');
|
|
182
|
-
// Calculate match percentages
|
|
183
|
-
const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
|
|
184
|
-
const totalKeywords = testCase.evaluationResult.keywordMatches.length;
|
|
185
|
-
keywordsMatch =
|
|
186
|
-
totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
|
|
187
|
-
}
|
|
188
152
|
const responseTime = testCase.responseTime
|
|
189
153
|
? (testCase.responseTime / 1000).toFixed(3)
|
|
190
154
|
: 'N/A';
|
|
191
|
-
const row = [
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
155
|
+
const row = [escapeCsvField(testCase.question), responseTime];
|
|
156
|
+
for (let i = 0; i < maxFieldCount; i++) {
|
|
157
|
+
const field = testCase.expectedOutcome?.[i];
|
|
158
|
+
const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
|
|
159
|
+
const expectedKeywords = fieldResult?.expectedValue ??
|
|
160
|
+
(field
|
|
161
|
+
? field.type === 'chips-input'
|
|
162
|
+
? field.value.join(', ')
|
|
163
|
+
: field.value
|
|
164
|
+
: '');
|
|
165
|
+
const generatedKeywords = (fieldResult?.keywordMatches || [])
|
|
166
|
+
.filter(match => match.found)
|
|
167
|
+
.map(match => match.keyword)
|
|
168
|
+
.join('; ');
|
|
169
|
+
const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
|
|
170
|
+
const totalMatches = fieldResult?.keywordMatches?.length || 0;
|
|
171
|
+
const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
|
|
172
|
+
const score = fieldResult?.evaluationApproachResult?.score !== undefined
|
|
173
|
+
? fieldResult.evaluationApproachResult.score.toFixed(2)
|
|
174
|
+
: '';
|
|
175
|
+
row.push(escapeCsvField(field?.label || ''));
|
|
176
|
+
row.push(escapeCsvField(expectedKeywords || ''));
|
|
177
|
+
row.push(escapeCsvField(generatedKeywords));
|
|
178
|
+
row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
|
|
179
|
+
field?.evaluationParameters?.approach ||
|
|
180
|
+
''));
|
|
181
|
+
row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
|
|
182
|
+
row.push(keywordMatch);
|
|
183
|
+
row.push(score);
|
|
184
|
+
if (i < maxFieldCount - 1) {
|
|
185
|
+
row.push('');
|
|
186
|
+
}
|
|
187
|
+
}
|
|
200
188
|
csvRows.push(row.join(','));
|
|
201
189
|
});
|
|
202
190
|
return csvRows.join('\n');
|
|
@@ -255,6 +243,43 @@ function v4(options, buf, offset) {
|
|
|
255
243
|
return unsafeStringify(rnds);
|
|
256
244
|
}
|
|
257
245
|
|
|
246
|
+
var EvaluationApproach;
|
|
247
|
+
(function (EvaluationApproach) {
|
|
248
|
+
EvaluationApproach["EXACT"] = "exact";
|
|
249
|
+
EvaluationApproach["SEMANTIC"] = "semantic";
|
|
250
|
+
EvaluationApproach["ROUGE_1"] = "rouge-1";
|
|
251
|
+
EvaluationApproach["ROUGE_L"] = "rouge-L";
|
|
252
|
+
EvaluationApproach["BLEU"] = "bleu";
|
|
253
|
+
})(EvaluationApproach || (EvaluationApproach = {}));
|
|
254
|
+
// Array of all evaluation approach values for UI components
|
|
255
|
+
const EvaluationApproachValues = Object.values(EvaluationApproach);
|
|
256
|
+
const DEFAULT_ROUGE_PASS_SCORE = 0.7;
|
|
257
|
+
const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
|
|
258
|
+
const DEFAULT_BLEU_PASS_SCORE = 0.7;
|
|
259
|
+
|
|
260
|
+
const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
|
|
261
|
+
function getAllowedApproachesForFieldType(fieldType) {
|
|
262
|
+
if (fieldType === 'select') {
|
|
263
|
+
return SELECT_ONLY_APPROACHES;
|
|
264
|
+
}
|
|
265
|
+
return EvaluationApproachValues;
|
|
266
|
+
}
|
|
267
|
+
function isApproachAllowedForFieldType(fieldType, approach) {
|
|
268
|
+
return getAllowedApproachesForFieldType(fieldType).includes(approach);
|
|
269
|
+
}
|
|
270
|
+
function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
|
|
271
|
+
const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
|
|
272
|
+
const fallbackApproach = allowedApproaches[0];
|
|
273
|
+
const rawApproach = evaluationParameters?.approach;
|
|
274
|
+
const approach = rawApproach && allowedApproaches.includes(rawApproach)
|
|
275
|
+
? rawApproach
|
|
276
|
+
: fallbackApproach;
|
|
277
|
+
return {
|
|
278
|
+
...evaluationParameters,
|
|
279
|
+
approach,
|
|
280
|
+
};
|
|
281
|
+
}
|
|
282
|
+
|
|
258
283
|
const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
259
284
|
{
|
|
260
285
|
type: 'textarea',
|
|
@@ -263,6 +288,12 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
|
263
288
|
rows: 2,
|
|
264
289
|
},
|
|
265
290
|
];
|
|
291
|
+
function normalizeExpectedOutcomeField(field) {
|
|
292
|
+
return {
|
|
293
|
+
...field,
|
|
294
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
295
|
+
};
|
|
296
|
+
}
|
|
266
297
|
/**
|
|
267
298
|
* Creates a new test case with default values
|
|
268
299
|
* @returns A new TestCase object with a unique ID
|
|
@@ -272,9 +303,6 @@ function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA)
|
|
|
272
303
|
id: v4(),
|
|
273
304
|
question: '',
|
|
274
305
|
expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
|
|
275
|
-
evaluationParameters: {
|
|
276
|
-
approach: EvaluationApproach.EXACT,
|
|
277
|
-
},
|
|
278
306
|
isRunning: false,
|
|
279
307
|
};
|
|
280
308
|
}
|
|
@@ -284,35 +312,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
284
312
|
return {
|
|
285
313
|
type: 'text',
|
|
286
314
|
label: schemaField.label,
|
|
287
|
-
required: schemaField.required,
|
|
288
315
|
placeholder: schemaField.placeholder,
|
|
289
316
|
value: '',
|
|
317
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
290
318
|
};
|
|
291
319
|
case 'textarea':
|
|
292
320
|
return {
|
|
293
321
|
type: 'textarea',
|
|
294
322
|
label: schemaField.label,
|
|
295
|
-
required: schemaField.required,
|
|
296
323
|
placeholder: schemaField.placeholder,
|
|
297
324
|
rows: schemaField.rows,
|
|
298
325
|
value: '',
|
|
326
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
299
327
|
};
|
|
300
328
|
case 'chips-input':
|
|
301
329
|
return {
|
|
302
330
|
type: 'chips-input',
|
|
303
331
|
label: schemaField.label,
|
|
304
|
-
required: schemaField.required,
|
|
305
332
|
placeholder: schemaField.placeholder,
|
|
306
333
|
value: [],
|
|
334
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
307
335
|
};
|
|
308
336
|
case 'select':
|
|
309
337
|
return {
|
|
310
338
|
type: 'select',
|
|
311
339
|
label: schemaField.label,
|
|
312
|
-
required: schemaField.required,
|
|
313
340
|
placeholder: schemaField.placeholder,
|
|
314
341
|
value: '',
|
|
315
342
|
options: schemaField.options,
|
|
343
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
316
344
|
};
|
|
317
345
|
default: {
|
|
318
346
|
const _exhaustiveCheck = schemaField;
|
|
@@ -323,32 +351,19 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
323
351
|
function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
|
|
324
352
|
return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
|
|
325
353
|
}
|
|
326
|
-
function migrateLegacyExpectedOutcomeString(value) {
|
|
327
|
-
return [
|
|
328
|
-
{
|
|
329
|
-
type: 'textarea',
|
|
330
|
-
label: 'Expected Outcome',
|
|
331
|
-
value,
|
|
332
|
-
},
|
|
333
|
-
];
|
|
334
|
-
}
|
|
335
354
|
/**
|
|
336
355
|
* Creates a runtime test case from validated input data.
|
|
337
|
-
* The input is expected to already satisfy `TestCaseInput
|
|
338
|
-
* and this function only performs normalization/defaulting
|
|
356
|
+
* The input is expected to already satisfy `TestCaseInput`,
|
|
357
|
+
* and this function only performs normalization/defaulting.
|
|
339
358
|
*
|
|
340
359
|
* @param data - Validated test case input
|
|
341
360
|
* @returns A normalized TestCase object with runtime defaults applied
|
|
342
361
|
*/
|
|
343
362
|
function createTestCaseFromInput(data) {
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
expectedOutcome
|
|
347
|
-
}
|
|
348
|
-
else {
|
|
349
|
-
expectedOutcome = data.expectedOutcome;
|
|
350
|
-
}
|
|
351
|
-
return { ...data, expectedOutcome };
|
|
363
|
+
return {
|
|
364
|
+
...data,
|
|
365
|
+
expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
|
|
366
|
+
};
|
|
352
367
|
}
|
|
353
368
|
|
|
354
369
|
/** A special constant with type `never` */
|
|
@@ -4938,27 +4953,43 @@ function superRefine(fn) {
|
|
|
4938
4953
|
const nonEmptyString = string().trim().min(1);
|
|
4939
4954
|
const optionalPositiveInt = number().int().positive().optional();
|
|
4940
4955
|
const optionalString = string().optional();
|
|
4941
|
-
const optionalBoolean = boolean().optional();
|
|
4942
4956
|
const selectOptionsSchema = array(nonEmptyString).min(1);
|
|
4957
|
+
const optionalNumber = number().optional();
|
|
4958
|
+
const evaluationParametersSchema = object({
|
|
4959
|
+
approach: _enum(EvaluationApproach),
|
|
4960
|
+
threshold: optionalNumber,
|
|
4961
|
+
});
|
|
4962
|
+
const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
|
|
4963
|
+
if (!isApproachAllowedForFieldType('select', parameters.approach)) {
|
|
4964
|
+
ctx.addIssue({
|
|
4965
|
+
code: 'custom',
|
|
4966
|
+
path: ['approach'],
|
|
4967
|
+
message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
|
|
4968
|
+
});
|
|
4969
|
+
}
|
|
4970
|
+
});
|
|
4943
4971
|
const defaultExpectedOutcomeBaseSchema = object({
|
|
4944
4972
|
label: nonEmptyString,
|
|
4945
|
-
required: optionalBoolean,
|
|
4946
4973
|
placeholder: optionalString,
|
|
4947
4974
|
});
|
|
4948
4975
|
const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
|
|
4949
4976
|
text: baseSchema.extend({
|
|
4950
4977
|
type: literal('text'),
|
|
4978
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4951
4979
|
}),
|
|
4952
4980
|
textarea: baseSchema.extend({
|
|
4953
4981
|
type: literal('textarea'),
|
|
4954
4982
|
rows: optionalPositiveInt,
|
|
4983
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4955
4984
|
}),
|
|
4956
4985
|
chipsInput: baseSchema.extend({
|
|
4957
4986
|
type: literal('chips-input'),
|
|
4987
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4958
4988
|
}),
|
|
4959
4989
|
select: baseSchema.extend({
|
|
4960
4990
|
type: literal('select'),
|
|
4961
4991
|
options: selectOptionsSchema,
|
|
4992
|
+
evaluationParameters: selectEvaluationParametersSchema.optional(),
|
|
4962
4993
|
}),
|
|
4963
4994
|
});
|
|
4964
4995
|
function hasDuplicateChips(values) {
|
|
@@ -5020,33 +5051,16 @@ function validateExpectedOutcomeSchema(schema) {
|
|
|
5020
5051
|
}
|
|
5021
5052
|
}
|
|
5022
5053
|
|
|
5023
|
-
const evaluationParametersSchema = object({
|
|
5024
|
-
approach: _enum(EvaluationApproach),
|
|
5025
|
-
threshold: number().optional(),
|
|
5026
|
-
});
|
|
5027
|
-
const baseTestCaseInputSchema = object({
|
|
5054
|
+
const testCaseInputSchema = object({
|
|
5028
5055
|
id: string(),
|
|
5029
5056
|
question: string(),
|
|
5030
|
-
evaluationParameters: evaluationParametersSchema.optional(),
|
|
5031
|
-
});
|
|
5032
|
-
const legacyTestCaseInputSchema = baseTestCaseInputSchema.extend({
|
|
5033
|
-
expectedOutcome: string(),
|
|
5034
|
-
});
|
|
5035
|
-
const v2TestCaseInputSchema = baseTestCaseInputSchema.extend({
|
|
5036
5057
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5037
5058
|
});
|
|
5038
|
-
const testCaseInputSchema = union([
|
|
5039
|
-
legacyTestCaseInputSchema,
|
|
5040
|
-
v2TestCaseInputSchema,
|
|
5041
|
-
]);
|
|
5042
|
-
const testCaseInputArraySchema = array(testCaseInputSchema).min(1, {
|
|
5043
|
-
message: 'The test suite is empty. Please provide at least one test case.',
|
|
5044
|
-
});
|
|
5059
|
+
const testCaseInputArraySchema = array(testCaseInputSchema);
|
|
5045
5060
|
object({
|
|
5046
5061
|
id: string(),
|
|
5047
5062
|
question: string(),
|
|
5048
5063
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5049
|
-
evaluationParameters: evaluationParametersSchema.optional(),
|
|
5050
5064
|
output: string().optional(),
|
|
5051
5065
|
isRunning: boolean().optional(),
|
|
5052
5066
|
error: string().optional(),
|
|
@@ -5097,19 +5111,69 @@ function importTestSuite(jsonContent) {
|
|
|
5097
5111
|
}
|
|
5098
5112
|
}
|
|
5099
5113
|
|
|
5114
|
+
function applyExpectedOutcomeChange(testCase, change) {
|
|
5115
|
+
const { index } = change;
|
|
5116
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5117
|
+
const target = expectedOutcome[index];
|
|
5118
|
+
if (!target) {
|
|
5119
|
+
return testCase;
|
|
5120
|
+
}
|
|
5121
|
+
switch (change.operation) {
|
|
5122
|
+
case 'set-value': {
|
|
5123
|
+
if (target.type === 'chips-input') {
|
|
5124
|
+
return testCase;
|
|
5125
|
+
}
|
|
5126
|
+
expectedOutcome[index] = {
|
|
5127
|
+
...target,
|
|
5128
|
+
value: change.value,
|
|
5129
|
+
};
|
|
5130
|
+
return { ...testCase, expectedOutcome };
|
|
5131
|
+
}
|
|
5132
|
+
case 'add-chip': {
|
|
5133
|
+
if (target.type !== 'chips-input') {
|
|
5134
|
+
return testCase;
|
|
5135
|
+
}
|
|
5136
|
+
expectedOutcome[index] = {
|
|
5137
|
+
...target,
|
|
5138
|
+
value: [...target.value, change.value],
|
|
5139
|
+
};
|
|
5140
|
+
return { ...testCase, expectedOutcome };
|
|
5141
|
+
}
|
|
5142
|
+
case 'remove-chip': {
|
|
5143
|
+
if (target.type !== 'chips-input') {
|
|
5144
|
+
return testCase;
|
|
5145
|
+
}
|
|
5146
|
+
expectedOutcome[index] = {
|
|
5147
|
+
...target,
|
|
5148
|
+
value: target.value.filter(chip => chip !== change.value),
|
|
5149
|
+
};
|
|
5150
|
+
return { ...testCase, expectedOutcome };
|
|
5151
|
+
}
|
|
5152
|
+
case 'set-evaluation-approach':
|
|
5153
|
+
return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
|
|
5154
|
+
}
|
|
5155
|
+
}
|
|
5100
5156
|
/**
|
|
5101
|
-
* Updates the evaluation approach for a
|
|
5102
|
-
*
|
|
5103
|
-
* @param approach - The new evaluation approach
|
|
5104
|
-
* @returns Updated test case with the new evaluation approach
|
|
5157
|
+
* Updates the evaluation approach for a specific expected outcome field.
|
|
5158
|
+
* Select fields always use exact matching.
|
|
5105
5159
|
*/
|
|
5106
|
-
function
|
|
5160
|
+
function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
|
|
5161
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5162
|
+
const target = expectedOutcome[fieldIndex];
|
|
5163
|
+
if (!target) {
|
|
5164
|
+
return testCase;
|
|
5165
|
+
}
|
|
5166
|
+
const currentEvaluationParameters = target.evaluationParameters;
|
|
5167
|
+
expectedOutcome[fieldIndex] = {
|
|
5168
|
+
...target,
|
|
5169
|
+
evaluationParameters: normalizeEvaluationParametersForField(target.type, {
|
|
5170
|
+
...currentEvaluationParameters,
|
|
5171
|
+
approach,
|
|
5172
|
+
}),
|
|
5173
|
+
};
|
|
5107
5174
|
return {
|
|
5108
5175
|
...testCase,
|
|
5109
|
-
|
|
5110
|
-
...testCase.evaluationParameters,
|
|
5111
|
-
approach: approach,
|
|
5112
|
-
},
|
|
5176
|
+
expectedOutcome,
|
|
5113
5177
|
};
|
|
5114
5178
|
}
|
|
5115
5179
|
|
|
@@ -29861,57 +29925,78 @@ function performBleuEvaluation(request) {
|
|
|
29861
29925
|
|
|
29862
29926
|
class LLMEvaluationEngine {
|
|
29863
29927
|
async evaluateResponse(request, callback) {
|
|
29864
|
-
|
|
29865
|
-
const
|
|
29866
|
-
switch (approach) {
|
|
29867
|
-
case EvaluationApproach.BLEU: {
|
|
29868
|
-
const bleuResult = performBleuEvaluation(request);
|
|
29869
|
-
callback(bleuResult);
|
|
29870
|
-
break;
|
|
29871
|
-
}
|
|
29872
|
-
case EvaluationApproach.EXACT: {
|
|
29873
|
-
const exactResult = await performEvaluation(request);
|
|
29874
|
-
callback(exactResult);
|
|
29875
|
-
break;
|
|
29876
|
-
}
|
|
29877
|
-
case EvaluationApproach.ROUGE_1: {
|
|
29878
|
-
const rougeResult = await performRouge1Evaluation(request);
|
|
29879
|
-
callback(rougeResult);
|
|
29880
|
-
break;
|
|
29881
|
-
}
|
|
29882
|
-
case EvaluationApproach.ROUGE_L: {
|
|
29883
|
-
const rougeLResult = await performRougeLEvaluation(request);
|
|
29884
|
-
callback(rougeLResult);
|
|
29885
|
-
break;
|
|
29886
|
-
}
|
|
29887
|
-
case EvaluationApproach.SEMANTIC: {
|
|
29888
|
-
const semanticResult = await performSemanticEvaluation(request);
|
|
29889
|
-
callback(semanticResult);
|
|
29890
|
-
break;
|
|
29891
|
-
}
|
|
29892
|
-
default: {
|
|
29893
|
-
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
29894
|
-
const fallbackResult = await performEvaluation(request);
|
|
29895
|
-
callback(fallbackResult);
|
|
29896
|
-
}
|
|
29897
|
-
}
|
|
29898
|
-
}
|
|
29899
|
-
catch (error) {
|
|
29900
|
-
console.error('Evaluation failed:', error);
|
|
29901
|
-
const errorResult = {
|
|
29928
|
+
const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
|
|
29929
|
+
const fieldRequest = {
|
|
29902
29930
|
testCaseId: request.testCaseId,
|
|
29931
|
+
question: request.question,
|
|
29932
|
+
actualResponse: request.actualResponse,
|
|
29933
|
+
expectedOutcome: field.expectedValue,
|
|
29934
|
+
evaluationParameters: field.evaluationParameters,
|
|
29935
|
+
};
|
|
29936
|
+
const result = await this.evaluateField(fieldRequest);
|
|
29937
|
+
const fieldResult = {
|
|
29938
|
+
index: field.index,
|
|
29939
|
+
label: field.label,
|
|
29940
|
+
type: field.type,
|
|
29941
|
+
expectedValue: field.expectedValue,
|
|
29942
|
+
passed: result.passed,
|
|
29943
|
+
keywordMatches: result.keywordMatches,
|
|
29944
|
+
evaluationParameters: result.evaluationParameters,
|
|
29945
|
+
evaluationApproachResult: result.evaluationApproachResult,
|
|
29946
|
+
};
|
|
29947
|
+
return fieldResult;
|
|
29948
|
+
}));
|
|
29949
|
+
const fieldResults = settledResults.map((settledResult, index) => {
|
|
29950
|
+
const field = request.fields[index];
|
|
29951
|
+
if (settledResult.status === 'fulfilled') {
|
|
29952
|
+
return settledResult.value;
|
|
29953
|
+
}
|
|
29954
|
+
return {
|
|
29955
|
+
index: field.index,
|
|
29956
|
+
label: field.label,
|
|
29957
|
+
type: field.type,
|
|
29958
|
+
expectedValue: field.expectedValue,
|
|
29903
29959
|
passed: false,
|
|
29904
29960
|
keywordMatches: [],
|
|
29905
|
-
|
|
29906
|
-
evaluationParameters: request.evaluationParameters,
|
|
29961
|
+
evaluationParameters: field.evaluationParameters,
|
|
29907
29962
|
evaluationApproachResult: {
|
|
29908
29963
|
score: 0,
|
|
29909
|
-
approachUsed:
|
|
29964
|
+
approachUsed: field.evaluationParameters.approach,
|
|
29910
29965
|
},
|
|
29966
|
+
error: this.getSafeErrorMessage(settledResult.reason),
|
|
29911
29967
|
};
|
|
29912
|
-
|
|
29968
|
+
});
|
|
29969
|
+
const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
|
|
29970
|
+
const passed = fieldResults.every(field => field.passed && !field.error);
|
|
29971
|
+
callback({
|
|
29972
|
+
testCaseId: request.testCaseId,
|
|
29973
|
+
passed,
|
|
29974
|
+
keywordMatches,
|
|
29975
|
+
fieldResults,
|
|
29976
|
+
timestamp: new Date().toISOString(),
|
|
29977
|
+
});
|
|
29978
|
+
}
|
|
29979
|
+
async evaluateField(request) {
|
|
29980
|
+
const approach = request.evaluationParameters.approach;
|
|
29981
|
+
switch (approach) {
|
|
29982
|
+
case EvaluationApproach.BLEU:
|
|
29983
|
+
return performBleuEvaluation(request);
|
|
29984
|
+
case EvaluationApproach.EXACT:
|
|
29985
|
+
return performEvaluation(request);
|
|
29986
|
+
case EvaluationApproach.ROUGE_1:
|
|
29987
|
+
return performRouge1Evaluation(request);
|
|
29988
|
+
case EvaluationApproach.ROUGE_L:
|
|
29989
|
+
return performRougeLEvaluation(request);
|
|
29990
|
+
case EvaluationApproach.SEMANTIC:
|
|
29991
|
+
return performSemanticEvaluation(request);
|
|
29992
|
+
default:
|
|
29993
|
+
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
29994
|
+
return performEvaluation(request);
|
|
29913
29995
|
}
|
|
29914
29996
|
}
|
|
29997
|
+
getSafeErrorMessage(error) {
|
|
29998
|
+
return error instanceof Error ? error.message : 'Field evaluation failed.';
|
|
29999
|
+
}
|
|
29915
30000
|
}
|
|
29916
30001
|
|
|
29917
30002
|
/**
|
|
@@ -29932,12 +30017,18 @@ class EvaluationService {
|
|
|
29932
30017
|
console.warn('⚠️ No output to evaluate for test case:', testCase.id);
|
|
29933
30018
|
return;
|
|
29934
30019
|
}
|
|
30020
|
+
const fields = (testCase.expectedOutcome || []).map((field, index) => ({
|
|
30021
|
+
index,
|
|
30022
|
+
label: field.label,
|
|
30023
|
+
type: field.type,
|
|
30024
|
+
expectedValue: getFieldExpectedValue(field),
|
|
30025
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
30026
|
+
}));
|
|
29935
30027
|
const evaluationRequest = {
|
|
29936
30028
|
testCaseId: testCase.id,
|
|
29937
30029
|
question: testCase.question,
|
|
29938
|
-
expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
|
|
29939
30030
|
actualResponse: testCase.output,
|
|
29940
|
-
|
|
30031
|
+
fields,
|
|
29941
30032
|
};
|
|
29942
30033
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
29943
30034
|
console.log('📊 Evaluation result received:', result);
|
|
@@ -29945,6 +30036,12 @@ class EvaluationService {
|
|
|
29945
30036
|
});
|
|
29946
30037
|
}
|
|
29947
30038
|
}
|
|
30039
|
+
function getFieldExpectedValue(field) {
|
|
30040
|
+
if (field.type === 'chips-input') {
|
|
30041
|
+
return field.value.join(', ');
|
|
30042
|
+
}
|
|
30043
|
+
return field.value;
|
|
30044
|
+
}
|
|
29948
30045
|
|
|
29949
30046
|
const Button = (props, children) => {
|
|
29950
30047
|
const { variant = 'primary', size = 'md', disabled = false, loading = false, onClick, type = 'button', 'class': className = '', icon, 'aria-label': ariaLabel, } = props;
|
|
@@ -29987,7 +30084,9 @@ const ResponseOutput = ({ output, isRunning, }) => {
|
|
|
29987
30084
|
};
|
|
29988
30085
|
|
|
29989
30086
|
const EvaluationSummary = ({ result, isRunning, }) => {
|
|
29990
|
-
|
|
30087
|
+
const fieldResults = result?.fieldResults || [];
|
|
30088
|
+
const hasFieldResults = fieldResults.length > 0;
|
|
30089
|
+
return (index.h("div", { class: "evaluation-summary" }, result ? (index.h("div", { class: "evaluation-summary__result" }, hasFieldResults ? (index.h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(fieldResult => (index.h("div", { class: "evaluation-summary__field-result" }, index.h("div", { class: "evaluation-summary__field-header" }, index.h("span", { class: "evaluation-summary__field-label" }, fieldResult.label), index.h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", fieldResult.evaluationParameters.approach)), index.h("div", { class: "evaluation-summary__field-details" }, index.h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'), fieldResult.error && (index.h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)), index.h("span", null, "Score: ", fieldResult.evaluationApproachResult.score.toFixed(2)), index.h("span", null, "Matches:", ' ', fieldResult.keywordMatches.filter(match => match.found).length, "/", fieldResult.keywordMatches.length))))))) : null)) : (index.h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
|
|
29991
30090
|
};
|
|
29992
30091
|
|
|
29993
30092
|
const IconButton = (props, children) => {
|
|
@@ -30023,6 +30122,24 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30023
30122
|
const emit = (detail) => onExpectedOutcomeChange({
|
|
30024
30123
|
detail,
|
|
30025
30124
|
});
|
|
30125
|
+
const buildEvaluationConfig = (index, optionList) => ({
|
|
30126
|
+
name: `expectedOutcomeEvaluation-${index}`,
|
|
30127
|
+
fieldType: FormFieldType.SELECT,
|
|
30128
|
+
label: 'Evaluation Approach',
|
|
30129
|
+
placeholder: 'Select evaluation approach…',
|
|
30130
|
+
required: true,
|
|
30131
|
+
optionList,
|
|
30132
|
+
defaultValue: EvaluationApproach.EXACT,
|
|
30133
|
+
});
|
|
30134
|
+
const renderEvaluationSelector = (field, index$1) => {
|
|
30135
|
+
const optionList = getAllowedApproachesForFieldType(field.type);
|
|
30136
|
+
return (index.h("app-select", { config: buildEvaluationConfig(index$1, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
|
|
30137
|
+
testCaseId,
|
|
30138
|
+
index: index$1,
|
|
30139
|
+
operation: 'set-evaluation-approach',
|
|
30140
|
+
value: e.detail.value,
|
|
30141
|
+
}) }));
|
|
30142
|
+
};
|
|
30026
30143
|
return (index.h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index$1) => {
|
|
30027
30144
|
if (field.type === 'textarea') {
|
|
30028
30145
|
const config = {
|
|
@@ -30030,15 +30147,15 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30030
30147
|
fieldType: FormFieldType.TEXT_AREA,
|
|
30031
30148
|
label: field.label,
|
|
30032
30149
|
placeholder: field.placeholder,
|
|
30033
|
-
required:
|
|
30150
|
+
required: true,
|
|
30034
30151
|
rows: field.rows || 2,
|
|
30035
30152
|
};
|
|
30036
|
-
return (index.h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30153
|
+
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30037
30154
|
testCaseId,
|
|
30038
30155
|
index: index$1,
|
|
30039
30156
|
operation: 'set-value',
|
|
30040
30157
|
value: e.detail.value,
|
|
30041
|
-
}) }));
|
|
30158
|
+
}) }), renderEvaluationSelector(field, index$1)));
|
|
30042
30159
|
}
|
|
30043
30160
|
if (field.type === 'chips-input') {
|
|
30044
30161
|
const config = {
|
|
@@ -30046,9 +30163,9 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30046
30163
|
fieldType: FormFieldType.CHIPS,
|
|
30047
30164
|
label: field.label,
|
|
30048
30165
|
placeholder: field.placeholder,
|
|
30049
|
-
required:
|
|
30166
|
+
required: true,
|
|
30050
30167
|
};
|
|
30051
|
-
return (index.h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
30168
|
+
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
30052
30169
|
testCaseId,
|
|
30053
30170
|
index: index$1,
|
|
30054
30171
|
operation: 'add-chip',
|
|
@@ -30058,7 +30175,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30058
30175
|
index: index$1,
|
|
30059
30176
|
operation: 'remove-chip',
|
|
30060
30177
|
value: e.detail.value,
|
|
30061
|
-
}) }));
|
|
30178
|
+
}) }), renderEvaluationSelector(field, index$1)));
|
|
30062
30179
|
}
|
|
30063
30180
|
if (field.type === 'select') {
|
|
30064
30181
|
const config = {
|
|
@@ -30066,26 +30183,26 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30066
30183
|
fieldType: FormFieldType.SELECT,
|
|
30067
30184
|
label: field.label,
|
|
30068
30185
|
placeholder: field.placeholder,
|
|
30069
|
-
required:
|
|
30186
|
+
required: true,
|
|
30070
30187
|
optionList: field.options,
|
|
30071
30188
|
};
|
|
30072
|
-
return (index.h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30189
|
+
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30073
30190
|
testCaseId,
|
|
30074
30191
|
index: index$1,
|
|
30075
30192
|
operation: 'set-value',
|
|
30076
30193
|
value: e.detail.value,
|
|
30077
|
-
}) }));
|
|
30194
|
+
}) }), renderEvaluationSelector(field, index$1)));
|
|
30078
30195
|
}
|
|
30079
|
-
return (index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30196
|
+
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30080
30197
|
testCaseId,
|
|
30081
30198
|
index: index$1,
|
|
30082
30199
|
operation: 'set-value',
|
|
30083
30200
|
value: e.target.value,
|
|
30084
|
-
}) })));
|
|
30201
|
+
}) })), renderEvaluationSelector(field, index$1)));
|
|
30085
30202
|
})));
|
|
30086
30203
|
};
|
|
30087
30204
|
|
|
30088
|
-
const LLMTestCaseRow = ({ testCase, onRun, onDelete,
|
|
30205
|
+
const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
30089
30206
|
const questionConfig = {
|
|
30090
30207
|
name: 'question',
|
|
30091
30208
|
fieldType: FormFieldType.TEXT_AREA,
|
|
@@ -30095,26 +30212,17 @@ const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTes
|
|
|
30095
30212
|
required: true,
|
|
30096
30213
|
rows: 3,
|
|
30097
30214
|
};
|
|
30098
|
-
const evaluationConfig = {
|
|
30099
|
-
name: 'EvaluationApproach',
|
|
30100
|
-
fieldType: FormFieldType.SELECT,
|
|
30101
|
-
label: 'Evaluation',
|
|
30102
|
-
placeholder: 'Select evaluation approach…',
|
|
30103
|
-
required: true,
|
|
30104
|
-
optionList: EvaluationApproachValues,
|
|
30105
|
-
defaultValue: EvaluationApproach.EXACT,
|
|
30106
|
-
};
|
|
30107
30215
|
return (index.h("div", { class: "test-case-row", key: testCase.id }, index.h("div", { class: "test-case-row__input-column" }, index.h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
|
|
30108
30216
|
detail: {
|
|
30109
30217
|
testCaseId: testCase.id,
|
|
30110
30218
|
key: 'question',
|
|
30111
30219
|
value: e.detail.value,
|
|
30112
30220
|
},
|
|
30113
|
-
}) }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })
|
|
30221
|
+
}) }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), index.h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), index.h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), index.h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
30114
30222
|
};
|
|
30115
30223
|
|
|
30116
|
-
const LLMTestCases = ({ testCases, onRun, onDelete,
|
|
30117
|
-
return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete,
|
|
30224
|
+
const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
30225
|
+
return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), index.h("div", { class: "test-cases__add-section" }, index.h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
30118
30226
|
};
|
|
30119
30227
|
|
|
30120
30228
|
const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 
0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
|
|
@@ -30125,11 +30233,11 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
|
|
|
30125
30233
|
|
|
30126
30234
|
const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
|
|
30127
30235
|
|
|
30128
|
-
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30236
|
+
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30129
30237
|
|
|
30130
30238
|
const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
|
|
30131
30239
|
|
|
30132
|
-
const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-
|
|
30240
|
+
const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__field-results{display:flex;flex-direction:column;gap:var(--spacing-2);margin-top:var(--spacing-2)}.evaluation-summary__field-result{border:var(--border-width) solid var(--border);border-radius:var(--radius-md);padding:var(--spacing-2);display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-header{display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-label{font-weight:var(--font-weight-semibold);font-size:var(--font-size-xs)}.evaluation-summary__field-approach{color:var(--muted-foreground);font-size:11px}.evaluation-summary__field-details{display:flex;flex-direction:column;gap:var(--spacing-1);font-size:var(--font-size-xs)}.evaluation-summary__field-status{width:fit-content;padding:2px var(--spacing-2);border-radius:var(--radius-sm);font-size:11px;font-weight:var(--font-weight-semibold);border:var(--border-width) solid transparent}.evaluation-summary__field-status--passed{background:var(--success);color:var(--success-foreground);border-color:var(--success)}.evaluation-summary__field-status--failed{background:var(--destructive);color:var(--destructive-foreground);border-color:var(--destructive)}.evaluation-summary__error-message{color:var(--destructive);font-size:var(--font-size-xs)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
|
|
30133
30241
|
|
|
30134
30242
|
const responseOutputCss = () => `.response-output{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.response-output__content{background:var(--muted);border:var(--border-width) solid var(--border);border-radius:var(--radius);padding:var(--spacing-4);font-size:var(--font-size-sm);line-height:var(--line-height-relaxed);color:var(--foreground);white-space:pre-wrap;word-wrap:break-word;flex:1;overflow-y:auto;max-height:250px;overflow-x:scroll}.response-output__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}@media (max-width: 1200px){.response-output{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.response-output{padding:var(--spacing-4)}}`;
|
|
30135
30243
|
|
|
@@ -30162,9 +30270,6 @@ const LLMTestRunner = class {
|
|
|
30162
30270
|
value: '',
|
|
30163
30271
|
},
|
|
30164
30272
|
],
|
|
30165
|
-
evaluationParameters: {
|
|
30166
|
-
approach: EvaluationApproach.EXACT,
|
|
30167
|
-
},
|
|
30168
30273
|
isRunning: false,
|
|
30169
30274
|
},
|
|
30170
30275
|
];
|
|
@@ -30271,52 +30376,13 @@ const LLMTestRunner = class {
|
|
|
30271
30376
|
deleteTestCase(id) {
|
|
30272
30377
|
this.testCases = this.testCases.filter(tc => tc.id !== id);
|
|
30273
30378
|
}
|
|
30274
|
-
updateApproach(testCase, approach) {
|
|
30275
|
-
if (testCase) {
|
|
30276
|
-
const updated = updateApproach(testCase, approach);
|
|
30277
|
-
this.updateTestCase(testCase.id, {
|
|
30278
|
-
evaluationParameters: updated.evaluationParameters,
|
|
30279
|
-
});
|
|
30280
|
-
}
|
|
30281
|
-
}
|
|
30282
30379
|
handleExpectedOutcomeChange = (event) => {
|
|
30283
|
-
const { testCaseId,
|
|
30380
|
+
const { testCaseId, ...change } = event.detail;
|
|
30284
30381
|
this.testCases = this.testCases.map(tc => {
|
|
30285
|
-
if (tc.id !== testCaseId)
|
|
30286
|
-
return tc;
|
|
30287
|
-
const expectedOutcome = [...(tc.expectedOutcome || [])];
|
|
30288
|
-
const target = expectedOutcome[index];
|
|
30289
|
-
if (!target)
|
|
30382
|
+
if (tc.id !== testCaseId) {
|
|
30290
30383
|
return tc;
|
|
30291
|
-
if (operation === 'set-value') {
|
|
30292
|
-
if (target.type === 'chips-input') {
|
|
30293
|
-
return tc;
|
|
30294
|
-
}
|
|
30295
|
-
expectedOutcome[index] = { ...target, value: value || '' };
|
|
30296
|
-
return { ...tc, expectedOutcome };
|
|
30297
|
-
}
|
|
30298
|
-
if (operation === 'add-chip') {
|
|
30299
|
-
if (target.type !== 'chips-input' || !value) {
|
|
30300
|
-
return tc;
|
|
30301
|
-
}
|
|
30302
|
-
expectedOutcome[index] = {
|
|
30303
|
-
...target,
|
|
30304
|
-
value: [...target.value, value],
|
|
30305
|
-
};
|
|
30306
|
-
return { ...tc, expectedOutcome };
|
|
30307
|
-
}
|
|
30308
|
-
if (operation === 'remove-chip') {
|
|
30309
|
-
if (target.type !== 'chips-input' ||
|
|
30310
|
-
!value) {
|
|
30311
|
-
return tc;
|
|
30312
|
-
}
|
|
30313
|
-
expectedOutcome[index] = {
|
|
30314
|
-
...target,
|
|
30315
|
-
value: target.value.filter(chip => chip !== value),
|
|
30316
|
-
};
|
|
30317
|
-
return { ...tc, expectedOutcome };
|
|
30318
30384
|
}
|
|
30319
|
-
return tc;
|
|
30385
|
+
return applyExpectedOutcomeChange(tc, change);
|
|
30320
30386
|
});
|
|
30321
30387
|
};
|
|
30322
30388
|
async evaluateResponse(testCase) {
|
|
@@ -30416,7 +30482,7 @@ const LLMTestRunner = class {
|
|
|
30416
30482
|
}
|
|
30417
30483
|
}
|
|
30418
30484
|
render() {
|
|
30419
|
-
return (index.h("div", { key: '
|
|
30485
|
+
return (index.h("div", { key: 'e3d007b453f770fcb59c29f8ee83bd8a35e82a34', class: "test-runner-container" }, index.h(LLMTestRunnerHeader, { key: 'b7c44bf4807fe8d9e5de514818420d67d2e0dbfb', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), index.h(ErrorMessage, { key: '697237ec0f8d2e704609fd0b240629f22c2a3ef6', message: this.error, onClear: () => (this.error = '') }), index.h("div", { key: '64a623f897dfb96d922ddc0cbdfcf529c52bef76', class: "test-runner-container__content" }, index.h(LLMTestCases, { key: '017da41567c5c13933d9cf31d1a972743bd9b100', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
|
|
30420
30486
|
}
|
|
30421
30487
|
};
|
|
30422
30488
|
LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));
|