llm-testrunner-components 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +165 -242
- package/dist/cjs/index.cjs.js +305 -237
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-testrunner.cjs.js +1 -1
- package/dist/cjs/loader.cjs.js +1 -1
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +2 -2
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +27 -49
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +4 -3
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -1
- package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js +0 -4
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-results-csv.js +47 -33
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +20 -2
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/test-case.js +2 -20
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/collection/types/test-case.js.map +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-JPMPoOC8.js +7 -0
- package/dist/components/p-JPMPoOC8.js.map +1 -0
- package/dist/esm/index.js +305 -237
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-testrunner.js +1 -1
- package/dist/esm/loader.js +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +1 -0
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +1 -1
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
- package/dist/types/components.d.ts +9 -0
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
- package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
- package/dist/types/lib/evaluation/index.d.ts +0 -1
- package/dist/types/lib/evaluation/types.d.ts +26 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
- package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
- package/dist/types/schemas/expected-outcome.d.ts +65 -17
- package/dist/types/schemas/test-case.d.ts +51 -95
- package/dist/types/types/llm-test-runner.d.ts +1 -1
- package/dist/types/types/test-case.d.ts +1 -1
- package/package.json +9 -2
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
- package/dist/components/p-BF90yb1z.js +0 -7
- package/dist/components/p-BF90yb1z.js.map +0 -1
- /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
package/dist/esm/index.js
CHANGED
|
@@ -61,20 +61,6 @@ class RateLimitedFetcher {
|
|
|
61
61
|
}
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
-
var EvaluationApproach;
|
|
65
|
-
(function (EvaluationApproach) {
|
|
66
|
-
EvaluationApproach["EXACT"] = "exact";
|
|
67
|
-
EvaluationApproach["SEMANTIC"] = "semantic";
|
|
68
|
-
EvaluationApproach["ROUGE_1"] = "rouge-1";
|
|
69
|
-
EvaluationApproach["ROUGE_L"] = "rouge-L";
|
|
70
|
-
EvaluationApproach["BLEU"] = "bleu";
|
|
71
|
-
})(EvaluationApproach || (EvaluationApproach = {}));
|
|
72
|
-
// Array of all evaluation approach values for UI components
|
|
73
|
-
const EvaluationApproachValues = Object.values(EvaluationApproach);
|
|
74
|
-
const DEFAULT_ROUGE_PASS_SCORE = 0.7;
|
|
75
|
-
const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
|
|
76
|
-
const DEFAULT_BLEU_PASS_SCORE = 0.7;
|
|
77
|
-
|
|
78
64
|
/**
|
|
79
65
|
* Reads a file asynchronously and returns its content as a string
|
|
80
66
|
* @param file - The File object to read
|
|
@@ -117,23 +103,10 @@ function formatTestSuiteAsJson(testCases) {
|
|
|
117
103
|
id: testCase.id,
|
|
118
104
|
question: testCase.question,
|
|
119
105
|
expectedOutcome: testCase.expectedOutcome,
|
|
120
|
-
evaluationParameters: testCase.evaluationParameters,
|
|
121
106
|
}));
|
|
122
107
|
return JSON.stringify(exportData, null, 2);
|
|
123
108
|
}
|
|
124
109
|
|
|
125
|
-
function serializeExpectedOutcome(expectedOutcome, joinWith = '\n') {
|
|
126
|
-
return (expectedOutcome || [])
|
|
127
|
-
.map(field => {
|
|
128
|
-
if (field.type === 'chips-input') {
|
|
129
|
-
return field.value.join(', ');
|
|
130
|
-
}
|
|
131
|
-
return field.value;
|
|
132
|
-
})
|
|
133
|
-
.join(joinWith)
|
|
134
|
-
.trim();
|
|
135
|
-
}
|
|
136
|
-
|
|
137
110
|
/**
|
|
138
111
|
* Escapes a CSV field by wrapping it in quotes if it contains special characters
|
|
139
112
|
* @param field - The field to escape
|
|
@@ -152,48 +125,63 @@ function escapeCsvField(field) {
|
|
|
152
125
|
*/
|
|
153
126
|
function exportTestResultsToCsv(testCases) {
|
|
154
127
|
const csvRows = [];
|
|
128
|
+
const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
|
|
155
129
|
// Add header row
|
|
156
130
|
const headers = [
|
|
157
131
|
'Question',
|
|
158
|
-
'Expected Keywords',
|
|
159
|
-
'Generated Keywords',
|
|
160
|
-
'Keywords Match',
|
|
161
132
|
'Response Time (s)',
|
|
162
|
-
'Evaluation Approach',
|
|
163
|
-
'Evaluation Score',
|
|
164
133
|
];
|
|
134
|
+
for (let i = 1; i <= maxFieldCount; i++) {
|
|
135
|
+
headers.push('Field Name');
|
|
136
|
+
headers.push('Expected Keywords');
|
|
137
|
+
headers.push('Generated Keywords');
|
|
138
|
+
headers.push('Evaluation Strategy');
|
|
139
|
+
headers.push('Passed Evaluation');
|
|
140
|
+
headers.push('Keyword Match');
|
|
141
|
+
headers.push('Score');
|
|
142
|
+
if (i < maxFieldCount) {
|
|
143
|
+
headers.push('');
|
|
144
|
+
}
|
|
145
|
+
}
|
|
165
146
|
csvRows.push(headers.join(','));
|
|
166
|
-
// Add data rows
|
|
147
|
+
// Add data rows (one row per test case)
|
|
167
148
|
testCases.forEach(testCase => {
|
|
168
|
-
const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
|
|
169
|
-
const evaluationApproach = testCase.evaluationParameters?.approach || '';
|
|
170
|
-
const score = testCase.evaluationResult?.evaluationApproachResult?.score;
|
|
171
|
-
const evaluationScore = score !== undefined ? score.toString() : '';
|
|
172
|
-
let generatedKeywords = '';
|
|
173
|
-
let keywordsMatch = '';
|
|
174
|
-
if (testCase.evaluationResult) {
|
|
175
|
-
const foundKeywords = testCase.evaluationResult.keywordMatches
|
|
176
|
-
.filter(match => match.found)
|
|
177
|
-
.map(match => match.keyword);
|
|
178
|
-
generatedKeywords = foundKeywords.join('; ');
|
|
179
|
-
// Calculate match percentages
|
|
180
|
-
const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
|
|
181
|
-
const totalKeywords = testCase.evaluationResult.keywordMatches.length;
|
|
182
|
-
keywordsMatch =
|
|
183
|
-
totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
|
|
184
|
-
}
|
|
185
149
|
const responseTime = testCase.responseTime
|
|
186
150
|
? (testCase.responseTime / 1000).toFixed(3)
|
|
187
151
|
: 'N/A';
|
|
188
|
-
const row = [
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
152
|
+
const row = [escapeCsvField(testCase.question), responseTime];
|
|
153
|
+
for (let i = 0; i < maxFieldCount; i++) {
|
|
154
|
+
const field = testCase.expectedOutcome?.[i];
|
|
155
|
+
const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
|
|
156
|
+
const expectedKeywords = fieldResult?.expectedValue ??
|
|
157
|
+
(field
|
|
158
|
+
? field.type === 'chips-input'
|
|
159
|
+
? field.value.join(', ')
|
|
160
|
+
: field.value
|
|
161
|
+
: '');
|
|
162
|
+
const generatedKeywords = (fieldResult?.keywordMatches || [])
|
|
163
|
+
.filter(match => match.found)
|
|
164
|
+
.map(match => match.keyword)
|
|
165
|
+
.join('; ');
|
|
166
|
+
const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
|
|
167
|
+
const totalMatches = fieldResult?.keywordMatches?.length || 0;
|
|
168
|
+
const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
|
|
169
|
+
const score = fieldResult?.evaluationApproachResult?.score !== undefined
|
|
170
|
+
? fieldResult.evaluationApproachResult.score.toFixed(2)
|
|
171
|
+
: '';
|
|
172
|
+
row.push(escapeCsvField(field?.label || ''));
|
|
173
|
+
row.push(escapeCsvField(expectedKeywords || ''));
|
|
174
|
+
row.push(escapeCsvField(generatedKeywords));
|
|
175
|
+
row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
|
|
176
|
+
field?.evaluationParameters?.approach ||
|
|
177
|
+
''));
|
|
178
|
+
row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
|
|
179
|
+
row.push(keywordMatch);
|
|
180
|
+
row.push(score);
|
|
181
|
+
if (i < maxFieldCount - 1) {
|
|
182
|
+
row.push('');
|
|
183
|
+
}
|
|
184
|
+
}
|
|
197
185
|
csvRows.push(row.join(','));
|
|
198
186
|
});
|
|
199
187
|
return csvRows.join('\n');
|
|
@@ -252,6 +240,43 @@ function v4(options, buf, offset) {
|
|
|
252
240
|
return unsafeStringify(rnds);
|
|
253
241
|
}
|
|
254
242
|
|
|
243
|
+
var EvaluationApproach;
|
|
244
|
+
(function (EvaluationApproach) {
|
|
245
|
+
EvaluationApproach["EXACT"] = "exact";
|
|
246
|
+
EvaluationApproach["SEMANTIC"] = "semantic";
|
|
247
|
+
EvaluationApproach["ROUGE_1"] = "rouge-1";
|
|
248
|
+
EvaluationApproach["ROUGE_L"] = "rouge-L";
|
|
249
|
+
EvaluationApproach["BLEU"] = "bleu";
|
|
250
|
+
})(EvaluationApproach || (EvaluationApproach = {}));
|
|
251
|
+
// Array of all evaluation approach values for UI components
|
|
252
|
+
const EvaluationApproachValues = Object.values(EvaluationApproach);
|
|
253
|
+
const DEFAULT_ROUGE_PASS_SCORE = 0.7;
|
|
254
|
+
const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
|
|
255
|
+
const DEFAULT_BLEU_PASS_SCORE = 0.7;
|
|
256
|
+
|
|
257
|
+
const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
|
|
258
|
+
function getAllowedApproachesForFieldType(fieldType) {
|
|
259
|
+
if (fieldType === 'select') {
|
|
260
|
+
return SELECT_ONLY_APPROACHES;
|
|
261
|
+
}
|
|
262
|
+
return EvaluationApproachValues;
|
|
263
|
+
}
|
|
264
|
+
function isApproachAllowedForFieldType(fieldType, approach) {
|
|
265
|
+
return getAllowedApproachesForFieldType(fieldType).includes(approach);
|
|
266
|
+
}
|
|
267
|
+
function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
|
|
268
|
+
const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
|
|
269
|
+
const fallbackApproach = allowedApproaches[0];
|
|
270
|
+
const rawApproach = evaluationParameters?.approach;
|
|
271
|
+
const approach = rawApproach && allowedApproaches.includes(rawApproach)
|
|
272
|
+
? rawApproach
|
|
273
|
+
: fallbackApproach;
|
|
274
|
+
return {
|
|
275
|
+
...evaluationParameters,
|
|
276
|
+
approach,
|
|
277
|
+
};
|
|
278
|
+
}
|
|
279
|
+
|
|
255
280
|
const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
256
281
|
{
|
|
257
282
|
type: 'textarea',
|
|
@@ -260,6 +285,12 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
|
260
285
|
rows: 2,
|
|
261
286
|
},
|
|
262
287
|
];
|
|
288
|
+
function normalizeExpectedOutcomeField(field) {
|
|
289
|
+
return {
|
|
290
|
+
...field,
|
|
291
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
292
|
+
};
|
|
293
|
+
}
|
|
263
294
|
/**
|
|
264
295
|
* Creates a new test case with default values
|
|
265
296
|
* @returns A new TestCase object with a unique ID
|
|
@@ -269,9 +300,6 @@ function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA)
|
|
|
269
300
|
id: v4(),
|
|
270
301
|
question: '',
|
|
271
302
|
expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
|
|
272
|
-
evaluationParameters: {
|
|
273
|
-
approach: EvaluationApproach.EXACT,
|
|
274
|
-
},
|
|
275
303
|
isRunning: false,
|
|
276
304
|
};
|
|
277
305
|
}
|
|
@@ -281,35 +309,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
281
309
|
return {
|
|
282
310
|
type: 'text',
|
|
283
311
|
label: schemaField.label,
|
|
284
|
-
required: schemaField.required,
|
|
285
312
|
placeholder: schemaField.placeholder,
|
|
286
313
|
value: '',
|
|
314
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
287
315
|
};
|
|
288
316
|
case 'textarea':
|
|
289
317
|
return {
|
|
290
318
|
type: 'textarea',
|
|
291
319
|
label: schemaField.label,
|
|
292
|
-
required: schemaField.required,
|
|
293
320
|
placeholder: schemaField.placeholder,
|
|
294
321
|
rows: schemaField.rows,
|
|
295
322
|
value: '',
|
|
323
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
296
324
|
};
|
|
297
325
|
case 'chips-input':
|
|
298
326
|
return {
|
|
299
327
|
type: 'chips-input',
|
|
300
328
|
label: schemaField.label,
|
|
301
|
-
required: schemaField.required,
|
|
302
329
|
placeholder: schemaField.placeholder,
|
|
303
330
|
value: [],
|
|
331
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
304
332
|
};
|
|
305
333
|
case 'select':
|
|
306
334
|
return {
|
|
307
335
|
type: 'select',
|
|
308
336
|
label: schemaField.label,
|
|
309
|
-
required: schemaField.required,
|
|
310
337
|
placeholder: schemaField.placeholder,
|
|
311
338
|
value: '',
|
|
312
339
|
options: schemaField.options,
|
|
340
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
313
341
|
};
|
|
314
342
|
default: {
|
|
315
343
|
const _exhaustiveCheck = schemaField;
|
|
@@ -320,32 +348,19 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
320
348
|
function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
|
|
321
349
|
return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
|
|
322
350
|
}
|
|
323
|
-
function migrateLegacyExpectedOutcomeString(value) {
|
|
324
|
-
return [
|
|
325
|
-
{
|
|
326
|
-
type: 'textarea',
|
|
327
|
-
label: 'Expected Outcome',
|
|
328
|
-
value,
|
|
329
|
-
},
|
|
330
|
-
];
|
|
331
|
-
}
|
|
332
351
|
/**
|
|
333
352
|
* Creates a runtime test case from validated input data.
|
|
334
|
-
* The input is expected to already satisfy `TestCaseInput
|
|
335
|
-
* and this function only performs normalization/defaulting
|
|
353
|
+
* The input is expected to already satisfy `TestCaseInput`,
|
|
354
|
+
* and this function only performs normalization/defaulting.
|
|
336
355
|
*
|
|
337
356
|
* @param data - Validated test case input
|
|
338
357
|
* @returns A normalized TestCase object with runtime defaults applied
|
|
339
358
|
*/
|
|
340
359
|
function createTestCaseFromInput(data) {
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
expectedOutcome
|
|
344
|
-
}
|
|
345
|
-
else {
|
|
346
|
-
expectedOutcome = data.expectedOutcome;
|
|
347
|
-
}
|
|
348
|
-
return { ...data, expectedOutcome };
|
|
360
|
+
return {
|
|
361
|
+
...data,
|
|
362
|
+
expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
|
|
363
|
+
};
|
|
349
364
|
}
|
|
350
365
|
|
|
351
366
|
/** A special constant with type `never` */
|
|
@@ -4935,27 +4950,43 @@ function superRefine(fn) {
|
|
|
4935
4950
|
const nonEmptyString = string().trim().min(1);
|
|
4936
4951
|
const optionalPositiveInt = number().int().positive().optional();
|
|
4937
4952
|
const optionalString = string().optional();
|
|
4938
|
-
const optionalBoolean = boolean().optional();
|
|
4939
4953
|
const selectOptionsSchema = array(nonEmptyString).min(1);
|
|
4954
|
+
const optionalNumber = number().optional();
|
|
4955
|
+
const evaluationParametersSchema = object({
|
|
4956
|
+
approach: _enum(EvaluationApproach),
|
|
4957
|
+
threshold: optionalNumber,
|
|
4958
|
+
});
|
|
4959
|
+
const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
|
|
4960
|
+
if (!isApproachAllowedForFieldType('select', parameters.approach)) {
|
|
4961
|
+
ctx.addIssue({
|
|
4962
|
+
code: 'custom',
|
|
4963
|
+
path: ['approach'],
|
|
4964
|
+
message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
|
|
4965
|
+
});
|
|
4966
|
+
}
|
|
4967
|
+
});
|
|
4940
4968
|
const defaultExpectedOutcomeBaseSchema = object({
|
|
4941
4969
|
label: nonEmptyString,
|
|
4942
|
-
required: optionalBoolean,
|
|
4943
4970
|
placeholder: optionalString,
|
|
4944
4971
|
});
|
|
4945
4972
|
const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
|
|
4946
4973
|
text: baseSchema.extend({
|
|
4947
4974
|
type: literal('text'),
|
|
4975
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4948
4976
|
}),
|
|
4949
4977
|
textarea: baseSchema.extend({
|
|
4950
4978
|
type: literal('textarea'),
|
|
4951
4979
|
rows: optionalPositiveInt,
|
|
4980
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4952
4981
|
}),
|
|
4953
4982
|
chipsInput: baseSchema.extend({
|
|
4954
4983
|
type: literal('chips-input'),
|
|
4984
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4955
4985
|
}),
|
|
4956
4986
|
select: baseSchema.extend({
|
|
4957
4987
|
type: literal('select'),
|
|
4958
4988
|
options: selectOptionsSchema,
|
|
4989
|
+
evaluationParameters: selectEvaluationParametersSchema.optional(),
|
|
4959
4990
|
}),
|
|
4960
4991
|
});
|
|
4961
4992
|
function hasDuplicateChips(values) {
|
|
@@ -5017,33 +5048,16 @@ function validateExpectedOutcomeSchema(schema) {
|
|
|
5017
5048
|
}
|
|
5018
5049
|
}
|
|
5019
5050
|
|
|
5020
|
-
const evaluationParametersSchema = object({
|
|
5021
|
-
approach: _enum(EvaluationApproach),
|
|
5022
|
-
threshold: number().optional(),
|
|
5023
|
-
});
|
|
5024
|
-
const baseTestCaseInputSchema = object({
|
|
5051
|
+
const testCaseInputSchema = object({
|
|
5025
5052
|
id: string(),
|
|
5026
5053
|
question: string(),
|
|
5027
|
-
evaluationParameters: evaluationParametersSchema.optional(),
|
|
5028
|
-
});
|
|
5029
|
-
const legacyTestCaseInputSchema = baseTestCaseInputSchema.extend({
|
|
5030
|
-
expectedOutcome: string(),
|
|
5031
|
-
});
|
|
5032
|
-
const v2TestCaseInputSchema = baseTestCaseInputSchema.extend({
|
|
5033
5054
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5034
5055
|
});
|
|
5035
|
-
const testCaseInputSchema = union([
|
|
5036
|
-
legacyTestCaseInputSchema,
|
|
5037
|
-
v2TestCaseInputSchema,
|
|
5038
|
-
]);
|
|
5039
|
-
const testCaseInputArraySchema = array(testCaseInputSchema).min(1, {
|
|
5040
|
-
message: 'The test suite is empty. Please provide at least one test case.',
|
|
5041
|
-
});
|
|
5056
|
+
const testCaseInputArraySchema = array(testCaseInputSchema);
|
|
5042
5057
|
object({
|
|
5043
5058
|
id: string(),
|
|
5044
5059
|
question: string(),
|
|
5045
5060
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5046
|
-
evaluationParameters: evaluationParametersSchema.optional(),
|
|
5047
5061
|
output: string().optional(),
|
|
5048
5062
|
isRunning: boolean().optional(),
|
|
5049
5063
|
error: string().optional(),
|
|
@@ -5094,19 +5108,69 @@ function importTestSuite(jsonContent) {
|
|
|
5094
5108
|
}
|
|
5095
5109
|
}
|
|
5096
5110
|
|
|
5111
|
+
function applyExpectedOutcomeChange(testCase, change) {
|
|
5112
|
+
const { index } = change;
|
|
5113
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5114
|
+
const target = expectedOutcome[index];
|
|
5115
|
+
if (!target) {
|
|
5116
|
+
return testCase;
|
|
5117
|
+
}
|
|
5118
|
+
switch (change.operation) {
|
|
5119
|
+
case 'set-value': {
|
|
5120
|
+
if (target.type === 'chips-input') {
|
|
5121
|
+
return testCase;
|
|
5122
|
+
}
|
|
5123
|
+
expectedOutcome[index] = {
|
|
5124
|
+
...target,
|
|
5125
|
+
value: change.value,
|
|
5126
|
+
};
|
|
5127
|
+
return { ...testCase, expectedOutcome };
|
|
5128
|
+
}
|
|
5129
|
+
case 'add-chip': {
|
|
5130
|
+
if (target.type !== 'chips-input') {
|
|
5131
|
+
return testCase;
|
|
5132
|
+
}
|
|
5133
|
+
expectedOutcome[index] = {
|
|
5134
|
+
...target,
|
|
5135
|
+
value: [...target.value, change.value],
|
|
5136
|
+
};
|
|
5137
|
+
return { ...testCase, expectedOutcome };
|
|
5138
|
+
}
|
|
5139
|
+
case 'remove-chip': {
|
|
5140
|
+
if (target.type !== 'chips-input') {
|
|
5141
|
+
return testCase;
|
|
5142
|
+
}
|
|
5143
|
+
expectedOutcome[index] = {
|
|
5144
|
+
...target,
|
|
5145
|
+
value: target.value.filter(chip => chip !== change.value),
|
|
5146
|
+
};
|
|
5147
|
+
return { ...testCase, expectedOutcome };
|
|
5148
|
+
}
|
|
5149
|
+
case 'set-evaluation-approach':
|
|
5150
|
+
return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
|
|
5151
|
+
}
|
|
5152
|
+
}
|
|
5097
5153
|
/**
|
|
5098
|
-
* Updates the evaluation approach for a
|
|
5099
|
-
*
|
|
5100
|
-
* @param approach - The new evaluation approach
|
|
5101
|
-
* @returns Updated test case with the new evaluation approach
|
|
5154
|
+
* Updates the evaluation approach for a specific expected outcome field.
|
|
5155
|
+
* Select fields always use exact matching.
|
|
5102
5156
|
*/
|
|
5103
|
-
function
|
|
5157
|
+
function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
|
|
5158
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5159
|
+
const target = expectedOutcome[fieldIndex];
|
|
5160
|
+
if (!target) {
|
|
5161
|
+
return testCase;
|
|
5162
|
+
}
|
|
5163
|
+
const currentEvaluationParameters = target.evaluationParameters;
|
|
5164
|
+
expectedOutcome[fieldIndex] = {
|
|
5165
|
+
...target,
|
|
5166
|
+
evaluationParameters: normalizeEvaluationParametersForField(target.type, {
|
|
5167
|
+
...currentEvaluationParameters,
|
|
5168
|
+
approach,
|
|
5169
|
+
}),
|
|
5170
|
+
};
|
|
5104
5171
|
return {
|
|
5105
5172
|
...testCase,
|
|
5106
|
-
|
|
5107
|
-
...testCase.evaluationParameters,
|
|
5108
|
-
approach: approach,
|
|
5109
|
-
},
|
|
5173
|
+
expectedOutcome,
|
|
5110
5174
|
};
|
|
5111
5175
|
}
|
|
5112
5176
|
|
|
@@ -29552,6 +29616,7 @@ class SemanticEvaluator {
|
|
|
29552
29616
|
}
|
|
29553
29617
|
}
|
|
29554
29618
|
async performEvaluation(request) {
|
|
29619
|
+
const threshold = request.evaluationParameters?.threshold ?? DEFAULT_SEMANTIC_PASS_SCORE;
|
|
29555
29620
|
try {
|
|
29556
29621
|
await this.initialize();
|
|
29557
29622
|
// Split expectedOutcome by newlines to create keywords array
|
|
@@ -29561,7 +29626,7 @@ class SemanticEvaluator {
|
|
|
29561
29626
|
.map(k => k.trim())
|
|
29562
29627
|
.filter(k => k.length > 0)
|
|
29563
29628
|
: [];
|
|
29564
|
-
const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords,
|
|
29629
|
+
const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, threshold);
|
|
29565
29630
|
const totalItems = keywordMatches.length;
|
|
29566
29631
|
// calculate the overall score by averaging the score of the keyword matches
|
|
29567
29632
|
const keywordScore = keywordMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);
|
|
@@ -29569,7 +29634,7 @@ class SemanticEvaluator {
|
|
|
29569
29634
|
const passed = keywordMatches.every(match => match.found);
|
|
29570
29635
|
const evaluationParameters = {
|
|
29571
29636
|
approach: EvaluationApproach.SEMANTIC,
|
|
29572
|
-
threshold
|
|
29637
|
+
threshold,
|
|
29573
29638
|
};
|
|
29574
29639
|
return {
|
|
29575
29640
|
testCaseId: request.testCaseId,
|
|
@@ -29591,7 +29656,7 @@ class SemanticEvaluator {
|
|
|
29591
29656
|
keywordMatches: [],
|
|
29592
29657
|
evaluationParameters: {
|
|
29593
29658
|
approach: EvaluationApproach.SEMANTIC,
|
|
29594
|
-
threshold
|
|
29659
|
+
threshold,
|
|
29595
29660
|
},
|
|
29596
29661
|
evaluationApproachResult: {
|
|
29597
29662
|
score: 0,
|
|
@@ -29858,57 +29923,78 @@ function performBleuEvaluation(request) {
|
|
|
29858
29923
|
|
|
29859
29924
|
class LLMEvaluationEngine {
|
|
29860
29925
|
async evaluateResponse(request, callback) {
|
|
29861
|
-
|
|
29862
|
-
const
|
|
29863
|
-
switch (approach) {
|
|
29864
|
-
case EvaluationApproach.BLEU: {
|
|
29865
|
-
const bleuResult = performBleuEvaluation(request);
|
|
29866
|
-
callback(bleuResult);
|
|
29867
|
-
break;
|
|
29868
|
-
}
|
|
29869
|
-
case EvaluationApproach.EXACT: {
|
|
29870
|
-
const exactResult = await performEvaluation(request);
|
|
29871
|
-
callback(exactResult);
|
|
29872
|
-
break;
|
|
29873
|
-
}
|
|
29874
|
-
case EvaluationApproach.ROUGE_1: {
|
|
29875
|
-
const rougeResult = await performRouge1Evaluation(request);
|
|
29876
|
-
callback(rougeResult);
|
|
29877
|
-
break;
|
|
29878
|
-
}
|
|
29879
|
-
case EvaluationApproach.ROUGE_L: {
|
|
29880
|
-
const rougeLResult = await performRougeLEvaluation(request);
|
|
29881
|
-
callback(rougeLResult);
|
|
29882
|
-
break;
|
|
29883
|
-
}
|
|
29884
|
-
case EvaluationApproach.SEMANTIC: {
|
|
29885
|
-
const semanticResult = await performSemanticEvaluation(request);
|
|
29886
|
-
callback(semanticResult);
|
|
29887
|
-
break;
|
|
29888
|
-
}
|
|
29889
|
-
default: {
|
|
29890
|
-
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
29891
|
-
const fallbackResult = await performEvaluation(request);
|
|
29892
|
-
callback(fallbackResult);
|
|
29893
|
-
}
|
|
29894
|
-
}
|
|
29895
|
-
}
|
|
29896
|
-
catch (error) {
|
|
29897
|
-
console.error('Evaluation failed:', error);
|
|
29898
|
-
const errorResult = {
|
|
29926
|
+
const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
|
|
29927
|
+
const fieldRequest = {
|
|
29899
29928
|
testCaseId: request.testCaseId,
|
|
29929
|
+
question: request.question,
|
|
29930
|
+
actualResponse: request.actualResponse,
|
|
29931
|
+
expectedOutcome: field.expectedValue,
|
|
29932
|
+
evaluationParameters: field.evaluationParameters,
|
|
29933
|
+
};
|
|
29934
|
+
const result = await this.evaluateField(fieldRequest);
|
|
29935
|
+
const fieldResult = {
|
|
29936
|
+
index: field.index,
|
|
29937
|
+
label: field.label,
|
|
29938
|
+
type: field.type,
|
|
29939
|
+
expectedValue: field.expectedValue,
|
|
29940
|
+
passed: result.passed,
|
|
29941
|
+
keywordMatches: result.keywordMatches,
|
|
29942
|
+
evaluationParameters: result.evaluationParameters,
|
|
29943
|
+
evaluationApproachResult: result.evaluationApproachResult,
|
|
29944
|
+
};
|
|
29945
|
+
return fieldResult;
|
|
29946
|
+
}));
|
|
29947
|
+
const fieldResults = settledResults.map((settledResult, index) => {
|
|
29948
|
+
const field = request.fields[index];
|
|
29949
|
+
if (settledResult.status === 'fulfilled') {
|
|
29950
|
+
return settledResult.value;
|
|
29951
|
+
}
|
|
29952
|
+
return {
|
|
29953
|
+
index: field.index,
|
|
29954
|
+
label: field.label,
|
|
29955
|
+
type: field.type,
|
|
29956
|
+
expectedValue: field.expectedValue,
|
|
29900
29957
|
passed: false,
|
|
29901
29958
|
keywordMatches: [],
|
|
29902
|
-
|
|
29903
|
-
evaluationParameters: request.evaluationParameters,
|
|
29959
|
+
evaluationParameters: field.evaluationParameters,
|
|
29904
29960
|
evaluationApproachResult: {
|
|
29905
29961
|
score: 0,
|
|
29906
|
-
approachUsed:
|
|
29962
|
+
approachUsed: field.evaluationParameters.approach,
|
|
29907
29963
|
},
|
|
29964
|
+
error: this.getSafeErrorMessage(settledResult.reason),
|
|
29908
29965
|
};
|
|
29909
|
-
|
|
29966
|
+
});
|
|
29967
|
+
const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
|
|
29968
|
+
const passed = fieldResults.every(field => field.passed && !field.error);
|
|
29969
|
+
callback({
|
|
29970
|
+
testCaseId: request.testCaseId,
|
|
29971
|
+
passed,
|
|
29972
|
+
keywordMatches,
|
|
29973
|
+
fieldResults,
|
|
29974
|
+
timestamp: new Date().toISOString(),
|
|
29975
|
+
});
|
|
29976
|
+
}
|
|
29977
|
+
async evaluateField(request) {
|
|
29978
|
+
const approach = request.evaluationParameters.approach;
|
|
29979
|
+
switch (approach) {
|
|
29980
|
+
case EvaluationApproach.BLEU:
|
|
29981
|
+
return performBleuEvaluation(request);
|
|
29982
|
+
case EvaluationApproach.EXACT:
|
|
29983
|
+
return performEvaluation(request);
|
|
29984
|
+
case EvaluationApproach.ROUGE_1:
|
|
29985
|
+
return performRouge1Evaluation(request);
|
|
29986
|
+
case EvaluationApproach.ROUGE_L:
|
|
29987
|
+
return performRougeLEvaluation(request);
|
|
29988
|
+
case EvaluationApproach.SEMANTIC:
|
|
29989
|
+
return performSemanticEvaluation(request);
|
|
29990
|
+
default:
|
|
29991
|
+
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
29992
|
+
return performEvaluation(request);
|
|
29910
29993
|
}
|
|
29911
29994
|
}
|
|
29995
|
+
getSafeErrorMessage(error) {
|
|
29996
|
+
return error instanceof Error ? error.message : 'Field evaluation failed.';
|
|
29997
|
+
}
|
|
29912
29998
|
}
|
|
29913
29999
|
|
|
29914
30000
|
/**
|
|
@@ -29929,12 +30015,18 @@ class EvaluationService {
|
|
|
29929
30015
|
console.warn('⚠️ No output to evaluate for test case:', testCase.id);
|
|
29930
30016
|
return;
|
|
29931
30017
|
}
|
|
30018
|
+
const fields = (testCase.expectedOutcome || []).map((field, index) => ({
|
|
30019
|
+
index,
|
|
30020
|
+
label: field.label,
|
|
30021
|
+
type: field.type,
|
|
30022
|
+
expectedValue: getFieldExpectedValue(field),
|
|
30023
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
30024
|
+
}));
|
|
29932
30025
|
const evaluationRequest = {
|
|
29933
30026
|
testCaseId: testCase.id,
|
|
29934
30027
|
question: testCase.question,
|
|
29935
|
-
expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
|
|
29936
30028
|
actualResponse: testCase.output,
|
|
29937
|
-
|
|
30029
|
+
fields,
|
|
29938
30030
|
};
|
|
29939
30031
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
29940
30032
|
console.log('📊 Evaluation result received:', result);
|
|
@@ -29942,6 +30034,12 @@ class EvaluationService {
|
|
|
29942
30034
|
});
|
|
29943
30035
|
}
|
|
29944
30036
|
}
|
|
30037
|
+
function getFieldExpectedValue(field) {
|
|
30038
|
+
if (field.type === 'chips-input') {
|
|
30039
|
+
return field.value.join(', ');
|
|
30040
|
+
}
|
|
30041
|
+
return field.value;
|
|
30042
|
+
}
|
|
29945
30043
|
|
|
29946
30044
|
const Button = (props, children) => {
|
|
29947
30045
|
const { variant = 'primary', size = 'md', disabled = false, loading = false, onClick, type = 'button', 'class': className = '', icon, 'aria-label': ariaLabel, } = props;
|
|
@@ -29963,7 +30061,7 @@ const Button = (props, children) => {
|
|
|
29963
30061
|
return (h("button", { type: type, class: classes, disabled: disabled || loading, onClick: onClick, "aria-busy": loading, "aria-label": ariaLabel }, icon && h("span", { class: "icon" }, icon), children));
|
|
29964
30062
|
};
|
|
29965
30063
|
|
|
29966
|
-
const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isRunningAll, useSave = false, isSaving = false, onImport, onExportSuite, onExportResults, onRunAll, onSave, }) => {
|
|
30064
|
+
const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isRunningAll, useSave = false, isSaving = false, usePromptEditor = false, onImport, onExportSuite, onExportResults, onRunAll, onSave, }) => {
|
|
29967
30065
|
let fileInputRef;
|
|
29968
30066
|
const handleFileSelect = () => {
|
|
29969
30067
|
fileInputRef?.click();
|
|
@@ -29976,7 +30074,7 @@ const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isR
|
|
|
29976
30074
|
onImport(file);
|
|
29977
30075
|
}
|
|
29978
30076
|
};
|
|
29979
|
-
return (h("header", { class: "test-runner-header" }, h("div", { class: "test-runner-header__left" }, h("input", { class: "test-runner-header--hidden", type: "file", ref: el => (fileInputRef = el), onChange: handleFileChange, accept: ".json,application/json" }), h(Button, { variant: "secondary", size: "md", onClick: handleFileSelect, icon: "\u2191" }, "Import Test Suite"), h(Button, { variant: "secondary", size: "md", onClick: onExportSuite, disabled: isExportingTestSuite, loading: isExportingTestSuite, icon: isExportingTestSuite ? '⏳' : '↓' }, isExportingTestSuite ? 'Exporting...' : 'Export Test Suite')), h("div", { class: "test-runner-header__right" }, h(Button, { variant: "secondary", size: "md", icon: "\u2699\uFE0F" }, "Prompt Editor"), h(Button, { variant: "secondary", size: "md", onClick: onExportResults, disabled: isExportingTestResults, loading: isExportingTestResults, icon: isExportingTestResults ? '⏳' : '↓' }, isExportingTestResults ? 'Exporting...' : 'Export Test Results'), useSave && (h(Button, { variant: "secondary", size: "md", onClick: onSave, disabled: isSaving, loading: isSaving, icon: isSaving ? '⏳' : '💾' }, isSaving ? 'Saving...' : 'Save')), h(Button, { "aria-label": "Run All", variant: "primary", size: "md", onClick: onRunAll, disabled: isRunningAll, loading: isRunningAll }, isRunningAll ? 'Running...' : 'Run All'))));
|
|
30077
|
+
return (h("header", { class: "test-runner-header" }, h("div", { class: "test-runner-header__left" }, h("input", { class: "test-runner-header--hidden", type: "file", ref: el => (fileInputRef = el), onChange: handleFileChange, accept: ".json,application/json" }), h(Button, { variant: "secondary", size: "md", onClick: handleFileSelect, icon: "\u2191" }, "Import Test Suite"), h(Button, { variant: "secondary", size: "md", onClick: onExportSuite, disabled: isExportingTestSuite, loading: isExportingTestSuite, icon: isExportingTestSuite ? '⏳' : '↓' }, isExportingTestSuite ? 'Exporting...' : 'Export Test Suite')), h("div", { class: "test-runner-header__right" }, usePromptEditor && (h(Button, { variant: "secondary", size: "md", icon: "\u2699\uFE0F" }, "Prompt Editor")), h(Button, { variant: "secondary", size: "md", onClick: onExportResults, disabled: isExportingTestResults, loading: isExportingTestResults, icon: isExportingTestResults ? '⏳' : '↓' }, isExportingTestResults ? 'Exporting...' : 'Export Test Results'), useSave && (h(Button, { variant: "secondary", size: "md", onClick: onSave, disabled: isSaving, loading: isSaving, icon: isSaving ? '⏳' : '💾' }, isSaving ? 'Saving...' : 'Save')), h(Button, { "aria-label": "Run All", variant: "primary", size: "md", onClick: onRunAll, disabled: isRunningAll, loading: isRunningAll }, isRunningAll ? 'Running...' : 'Run All'))));
|
|
29980
30078
|
};
|
|
29981
30079
|
|
|
29982
30080
|
const ResponseOutput = ({ output, isRunning, }) => {
|
|
@@ -29984,7 +30082,9 @@ const ResponseOutput = ({ output, isRunning, }) => {
|
|
|
29984
30082
|
};
|
|
29985
30083
|
|
|
29986
30084
|
const EvaluationSummary = ({ result, isRunning, }) => {
|
|
29987
|
-
|
|
30085
|
+
const fieldResults = result?.fieldResults || [];
|
|
30086
|
+
const hasFieldResults = fieldResults.length > 0;
|
|
30087
|
+
return (h("div", { class: "evaluation-summary" }, result ? (h("div", { class: "evaluation-summary__result" }, hasFieldResults ? (h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(fieldResult => (h("div", { class: "evaluation-summary__field-result" }, h("div", { class: "evaluation-summary__field-header" }, h("span", { class: "evaluation-summary__field-label" }, fieldResult.label), h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", fieldResult.evaluationParameters.approach)), h("div", { class: "evaluation-summary__field-details" }, h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'), fieldResult.error && (h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)), h("span", null, "Score: ", fieldResult.evaluationApproachResult.score.toFixed(2)), h("span", null, "Matches:", ' ', fieldResult.keywordMatches.filter(match => match.found).length, "/", fieldResult.keywordMatches.length))))))) : null)) : (h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
|
|
29988
30088
|
};
|
|
29989
30089
|
|
|
29990
30090
|
const IconButton = (props, children) => {
|
|
@@ -30020,6 +30120,24 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30020
30120
|
const emit = (detail) => onExpectedOutcomeChange({
|
|
30021
30121
|
detail,
|
|
30022
30122
|
});
|
|
30123
|
+
const buildEvaluationConfig = (index, optionList) => ({
|
|
30124
|
+
name: `expectedOutcomeEvaluation-${index}`,
|
|
30125
|
+
fieldType: FormFieldType.SELECT,
|
|
30126
|
+
label: 'Evaluation Approach',
|
|
30127
|
+
placeholder: 'Select evaluation approach…',
|
|
30128
|
+
required: true,
|
|
30129
|
+
optionList,
|
|
30130
|
+
defaultValue: EvaluationApproach.EXACT,
|
|
30131
|
+
});
|
|
30132
|
+
const renderEvaluationSelector = (field, index) => {
|
|
30133
|
+
const optionList = getAllowedApproachesForFieldType(field.type);
|
|
30134
|
+
return (h("app-select", { config: buildEvaluationConfig(index, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
|
|
30135
|
+
testCaseId,
|
|
30136
|
+
index,
|
|
30137
|
+
operation: 'set-evaluation-approach',
|
|
30138
|
+
value: e.detail.value,
|
|
30139
|
+
}) }));
|
|
30140
|
+
};
|
|
30023
30141
|
return (h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index) => {
|
|
30024
30142
|
if (field.type === 'textarea') {
|
|
30025
30143
|
const config = {
|
|
@@ -30027,15 +30145,15 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30027
30145
|
fieldType: FormFieldType.TEXT_AREA,
|
|
30028
30146
|
label: field.label,
|
|
30029
30147
|
placeholder: field.placeholder,
|
|
30030
|
-
required:
|
|
30148
|
+
required: true,
|
|
30031
30149
|
rows: field.rows || 2,
|
|
30032
30150
|
};
|
|
30033
|
-
return (h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30151
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30034
30152
|
testCaseId,
|
|
30035
30153
|
index,
|
|
30036
30154
|
operation: 'set-value',
|
|
30037
30155
|
value: e.detail.value,
|
|
30038
|
-
}) }));
|
|
30156
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
30039
30157
|
}
|
|
30040
30158
|
if (field.type === 'chips-input') {
|
|
30041
30159
|
const config = {
|
|
@@ -30043,9 +30161,9 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30043
30161
|
fieldType: FormFieldType.CHIPS,
|
|
30044
30162
|
label: field.label,
|
|
30045
30163
|
placeholder: field.placeholder,
|
|
30046
|
-
required:
|
|
30164
|
+
required: true,
|
|
30047
30165
|
};
|
|
30048
|
-
return (h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
30166
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
30049
30167
|
testCaseId,
|
|
30050
30168
|
index,
|
|
30051
30169
|
operation: 'add-chip',
|
|
@@ -30055,7 +30173,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30055
30173
|
index,
|
|
30056
30174
|
operation: 'remove-chip',
|
|
30057
30175
|
value: e.detail.value,
|
|
30058
|
-
}) }));
|
|
30176
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
30059
30177
|
}
|
|
30060
30178
|
if (field.type === 'select') {
|
|
30061
30179
|
const config = {
|
|
@@ -30063,26 +30181,26 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30063
30181
|
fieldType: FormFieldType.SELECT,
|
|
30064
30182
|
label: field.label,
|
|
30065
30183
|
placeholder: field.placeholder,
|
|
30066
|
-
required:
|
|
30184
|
+
required: true,
|
|
30067
30185
|
optionList: field.options,
|
|
30068
30186
|
};
|
|
30069
|
-
return (h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30187
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30070
30188
|
testCaseId,
|
|
30071
30189
|
index,
|
|
30072
30190
|
operation: 'set-value',
|
|
30073
30191
|
value: e.detail.value,
|
|
30074
|
-
}) }));
|
|
30192
|
+
}) }), renderEvaluationSelector(field, index)));
|
|
30075
30193
|
}
|
|
30076
|
-
return (h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30194
|
+
return (h("div", { class: "expected-outcome-renderer__group" }, h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30077
30195
|
testCaseId,
|
|
30078
30196
|
index,
|
|
30079
30197
|
operation: 'set-value',
|
|
30080
30198
|
value: e.target.value,
|
|
30081
|
-
}) })));
|
|
30199
|
+
}) })), renderEvaluationSelector(field, index)));
|
|
30082
30200
|
})));
|
|
30083
30201
|
};
|
|
30084
30202
|
|
|
30085
|
-
const LLMTestCaseRow = ({ testCase, onRun, onDelete,
|
|
30203
|
+
const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
30086
30204
|
const questionConfig = {
|
|
30087
30205
|
name: 'question',
|
|
30088
30206
|
fieldType: FormFieldType.TEXT_AREA,
|
|
@@ -30092,26 +30210,17 @@ const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTes
|
|
|
30092
30210
|
required: true,
|
|
30093
30211
|
rows: 3,
|
|
30094
30212
|
};
|
|
30095
|
-
const evaluationConfig = {
|
|
30096
|
-
name: 'EvaluationApproach',
|
|
30097
|
-
fieldType: FormFieldType.SELECT,
|
|
30098
|
-
label: 'Evaluation',
|
|
30099
|
-
placeholder: 'Select evaluation approach…',
|
|
30100
|
-
required: true,
|
|
30101
|
-
optionList: EvaluationApproachValues,
|
|
30102
|
-
defaultValue: EvaluationApproach.EXACT,
|
|
30103
|
-
};
|
|
30104
30213
|
return (h("div", { class: "test-case-row", key: testCase.id }, h("div", { class: "test-case-row__input-column" }, h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
|
|
30105
30214
|
detail: {
|
|
30106
30215
|
testCaseId: testCase.id,
|
|
30107
30216
|
key: 'question',
|
|
30108
30217
|
value: e.detail.value,
|
|
30109
30218
|
},
|
|
30110
|
-
}) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })
|
|
30219
|
+
}) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
30111
30220
|
};
|
|
30112
30221
|
|
|
30113
|
-
const LLMTestCases = ({ testCases, onRun, onDelete,
|
|
30114
|
-
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete,
|
|
30222
|
+
const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
30223
|
+
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
30115
30224
|
};
|
|
30116
30225
|
|
|
30117
30226
|
const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 
0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
|
|
@@ -30122,11 +30231,11 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
|
|
|
30122
30231
|
|
|
30123
30232
|
const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
|
|
30124
30233
|
|
|
30125
|
-
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30234
|
+
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30126
30235
|
|
|
30127
30236
|
const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
|
|
30128
30237
|
|
|
30129
|
-
const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-
|
|
30238
|
+
const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__field-results{display:flex;flex-direction:column;gap:var(--spacing-2);margin-top:var(--spacing-2)}.evaluation-summary__field-result{border:var(--border-width) solid var(--border);border-radius:var(--radius-md);padding:var(--spacing-2);display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-header{display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-label{font-weight:var(--font-weight-semibold);font-size:var(--font-size-xs)}.evaluation-summary__field-approach{color:var(--muted-foreground);font-size:11px}.evaluation-summary__field-details{display:flex;flex-direction:column;gap:var(--spacing-1);font-size:var(--font-size-xs)}.evaluation-summary__field-status{width:fit-content;padding:2px var(--spacing-2);border-radius:var(--radius-sm);font-size:11px;font-weight:var(--font-weight-semibold);border:var(--border-width) solid transparent}.evaluation-summary__field-status--passed{background:var(--success);color:var(--success-foreground);border-color:var(--success)}.evaluation-summary__field-status--failed{background:var(--destructive);color:var(--destructive-foreground);border-color:var(--destructive)}.evaluation-summary__error-message{color:var(--destructive);font-size:var(--font-size-xs)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
|
|
30130
30239
|
|
|
30131
30240
|
const responseOutputCss = () => `.response-output{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.response-output__content{background:var(--muted);border:var(--border-width) solid var(--border);border-radius:var(--radius);padding:var(--spacing-4);font-size:var(--font-size-sm);line-height:var(--line-height-relaxed);color:var(--foreground);white-space:pre-wrap;word-wrap:break-word;flex:1;overflow-y:auto;max-height:250px;overflow-x:scroll}.response-output__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}@media (max-width: 1200px){.response-output{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.response-output{padding:var(--spacing-4)}}`;
|
|
30132
30241
|
|
|
@@ -30146,6 +30255,7 @@ const LLMTestRunner = class {
|
|
|
30146
30255
|
save;
|
|
30147
30256
|
delayMs = 500;
|
|
30148
30257
|
useSave = false;
|
|
30258
|
+
usePromptEditor = false;
|
|
30149
30259
|
initialTestCases;
|
|
30150
30260
|
defaultExpectedOutcomeSchema;
|
|
30151
30261
|
testCases = [
|
|
@@ -30159,9 +30269,6 @@ const LLMTestRunner = class {
|
|
|
30159
30269
|
value: '',
|
|
30160
30270
|
},
|
|
30161
30271
|
],
|
|
30162
|
-
evaluationParameters: {
|
|
30163
|
-
approach: EvaluationApproach.EXACT,
|
|
30164
|
-
},
|
|
30165
30272
|
isRunning: false,
|
|
30166
30273
|
},
|
|
30167
30274
|
];
|
|
@@ -30268,52 +30375,13 @@ const LLMTestRunner = class {
|
|
|
30268
30375
|
deleteTestCase(id) {
|
|
30269
30376
|
this.testCases = this.testCases.filter(tc => tc.id !== id);
|
|
30270
30377
|
}
|
|
30271
|
-
updateApproach(testCase, approach) {
|
|
30272
|
-
if (testCase) {
|
|
30273
|
-
const updated = updateApproach(testCase, approach);
|
|
30274
|
-
this.updateTestCase(testCase.id, {
|
|
30275
|
-
evaluationParameters: updated.evaluationParameters,
|
|
30276
|
-
});
|
|
30277
|
-
}
|
|
30278
|
-
}
|
|
30279
30378
|
handleExpectedOutcomeChange = (event) => {
|
|
30280
|
-
const { testCaseId,
|
|
30379
|
+
const { testCaseId, ...change } = event.detail;
|
|
30281
30380
|
this.testCases = this.testCases.map(tc => {
|
|
30282
|
-
if (tc.id !== testCaseId)
|
|
30283
|
-
return tc;
|
|
30284
|
-
const expectedOutcome = [...(tc.expectedOutcome || [])];
|
|
30285
|
-
const target = expectedOutcome[index];
|
|
30286
|
-
if (!target)
|
|
30381
|
+
if (tc.id !== testCaseId) {
|
|
30287
30382
|
return tc;
|
|
30288
|
-
if (operation === 'set-value') {
|
|
30289
|
-
if (target.type === 'chips-input') {
|
|
30290
|
-
return tc;
|
|
30291
|
-
}
|
|
30292
|
-
expectedOutcome[index] = { ...target, value: value || '' };
|
|
30293
|
-
return { ...tc, expectedOutcome };
|
|
30294
|
-
}
|
|
30295
|
-
if (operation === 'add-chip') {
|
|
30296
|
-
if (target.type !== 'chips-input' || !value) {
|
|
30297
|
-
return tc;
|
|
30298
|
-
}
|
|
30299
|
-
expectedOutcome[index] = {
|
|
30300
|
-
...target,
|
|
30301
|
-
value: [...target.value, value],
|
|
30302
|
-
};
|
|
30303
|
-
return { ...tc, expectedOutcome };
|
|
30304
|
-
}
|
|
30305
|
-
if (operation === 'remove-chip') {
|
|
30306
|
-
if (target.type !== 'chips-input' ||
|
|
30307
|
-
!value) {
|
|
30308
|
-
return tc;
|
|
30309
|
-
}
|
|
30310
|
-
expectedOutcome[index] = {
|
|
30311
|
-
...target,
|
|
30312
|
-
value: target.value.filter(chip => chip !== value),
|
|
30313
|
-
};
|
|
30314
|
-
return { ...tc, expectedOutcome };
|
|
30315
30383
|
}
|
|
30316
|
-
return tc;
|
|
30384
|
+
return applyExpectedOutcomeChange(tc, change);
|
|
30317
30385
|
});
|
|
30318
30386
|
};
|
|
30319
30387
|
async evaluateResponse(testCase) {
|
|
@@ -30413,7 +30481,7 @@ const LLMTestRunner = class {
|
|
|
30413
30481
|
}
|
|
30414
30482
|
}
|
|
30415
30483
|
render() {
|
|
30416
|
-
return (h("div", { key: '
|
|
30484
|
+
return (h("div", { key: '323b5e140740bb72d4767c0763c382a6b125caa2', class: "test-runner-container" }, h(LLMTestRunnerHeader, { key: 'e1e2efdf6cfe5f406de7e26e745b5775f307d294', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, usePromptEditor: this.usePromptEditor, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), h(ErrorMessage, { key: 'c6a34b81f66c6cd835eb8bc253f7a28d68c49874', message: this.error, onClear: () => (this.error = '') }), h("div", { key: '674daad8a2754afc8144463e9a173690a3d1d589', class: "test-runner-container__content" }, h(LLMTestCases, { key: '96c1aeae37f56378b7a9b5d54be73c5df48ae448', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
|
|
30417
30485
|
}
|
|
30418
30486
|
};
|
|
30419
30487
|
LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));
|