llm-testrunner-components 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +165 -242
- package/dist/cjs/index.cjs.js +305 -237
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-testrunner.cjs.js +1 -1
- package/dist/cjs/loader.cjs.js +1 -1
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +2 -2
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +27 -49
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +4 -3
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -1
- package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js +0 -4
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-results-csv.js +47 -33
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +20 -2
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/test-case.js +2 -20
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/collection/types/test-case.js.map +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-JPMPoOC8.js +7 -0
- package/dist/components/p-JPMPoOC8.js.map +1 -0
- package/dist/esm/index.js +305 -237
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-testrunner.js +1 -1
- package/dist/esm/loader.js +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +1 -0
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +1 -1
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
- package/dist/types/components.d.ts +9 -0
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
- package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
- package/dist/types/lib/evaluation/index.d.ts +0 -1
- package/dist/types/lib/evaluation/types.d.ts +26 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
- package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
- package/dist/types/schemas/expected-outcome.d.ts +65 -17
- package/dist/types/schemas/test-case.d.ts +51 -95
- package/dist/types/types/llm-test-runner.d.ts +1 -1
- package/dist/types/types/test-case.d.ts +1 -1
- package/package.json +9 -2
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
- package/dist/components/p-BF90yb1z.js +0 -7
- package/dist/components/p-BF90yb1z.js.map +0 -1
- /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
package/dist/cjs/index.cjs.js
CHANGED
|
@@ -64,20 +64,6 @@ class RateLimitedFetcher {
|
|
|
64
64
|
}
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
var EvaluationApproach;
|
|
68
|
-
(function (EvaluationApproach) {
|
|
69
|
-
EvaluationApproach["EXACT"] = "exact";
|
|
70
|
-
EvaluationApproach["SEMANTIC"] = "semantic";
|
|
71
|
-
EvaluationApproach["ROUGE_1"] = "rouge-1";
|
|
72
|
-
EvaluationApproach["ROUGE_L"] = "rouge-L";
|
|
73
|
-
EvaluationApproach["BLEU"] = "bleu";
|
|
74
|
-
})(EvaluationApproach || (EvaluationApproach = {}));
|
|
75
|
-
// Array of all evaluation approach values for UI components
|
|
76
|
-
const EvaluationApproachValues = Object.values(EvaluationApproach);
|
|
77
|
-
const DEFAULT_ROUGE_PASS_SCORE = 0.7;
|
|
78
|
-
const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
|
|
79
|
-
const DEFAULT_BLEU_PASS_SCORE = 0.7;
|
|
80
|
-
|
|
81
67
|
/**
|
|
82
68
|
* Reads a file asynchronously and returns its content as a string
|
|
83
69
|
* @param file - The File object to read
|
|
@@ -120,23 +106,10 @@ function formatTestSuiteAsJson(testCases) {
|
|
|
120
106
|
id: testCase.id,
|
|
121
107
|
question: testCase.question,
|
|
122
108
|
expectedOutcome: testCase.expectedOutcome,
|
|
123
|
-
evaluationParameters: testCase.evaluationParameters,
|
|
124
109
|
}));
|
|
125
110
|
return JSON.stringify(exportData, null, 2);
|
|
126
111
|
}
|
|
127
112
|
|
|
128
|
-
function serializeExpectedOutcome(expectedOutcome, joinWith = '\n') {
|
|
129
|
-
return (expectedOutcome || [])
|
|
130
|
-
.map(field => {
|
|
131
|
-
if (field.type === 'chips-input') {
|
|
132
|
-
return field.value.join(', ');
|
|
133
|
-
}
|
|
134
|
-
return field.value;
|
|
135
|
-
})
|
|
136
|
-
.join(joinWith)
|
|
137
|
-
.trim();
|
|
138
|
-
}
|
|
139
|
-
|
|
140
113
|
/**
|
|
141
114
|
* Escapes a CSV field by wrapping it in quotes if it contains special characters
|
|
142
115
|
* @param field - The field to escape
|
|
@@ -155,48 +128,63 @@ function escapeCsvField(field) {
|
|
|
155
128
|
*/
|
|
156
129
|
function exportTestResultsToCsv(testCases) {
|
|
157
130
|
const csvRows = [];
|
|
131
|
+
const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
|
|
158
132
|
// Add header row
|
|
159
133
|
const headers = [
|
|
160
134
|
'Question',
|
|
161
|
-
'Expected Keywords',
|
|
162
|
-
'Generated Keywords',
|
|
163
|
-
'Keywords Match',
|
|
164
135
|
'Response Time (s)',
|
|
165
|
-
'Evaluation Approach',
|
|
166
|
-
'Evaluation Score',
|
|
167
136
|
];
|
|
137
|
+
for (let i = 1; i <= maxFieldCount; i++) {
|
|
138
|
+
headers.push('Field Name');
|
|
139
|
+
headers.push('Expected Keywords');
|
|
140
|
+
headers.push('Generated Keywords');
|
|
141
|
+
headers.push('Evaluation Strategy');
|
|
142
|
+
headers.push('Passed Evaluation');
|
|
143
|
+
headers.push('Keyword Match');
|
|
144
|
+
headers.push('Score');
|
|
145
|
+
if (i < maxFieldCount) {
|
|
146
|
+
headers.push('');
|
|
147
|
+
}
|
|
148
|
+
}
|
|
168
149
|
csvRows.push(headers.join(','));
|
|
169
|
-
// Add data rows
|
|
150
|
+
// Add data rows (one row per test case)
|
|
170
151
|
testCases.forEach(testCase => {
|
|
171
|
-
const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
|
|
172
|
-
const evaluationApproach = testCase.evaluationParameters?.approach || '';
|
|
173
|
-
const score = testCase.evaluationResult?.evaluationApproachResult?.score;
|
|
174
|
-
const evaluationScore = score !== undefined ? score.toString() : '';
|
|
175
|
-
let generatedKeywords = '';
|
|
176
|
-
let keywordsMatch = '';
|
|
177
|
-
if (testCase.evaluationResult) {
|
|
178
|
-
const foundKeywords = testCase.evaluationResult.keywordMatches
|
|
179
|
-
.filter(match => match.found)
|
|
180
|
-
.map(match => match.keyword);
|
|
181
|
-
generatedKeywords = foundKeywords.join('; ');
|
|
182
|
-
// Calculate match percentages
|
|
183
|
-
const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
|
|
184
|
-
const totalKeywords = testCase.evaluationResult.keywordMatches.length;
|
|
185
|
-
keywordsMatch =
|
|
186
|
-
totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
|
|
187
|
-
}
|
|
188
152
|
const responseTime = testCase.responseTime
|
|
189
153
|
? (testCase.responseTime / 1000).toFixed(3)
|
|
190
154
|
: 'N/A';
|
|
191
|
-
const row = [
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
155
|
+
const row = [escapeCsvField(testCase.question), responseTime];
|
|
156
|
+
for (let i = 0; i < maxFieldCount; i++) {
|
|
157
|
+
const field = testCase.expectedOutcome?.[i];
|
|
158
|
+
const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
|
|
159
|
+
const expectedKeywords = fieldResult?.expectedValue ??
|
|
160
|
+
(field
|
|
161
|
+
? field.type === 'chips-input'
|
|
162
|
+
? field.value.join(', ')
|
|
163
|
+
: field.value
|
|
164
|
+
: '');
|
|
165
|
+
const generatedKeywords = (fieldResult?.keywordMatches || [])
|
|
166
|
+
.filter(match => match.found)
|
|
167
|
+
.map(match => match.keyword)
|
|
168
|
+
.join('; ');
|
|
169
|
+
const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
|
|
170
|
+
const totalMatches = fieldResult?.keywordMatches?.length || 0;
|
|
171
|
+
const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
|
|
172
|
+
const score = fieldResult?.evaluationApproachResult?.score !== undefined
|
|
173
|
+
? fieldResult.evaluationApproachResult.score.toFixed(2)
|
|
174
|
+
: '';
|
|
175
|
+
row.push(escapeCsvField(field?.label || ''));
|
|
176
|
+
row.push(escapeCsvField(expectedKeywords || ''));
|
|
177
|
+
row.push(escapeCsvField(generatedKeywords));
|
|
178
|
+
row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
|
|
179
|
+
field?.evaluationParameters?.approach ||
|
|
180
|
+
''));
|
|
181
|
+
row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
|
|
182
|
+
row.push(keywordMatch);
|
|
183
|
+
row.push(score);
|
|
184
|
+
if (i < maxFieldCount - 1) {
|
|
185
|
+
row.push('');
|
|
186
|
+
}
|
|
187
|
+
}
|
|
200
188
|
csvRows.push(row.join(','));
|
|
201
189
|
});
|
|
202
190
|
return csvRows.join('\n');
|
|
@@ -255,6 +243,43 @@ function v4(options, buf, offset) {
|
|
|
255
243
|
return unsafeStringify(rnds);
|
|
256
244
|
}
|
|
257
245
|
|
|
246
|
+
var EvaluationApproach;
|
|
247
|
+
(function (EvaluationApproach) {
|
|
248
|
+
EvaluationApproach["EXACT"] = "exact";
|
|
249
|
+
EvaluationApproach["SEMANTIC"] = "semantic";
|
|
250
|
+
EvaluationApproach["ROUGE_1"] = "rouge-1";
|
|
251
|
+
EvaluationApproach["ROUGE_L"] = "rouge-L";
|
|
252
|
+
EvaluationApproach["BLEU"] = "bleu";
|
|
253
|
+
})(EvaluationApproach || (EvaluationApproach = {}));
|
|
254
|
+
// Array of all evaluation approach values for UI components
|
|
255
|
+
const EvaluationApproachValues = Object.values(EvaluationApproach);
|
|
256
|
+
const DEFAULT_ROUGE_PASS_SCORE = 0.7;
|
|
257
|
+
const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
|
|
258
|
+
const DEFAULT_BLEU_PASS_SCORE = 0.7;
|
|
259
|
+
|
|
260
|
+
const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
|
|
261
|
+
function getAllowedApproachesForFieldType(fieldType) {
|
|
262
|
+
if (fieldType === 'select') {
|
|
263
|
+
return SELECT_ONLY_APPROACHES;
|
|
264
|
+
}
|
|
265
|
+
return EvaluationApproachValues;
|
|
266
|
+
}
|
|
267
|
+
function isApproachAllowedForFieldType(fieldType, approach) {
|
|
268
|
+
return getAllowedApproachesForFieldType(fieldType).includes(approach);
|
|
269
|
+
}
|
|
270
|
+
function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
|
|
271
|
+
const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
|
|
272
|
+
const fallbackApproach = allowedApproaches[0];
|
|
273
|
+
const rawApproach = evaluationParameters?.approach;
|
|
274
|
+
const approach = rawApproach && allowedApproaches.includes(rawApproach)
|
|
275
|
+
? rawApproach
|
|
276
|
+
: fallbackApproach;
|
|
277
|
+
return {
|
|
278
|
+
...evaluationParameters,
|
|
279
|
+
approach,
|
|
280
|
+
};
|
|
281
|
+
}
|
|
282
|
+
|
|
258
283
|
const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
259
284
|
{
|
|
260
285
|
type: 'textarea',
|
|
@@ -263,6 +288,12 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
|
263
288
|
rows: 2,
|
|
264
289
|
},
|
|
265
290
|
];
|
|
291
|
+
function normalizeExpectedOutcomeField(field) {
|
|
292
|
+
return {
|
|
293
|
+
...field,
|
|
294
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
295
|
+
};
|
|
296
|
+
}
|
|
266
297
|
/**
|
|
267
298
|
* Creates a new test case with default values
|
|
268
299
|
* @returns A new TestCase object with a unique ID
|
|
@@ -272,9 +303,6 @@ function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA)
|
|
|
272
303
|
id: v4(),
|
|
273
304
|
question: '',
|
|
274
305
|
expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
|
|
275
|
-
evaluationParameters: {
|
|
276
|
-
approach: EvaluationApproach.EXACT,
|
|
277
|
-
},
|
|
278
306
|
isRunning: false,
|
|
279
307
|
};
|
|
280
308
|
}
|
|
@@ -284,35 +312,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
284
312
|
return {
|
|
285
313
|
type: 'text',
|
|
286
314
|
label: schemaField.label,
|
|
287
|
-
required: schemaField.required,
|
|
288
315
|
placeholder: schemaField.placeholder,
|
|
289
316
|
value: '',
|
|
317
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
290
318
|
};
|
|
291
319
|
case 'textarea':
|
|
292
320
|
return {
|
|
293
321
|
type: 'textarea',
|
|
294
322
|
label: schemaField.label,
|
|
295
|
-
required: schemaField.required,
|
|
296
323
|
placeholder: schemaField.placeholder,
|
|
297
324
|
rows: schemaField.rows,
|
|
298
325
|
value: '',
|
|
326
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
299
327
|
};
|
|
300
328
|
case 'chips-input':
|
|
301
329
|
return {
|
|
302
330
|
type: 'chips-input',
|
|
303
331
|
label: schemaField.label,
|
|
304
|
-
required: schemaField.required,
|
|
305
332
|
placeholder: schemaField.placeholder,
|
|
306
333
|
value: [],
|
|
334
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
307
335
|
};
|
|
308
336
|
case 'select':
|
|
309
337
|
return {
|
|
310
338
|
type: 'select',
|
|
311
339
|
label: schemaField.label,
|
|
312
|
-
required: schemaField.required,
|
|
313
340
|
placeholder: schemaField.placeholder,
|
|
314
341
|
value: '',
|
|
315
342
|
options: schemaField.options,
|
|
343
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
316
344
|
};
|
|
317
345
|
default: {
|
|
318
346
|
const _exhaustiveCheck = schemaField;
|
|
@@ -323,32 +351,19 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
323
351
|
function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
|
|
324
352
|
return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
|
|
325
353
|
}
|
|
326
|
-
function migrateLegacyExpectedOutcomeString(value) {
|
|
327
|
-
return [
|
|
328
|
-
{
|
|
329
|
-
type: 'textarea',
|
|
330
|
-
label: 'Expected Outcome',
|
|
331
|
-
value,
|
|
332
|
-
},
|
|
333
|
-
];
|
|
334
|
-
}
|
|
335
354
|
/**
|
|
336
355
|
* Creates a runtime test case from validated input data.
|
|
337
|
-
* The input is expected to already satisfy `TestCaseInput
|
|
338
|
-
* and this function only performs normalization/defaulting
|
|
356
|
+
* The input is expected to already satisfy `TestCaseInput`,
|
|
357
|
+
* and this function only performs normalization/defaulting.
|
|
339
358
|
*
|
|
340
359
|
* @param data - Validated test case input
|
|
341
360
|
* @returns A normalized TestCase object with runtime defaults applied
|
|
342
361
|
*/
|
|
343
362
|
function createTestCaseFromInput(data) {
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
expectedOutcome
|
|
347
|
-
}
|
|
348
|
-
else {
|
|
349
|
-
expectedOutcome = data.expectedOutcome;
|
|
350
|
-
}
|
|
351
|
-
return { ...data, expectedOutcome };
|
|
363
|
+
return {
|
|
364
|
+
...data,
|
|
365
|
+
expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
|
|
366
|
+
};
|
|
352
367
|
}
|
|
353
368
|
|
|
354
369
|
/** A special constant with type `never` */
|
|
@@ -4938,27 +4953,43 @@ function superRefine(fn) {
|
|
|
4938
4953
|
const nonEmptyString = string().trim().min(1);
|
|
4939
4954
|
const optionalPositiveInt = number().int().positive().optional();
|
|
4940
4955
|
const optionalString = string().optional();
|
|
4941
|
-
const optionalBoolean = boolean().optional();
|
|
4942
4956
|
const selectOptionsSchema = array(nonEmptyString).min(1);
|
|
4957
|
+
const optionalNumber = number().optional();
|
|
4958
|
+
const evaluationParametersSchema = object({
|
|
4959
|
+
approach: _enum(EvaluationApproach),
|
|
4960
|
+
threshold: optionalNumber,
|
|
4961
|
+
});
|
|
4962
|
+
const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
|
|
4963
|
+
if (!isApproachAllowedForFieldType('select', parameters.approach)) {
|
|
4964
|
+
ctx.addIssue({
|
|
4965
|
+
code: 'custom',
|
|
4966
|
+
path: ['approach'],
|
|
4967
|
+
message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
|
|
4968
|
+
});
|
|
4969
|
+
}
|
|
4970
|
+
});
|
|
4943
4971
|
const defaultExpectedOutcomeBaseSchema = object({
|
|
4944
4972
|
label: nonEmptyString,
|
|
4945
|
-
required: optionalBoolean,
|
|
4946
4973
|
placeholder: optionalString,
|
|
4947
4974
|
});
|
|
4948
4975
|
const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
|
|
4949
4976
|
text: baseSchema.extend({
|
|
4950
4977
|
type: literal('text'),
|
|
4978
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4951
4979
|
}),
|
|
4952
4980
|
textarea: baseSchema.extend({
|
|
4953
4981
|
type: literal('textarea'),
|
|
4954
4982
|
rows: optionalPositiveInt,
|
|
4983
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4955
4984
|
}),
|
|
4956
4985
|
chipsInput: baseSchema.extend({
|
|
4957
4986
|
type: literal('chips-input'),
|
|
4987
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
4958
4988
|
}),
|
|
4959
4989
|
select: baseSchema.extend({
|
|
4960
4990
|
type: literal('select'),
|
|
4961
4991
|
options: selectOptionsSchema,
|
|
4992
|
+
evaluationParameters: selectEvaluationParametersSchema.optional(),
|
|
4962
4993
|
}),
|
|
4963
4994
|
});
|
|
4964
4995
|
function hasDuplicateChips(values) {
|
|
@@ -5020,33 +5051,16 @@ function validateExpectedOutcomeSchema(schema) {
|
|
|
5020
5051
|
}
|
|
5021
5052
|
}
|
|
5022
5053
|
|
|
5023
|
-
const
|
|
5024
|
-
approach: _enum(EvaluationApproach),
|
|
5025
|
-
threshold: number().optional(),
|
|
5026
|
-
});
|
|
5027
|
-
const baseTestCaseInputSchema = object({
|
|
5054
|
+
const testCaseInputSchema = object({
|
|
5028
5055
|
id: string(),
|
|
5029
5056
|
question: string(),
|
|
5030
|
-
evaluationParameters: evaluationParametersSchema.optional(),
|
|
5031
|
-
});
|
|
5032
|
-
const legacyTestCaseInputSchema = baseTestCaseInputSchema.extend({
|
|
5033
|
-
expectedOutcome: string(),
|
|
5034
|
-
});
|
|
5035
|
-
const v2TestCaseInputSchema = baseTestCaseInputSchema.extend({
|
|
5036
5057
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5037
5058
|
});
|
|
5038
|
-
const
|
|
5039
|
-
legacyTestCaseInputSchema,
|
|
5040
|
-
v2TestCaseInputSchema,
|
|
5041
|
-
]);
|
|
5042
|
-
const testCaseInputArraySchema = array(testCaseInputSchema).min(1, {
|
|
5043
|
-
message: 'The test suite is empty. Please provide at least one test case.',
|
|
5044
|
-
});
|
|
5059
|
+
const testCaseInputArraySchema = array(testCaseInputSchema);
|
|
5045
5060
|
object({
|
|
5046
5061
|
id: string(),
|
|
5047
5062
|
question: string(),
|
|
5048
5063
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5049
|
-
evaluationParameters: evaluationParametersSchema.optional(),
|
|
5050
5064
|
output: string().optional(),
|
|
5051
5065
|
isRunning: boolean().optional(),
|
|
5052
5066
|
error: string().optional(),
|
|
@@ -5097,19 +5111,69 @@ function importTestSuite(jsonContent) {
|
|
|
5097
5111
|
}
|
|
5098
5112
|
}
|
|
5099
5113
|
|
|
5114
|
+
function applyExpectedOutcomeChange(testCase, change) {
|
|
5115
|
+
const { index } = change;
|
|
5116
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5117
|
+
const target = expectedOutcome[index];
|
|
5118
|
+
if (!target) {
|
|
5119
|
+
return testCase;
|
|
5120
|
+
}
|
|
5121
|
+
switch (change.operation) {
|
|
5122
|
+
case 'set-value': {
|
|
5123
|
+
if (target.type === 'chips-input') {
|
|
5124
|
+
return testCase;
|
|
5125
|
+
}
|
|
5126
|
+
expectedOutcome[index] = {
|
|
5127
|
+
...target,
|
|
5128
|
+
value: change.value,
|
|
5129
|
+
};
|
|
5130
|
+
return { ...testCase, expectedOutcome };
|
|
5131
|
+
}
|
|
5132
|
+
case 'add-chip': {
|
|
5133
|
+
if (target.type !== 'chips-input') {
|
|
5134
|
+
return testCase;
|
|
5135
|
+
}
|
|
5136
|
+
expectedOutcome[index] = {
|
|
5137
|
+
...target,
|
|
5138
|
+
value: [...target.value, change.value],
|
|
5139
|
+
};
|
|
5140
|
+
return { ...testCase, expectedOutcome };
|
|
5141
|
+
}
|
|
5142
|
+
case 'remove-chip': {
|
|
5143
|
+
if (target.type !== 'chips-input') {
|
|
5144
|
+
return testCase;
|
|
5145
|
+
}
|
|
5146
|
+
expectedOutcome[index] = {
|
|
5147
|
+
...target,
|
|
5148
|
+
value: target.value.filter(chip => chip !== change.value),
|
|
5149
|
+
};
|
|
5150
|
+
return { ...testCase, expectedOutcome };
|
|
5151
|
+
}
|
|
5152
|
+
case 'set-evaluation-approach':
|
|
5153
|
+
return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
|
|
5154
|
+
}
|
|
5155
|
+
}
|
|
5100
5156
|
/**
|
|
5101
|
-
* Updates the evaluation approach for a
|
|
5102
|
-
*
|
|
5103
|
-
* @param approach - The new evaluation approach
|
|
5104
|
-
* @returns Updated test case with the new evaluation approach
|
|
5157
|
+
* Updates the evaluation approach for a specific expected outcome field.
|
|
5158
|
+
* Select fields always use exact matching.
|
|
5105
5159
|
*/
|
|
5106
|
-
function
|
|
5160
|
+
function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
|
|
5161
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5162
|
+
const target = expectedOutcome[fieldIndex];
|
|
5163
|
+
if (!target) {
|
|
5164
|
+
return testCase;
|
|
5165
|
+
}
|
|
5166
|
+
const currentEvaluationParameters = target.evaluationParameters;
|
|
5167
|
+
expectedOutcome[fieldIndex] = {
|
|
5168
|
+
...target,
|
|
5169
|
+
evaluationParameters: normalizeEvaluationParametersForField(target.type, {
|
|
5170
|
+
...currentEvaluationParameters,
|
|
5171
|
+
approach,
|
|
5172
|
+
}),
|
|
5173
|
+
};
|
|
5107
5174
|
return {
|
|
5108
5175
|
...testCase,
|
|
5109
|
-
|
|
5110
|
-
...testCase.evaluationParameters,
|
|
5111
|
-
approach: approach,
|
|
5112
|
-
},
|
|
5176
|
+
expectedOutcome,
|
|
5113
5177
|
};
|
|
5114
5178
|
}
|
|
5115
5179
|
|
|
@@ -29555,6 +29619,7 @@ class SemanticEvaluator {
|
|
|
29555
29619
|
}
|
|
29556
29620
|
}
|
|
29557
29621
|
async performEvaluation(request) {
|
|
29622
|
+
const threshold = request.evaluationParameters?.threshold ?? DEFAULT_SEMANTIC_PASS_SCORE;
|
|
29558
29623
|
try {
|
|
29559
29624
|
await this.initialize();
|
|
29560
29625
|
// Split expectedOutcome by newlines to create keywords array
|
|
@@ -29564,7 +29629,7 @@ class SemanticEvaluator {
|
|
|
29564
29629
|
.map(k => k.trim())
|
|
29565
29630
|
.filter(k => k.length > 0)
|
|
29566
29631
|
: [];
|
|
29567
|
-
const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords,
|
|
29632
|
+
const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, threshold);
|
|
29568
29633
|
const totalItems = keywordMatches.length;
|
|
29569
29634
|
// calculate the overall score by averaging the score of the keyword matches
|
|
29570
29635
|
const keywordScore = keywordMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);
|
|
@@ -29572,7 +29637,7 @@ class SemanticEvaluator {
|
|
|
29572
29637
|
const passed = keywordMatches.every(match => match.found);
|
|
29573
29638
|
const evaluationParameters = {
|
|
29574
29639
|
approach: EvaluationApproach.SEMANTIC,
|
|
29575
|
-
threshold
|
|
29640
|
+
threshold,
|
|
29576
29641
|
};
|
|
29577
29642
|
return {
|
|
29578
29643
|
testCaseId: request.testCaseId,
|
|
@@ -29594,7 +29659,7 @@ class SemanticEvaluator {
|
|
|
29594
29659
|
keywordMatches: [],
|
|
29595
29660
|
evaluationParameters: {
|
|
29596
29661
|
approach: EvaluationApproach.SEMANTIC,
|
|
29597
|
-
threshold
|
|
29662
|
+
threshold,
|
|
29598
29663
|
},
|
|
29599
29664
|
evaluationApproachResult: {
|
|
29600
29665
|
score: 0,
|
|
@@ -29861,57 +29926,78 @@ function performBleuEvaluation(request) {
|
|
|
29861
29926
|
|
|
29862
29927
|
class LLMEvaluationEngine {
|
|
29863
29928
|
async evaluateResponse(request, callback) {
|
|
29864
|
-
|
|
29865
|
-
const
|
|
29866
|
-
switch (approach) {
|
|
29867
|
-
case EvaluationApproach.BLEU: {
|
|
29868
|
-
const bleuResult = performBleuEvaluation(request);
|
|
29869
|
-
callback(bleuResult);
|
|
29870
|
-
break;
|
|
29871
|
-
}
|
|
29872
|
-
case EvaluationApproach.EXACT: {
|
|
29873
|
-
const exactResult = await performEvaluation(request);
|
|
29874
|
-
callback(exactResult);
|
|
29875
|
-
break;
|
|
29876
|
-
}
|
|
29877
|
-
case EvaluationApproach.ROUGE_1: {
|
|
29878
|
-
const rougeResult = await performRouge1Evaluation(request);
|
|
29879
|
-
callback(rougeResult);
|
|
29880
|
-
break;
|
|
29881
|
-
}
|
|
29882
|
-
case EvaluationApproach.ROUGE_L: {
|
|
29883
|
-
const rougeLResult = await performRougeLEvaluation(request);
|
|
29884
|
-
callback(rougeLResult);
|
|
29885
|
-
break;
|
|
29886
|
-
}
|
|
29887
|
-
case EvaluationApproach.SEMANTIC: {
|
|
29888
|
-
const semanticResult = await performSemanticEvaluation(request);
|
|
29889
|
-
callback(semanticResult);
|
|
29890
|
-
break;
|
|
29891
|
-
}
|
|
29892
|
-
default: {
|
|
29893
|
-
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
29894
|
-
const fallbackResult = await performEvaluation(request);
|
|
29895
|
-
callback(fallbackResult);
|
|
29896
|
-
}
|
|
29897
|
-
}
|
|
29898
|
-
}
|
|
29899
|
-
catch (error) {
|
|
29900
|
-
console.error('Evaluation failed:', error);
|
|
29901
|
-
const errorResult = {
|
|
29929
|
+
const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
|
|
29930
|
+
const fieldRequest = {
|
|
29902
29931
|
testCaseId: request.testCaseId,
|
|
29932
|
+
question: request.question,
|
|
29933
|
+
actualResponse: request.actualResponse,
|
|
29934
|
+
expectedOutcome: field.expectedValue,
|
|
29935
|
+
evaluationParameters: field.evaluationParameters,
|
|
29936
|
+
};
|
|
29937
|
+
const result = await this.evaluateField(fieldRequest);
|
|
29938
|
+
const fieldResult = {
|
|
29939
|
+
index: field.index,
|
|
29940
|
+
label: field.label,
|
|
29941
|
+
type: field.type,
|
|
29942
|
+
expectedValue: field.expectedValue,
|
|
29943
|
+
passed: result.passed,
|
|
29944
|
+
keywordMatches: result.keywordMatches,
|
|
29945
|
+
evaluationParameters: result.evaluationParameters,
|
|
29946
|
+
evaluationApproachResult: result.evaluationApproachResult,
|
|
29947
|
+
};
|
|
29948
|
+
return fieldResult;
|
|
29949
|
+
}));
|
|
29950
|
+
const fieldResults = settledResults.map((settledResult, index) => {
|
|
29951
|
+
const field = request.fields[index];
|
|
29952
|
+
if (settledResult.status === 'fulfilled') {
|
|
29953
|
+
return settledResult.value;
|
|
29954
|
+
}
|
|
29955
|
+
return {
|
|
29956
|
+
index: field.index,
|
|
29957
|
+
label: field.label,
|
|
29958
|
+
type: field.type,
|
|
29959
|
+
expectedValue: field.expectedValue,
|
|
29903
29960
|
passed: false,
|
|
29904
29961
|
keywordMatches: [],
|
|
29905
|
-
|
|
29906
|
-
evaluationParameters: request.evaluationParameters,
|
|
29962
|
+
evaluationParameters: field.evaluationParameters,
|
|
29907
29963
|
evaluationApproachResult: {
|
|
29908
29964
|
score: 0,
|
|
29909
|
-
approachUsed:
|
|
29965
|
+
approachUsed: field.evaluationParameters.approach,
|
|
29910
29966
|
},
|
|
29967
|
+
error: this.getSafeErrorMessage(settledResult.reason),
|
|
29911
29968
|
};
|
|
29912
|
-
|
|
29969
|
+
});
|
|
29970
|
+
const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
|
|
29971
|
+
const passed = fieldResults.every(field => field.passed && !field.error);
|
|
29972
|
+
callback({
|
|
29973
|
+
testCaseId: request.testCaseId,
|
|
29974
|
+
passed,
|
|
29975
|
+
keywordMatches,
|
|
29976
|
+
fieldResults,
|
|
29977
|
+
timestamp: new Date().toISOString(),
|
|
29978
|
+
});
|
|
29979
|
+
}
|
|
29980
|
+
async evaluateField(request) {
|
|
29981
|
+
const approach = request.evaluationParameters.approach;
|
|
29982
|
+
switch (approach) {
|
|
29983
|
+
case EvaluationApproach.BLEU:
|
|
29984
|
+
return performBleuEvaluation(request);
|
|
29985
|
+
case EvaluationApproach.EXACT:
|
|
29986
|
+
return performEvaluation(request);
|
|
29987
|
+
case EvaluationApproach.ROUGE_1:
|
|
29988
|
+
return performRouge1Evaluation(request);
|
|
29989
|
+
case EvaluationApproach.ROUGE_L:
|
|
29990
|
+
return performRougeLEvaluation(request);
|
|
29991
|
+
case EvaluationApproach.SEMANTIC:
|
|
29992
|
+
return performSemanticEvaluation(request);
|
|
29993
|
+
default:
|
|
29994
|
+
console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
|
|
29995
|
+
return performEvaluation(request);
|
|
29913
29996
|
}
|
|
29914
29997
|
}
|
|
29998
|
+
getSafeErrorMessage(error) {
|
|
29999
|
+
return error instanceof Error ? error.message : 'Field evaluation failed.';
|
|
30000
|
+
}
|
|
29915
30001
|
}
|
|
29916
30002
|
|
|
29917
30003
|
/**
|
|
@@ -29932,12 +30018,18 @@ class EvaluationService {
|
|
|
29932
30018
|
console.warn('⚠️ No output to evaluate for test case:', testCase.id);
|
|
29933
30019
|
return;
|
|
29934
30020
|
}
|
|
30021
|
+
const fields = (testCase.expectedOutcome || []).map((field, index) => ({
|
|
30022
|
+
index,
|
|
30023
|
+
label: field.label,
|
|
30024
|
+
type: field.type,
|
|
30025
|
+
expectedValue: getFieldExpectedValue(field),
|
|
30026
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
30027
|
+
}));
|
|
29935
30028
|
const evaluationRequest = {
|
|
29936
30029
|
testCaseId: testCase.id,
|
|
29937
30030
|
question: testCase.question,
|
|
29938
|
-
expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
|
|
29939
30031
|
actualResponse: testCase.output,
|
|
29940
|
-
|
|
30032
|
+
fields,
|
|
29941
30033
|
};
|
|
29942
30034
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
29943
30035
|
console.log('📊 Evaluation result received:', result);
|
|
@@ -29945,6 +30037,12 @@ class EvaluationService {
|
|
|
29945
30037
|
});
|
|
29946
30038
|
}
|
|
29947
30039
|
}
|
|
30040
|
+
/**
 * Returns the expected value of an expected-outcome field in the form that is
 * placed on the evaluation request.
 *
 * Chips fields keep their value as a list, which is flattened into a single
 * comma-separated string. Every other field type returns `field.value`
 * unchanged.
 *
 * @param {{ type: string, value: * }} field - Expected-outcome field.
 * @returns {*} A string for `chips-input` fields, otherwise `field.value`.
 */
function getFieldExpectedValue(field) {
    if (field.type !== 'chips-input') {
        return field.value;
    }
    if (Array.isArray(field.value)) {
        return field.value.join(', ');
    }
    // A chips field whose value is missing or not a list would make `.join`
    // throw and abort building the whole evaluation request. Treat "no value"
    // as an empty expectation and a lone scalar as a single chip.
    return field.value == null ? '' : String(field.value);
}
|
|
29948
30046
|
|
|
29949
30047
|
const Button = (props, children) => {
|
|
29950
30048
|
const { variant = 'primary', size = 'md', disabled = false, loading = false, onClick, type = 'button', 'class': className = '', icon, 'aria-label': ariaLabel, } = props;
|
|
@@ -29966,7 +30064,7 @@ const Button = (props, children) => {
|
|
|
29966
30064
|
return (index.h("button", { type: type, class: classes, disabled: disabled || loading, onClick: onClick, "aria-busy": loading, "aria-label": ariaLabel }, icon && index.h("span", { class: "icon" }, icon), children));
|
|
29967
30065
|
};
|
|
29968
30066
|
|
|
29969
|
-
const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isRunningAll, useSave = false, isSaving = false, onImport, onExportSuite, onExportResults, onRunAll, onSave, }) => {
|
|
30067
|
+
const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isRunningAll, useSave = false, isSaving = false, usePromptEditor = false, onImport, onExportSuite, onExportResults, onRunAll, onSave, }) => {
|
|
29970
30068
|
let fileInputRef;
|
|
29971
30069
|
const handleFileSelect = () => {
|
|
29972
30070
|
fileInputRef?.click();
|
|
@@ -29979,7 +30077,7 @@ const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isR
|
|
|
29979
30077
|
onImport(file);
|
|
29980
30078
|
}
|
|
29981
30079
|
};
|
|
29982
|
-
return (index.h("header", { class: "test-runner-header" }, index.h("div", { class: "test-runner-header__left" }, index.h("input", { class: "test-runner-header--hidden", type: "file", ref: el => (fileInputRef = el), onChange: handleFileChange, accept: ".json,application/json" }), index.h(Button, { variant: "secondary", size: "md", onClick: handleFileSelect, icon: "\u2191" }, "Import Test Suite"), index.h(Button, { variant: "secondary", size: "md", onClick: onExportSuite, disabled: isExportingTestSuite, loading: isExportingTestSuite, icon: isExportingTestSuite ? '⏳' : '↓' }, isExportingTestSuite ? 'Exporting...' : 'Export Test Suite')), index.h("div", { class: "test-runner-header__right" }, index.h(Button, { variant: "secondary", size: "md", icon: "\u2699\uFE0F" }, "Prompt Editor"), index.h(Button, { variant: "secondary", size: "md", onClick: onExportResults, disabled: isExportingTestResults, loading: isExportingTestResults, icon: isExportingTestResults ? '⏳' : '↓' }, isExportingTestResults ? 'Exporting...' : 'Export Test Results'), useSave && (index.h(Button, { variant: "secondary", size: "md", onClick: onSave, disabled: isSaving, loading: isSaving, icon: isSaving ? '⏳' : '💾' }, isSaving ? 'Saving...' : 'Save')), index.h(Button, { "aria-label": "Run All", variant: "primary", size: "md", onClick: onRunAll, disabled: isRunningAll, loading: isRunningAll }, isRunningAll ? 'Running...' : 'Run All'))));
|
|
30080
|
+
return (index.h("header", { class: "test-runner-header" }, index.h("div", { class: "test-runner-header__left" }, index.h("input", { class: "test-runner-header--hidden", type: "file", ref: el => (fileInputRef = el), onChange: handleFileChange, accept: ".json,application/json" }), index.h(Button, { variant: "secondary", size: "md", onClick: handleFileSelect, icon: "\u2191" }, "Import Test Suite"), index.h(Button, { variant: "secondary", size: "md", onClick: onExportSuite, disabled: isExportingTestSuite, loading: isExportingTestSuite, icon: isExportingTestSuite ? '⏳' : '↓' }, isExportingTestSuite ? 'Exporting...' : 'Export Test Suite')), index.h("div", { class: "test-runner-header__right" }, usePromptEditor && (index.h(Button, { variant: "secondary", size: "md", icon: "\u2699\uFE0F" }, "Prompt Editor")), index.h(Button, { variant: "secondary", size: "md", onClick: onExportResults, disabled: isExportingTestResults, loading: isExportingTestResults, icon: isExportingTestResults ? '⏳' : '↓' }, isExportingTestResults ? 'Exporting...' : 'Export Test Results'), useSave && (index.h(Button, { variant: "secondary", size: "md", onClick: onSave, disabled: isSaving, loading: isSaving, icon: isSaving ? '⏳' : '💾' }, isSaving ? 'Saving...' : 'Save')), index.h(Button, { "aria-label": "Run All", variant: "primary", size: "md", onClick: onRunAll, disabled: isRunningAll, loading: isRunningAll }, isRunningAll ? 'Running...' : 'Run All'))));
|
|
29983
30081
|
};
|
|
29984
30082
|
|
|
29985
30083
|
const ResponseOutput = ({ output, isRunning, }) => {
|
|
@@ -29987,7 +30085,9 @@ const ResponseOutput = ({ output, isRunning, }) => {
|
|
|
29987
30085
|
};
|
|
29988
30086
|
|
|
29989
30087
|
/**
 * Functional component for the evaluation cell of a test-case row.
 *
 * Without a `result` it renders a placeholder that reads "Evaluating..."
 * while `isRunning` is truthy and is empty otherwise. With a `result` it
 * renders one card per entry of `result.fieldResults`, showing the field
 * label, the evaluation approach, a PASSED/FAILED badge, the error text when
 * present, the score to two decimals and the found/total keyword match count.
 *
 * @param {{ result: object, isRunning: boolean }} props
 * @returns The vnode produced by `index.h`.
 */
const EvaluationSummary = ({ result, isRunning, }) => {
    const fieldResults = result?.fieldResults || [];
    const hasFieldResults = fieldResults.length > 0;
    // Builds the card for one evaluated field.
    const renderFieldResult = (fieldResult) => {
        const header = index.h("div", { class: "evaluation-summary__field-header" },
            index.h("span", { class: "evaluation-summary__field-label" }, fieldResult.label),
            index.h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", fieldResult.evaluationParameters.approach));
        const details = index.h("div", { class: "evaluation-summary__field-details" },
            index.h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'),
            fieldResult.error && (index.h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)),
            index.h("span", null, "Score: ", fieldResult.evaluationApproachResult.score.toFixed(2)),
            index.h("span", null, "Matches:", ' ', fieldResult.keywordMatches.filter(match => match.found).length, "/", fieldResult.keywordMatches.length));
        return index.h("div", { class: "evaluation-summary__field-result" }, header, details);
    };
    let content;
    if (result) {
        const fieldList = hasFieldResults
            ? index.h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(fieldResult => renderFieldResult(fieldResult)))
            : null;
        content = index.h("div", { class: "evaluation-summary__result" }, fieldList);
    }
    else {
        content = index.h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : '');
    }
    return index.h("div", { class: "evaluation-summary" }, content);
};
|
|
29992
30092
|
|
|
29993
30093
|
const IconButton = (props, children) => {
|
|
@@ -30023,6 +30123,24 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30023
30123
|
const emit = (detail) => onExpectedOutcomeChange({
|
|
30024
30124
|
detail,
|
|
30025
30125
|
});
|
|
30126
|
+
/**
 * Builds the `app-select` config for the evaluation-approach selector of the
 * expected-outcome field at position `index`.
 *
 * Note: the `index` parameter shadows the bundle-level `index` namespace
 * inside this function only.
 *
 * @param {number} index - Position of the field; used to make `name` unique.
 * @param {Array} optionList - Approaches offered by the selector.
 * @returns {object} Select configuration defaulting to the exact approach.
 */
const buildEvaluationConfig = (index, optionList) => {
    const selectorName = `expectedOutcomeEvaluation-${index}`;
    return {
        name: selectorName,
        fieldType: FormFieldType.SELECT,
        label: 'Evaluation Approach',
        placeholder: 'Select evaluation approach…',
        required: true,
        optionList,
        defaultValue: EvaluationApproach.EXACT,
    };
};
|
|
30135
|
+
/**
 * Renders the evaluation-approach selector shown under an expected-outcome
 * field.
 *
 * The offered approaches come from `getAllowedApproachesForFieldType` for the
 * field's type. Choosing one emits a `set-evaluation-approach` change for
 * this test case and field position.
 *
 * @param {object} field - Expected-outcome field being rendered.
 * @param {number} index$1 - Position of the field within the test case.
 * @returns The `app-select` vnode.
 */
const renderEvaluationSelector = (field, index$1) => {
    const allowedApproaches = getAllowedApproachesForFieldType(field.type);
    const handleApproachChange = (e) => emit({
        testCaseId,
        index: index$1,
        operation: 'set-evaluation-approach',
        value: e.detail.value,
    });
    const selectProps = {
        config: buildEvaluationConfig(index$1, allowedApproaches),
        value: field.evaluationParameters?.approach,
        onValueChange: handleApproachChange,
    };
    return index.h("app-select", selectProps);
};
|
|
30026
30144
|
return (index.h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index$1) => {
|
|
30027
30145
|
if (field.type === 'textarea') {
|
|
30028
30146
|
const config = {
|
|
@@ -30030,15 +30148,15 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30030
30148
|
fieldType: FormFieldType.TEXT_AREA,
|
|
30031
30149
|
label: field.label,
|
|
30032
30150
|
placeholder: field.placeholder,
|
|
30033
|
-
required:
|
|
30151
|
+
required: true,
|
|
30034
30152
|
rows: field.rows || 2,
|
|
30035
30153
|
};
|
|
30036
|
-
return (index.h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30154
|
+
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30037
30155
|
testCaseId,
|
|
30038
30156
|
index: index$1,
|
|
30039
30157
|
operation: 'set-value',
|
|
30040
30158
|
value: e.detail.value,
|
|
30041
|
-
}) }));
|
|
30159
|
+
}) }), renderEvaluationSelector(field, index$1)));
|
|
30042
30160
|
}
|
|
30043
30161
|
if (field.type === 'chips-input') {
|
|
30044
30162
|
const config = {
|
|
@@ -30046,9 +30164,9 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30046
30164
|
fieldType: FormFieldType.CHIPS,
|
|
30047
30165
|
label: field.label,
|
|
30048
30166
|
placeholder: field.placeholder,
|
|
30049
|
-
required:
|
|
30167
|
+
required: true,
|
|
30050
30168
|
};
|
|
30051
|
-
return (index.h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
30169
|
+
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
|
|
30052
30170
|
testCaseId,
|
|
30053
30171
|
index: index$1,
|
|
30054
30172
|
operation: 'add-chip',
|
|
@@ -30058,7 +30176,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30058
30176
|
index: index$1,
|
|
30059
30177
|
operation: 'remove-chip',
|
|
30060
30178
|
value: e.detail.value,
|
|
30061
|
-
}) }));
|
|
30179
|
+
}) }), renderEvaluationSelector(field, index$1)));
|
|
30062
30180
|
}
|
|
30063
30181
|
if (field.type === 'select') {
|
|
30064
30182
|
const config = {
|
|
@@ -30066,26 +30184,26 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
|
|
|
30066
30184
|
fieldType: FormFieldType.SELECT,
|
|
30067
30185
|
label: field.label,
|
|
30068
30186
|
placeholder: field.placeholder,
|
|
30069
|
-
required:
|
|
30187
|
+
required: true,
|
|
30070
30188
|
optionList: field.options,
|
|
30071
30189
|
};
|
|
30072
|
-
return (index.h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30190
|
+
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
|
|
30073
30191
|
testCaseId,
|
|
30074
30192
|
index: index$1,
|
|
30075
30193
|
operation: 'set-value',
|
|
30076
30194
|
value: e.detail.value,
|
|
30077
|
-
}) }));
|
|
30195
|
+
}) }), renderEvaluationSelector(field, index$1)));
|
|
30078
30196
|
}
|
|
30079
|
-
return (index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30197
|
+
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30080
30198
|
testCaseId,
|
|
30081
30199
|
index: index$1,
|
|
30082
30200
|
operation: 'set-value',
|
|
30083
30201
|
value: e.target.value,
|
|
30084
|
-
}) })));
|
|
30202
|
+
}) })), renderEvaluationSelector(field, index$1)));
|
|
30085
30203
|
})));
|
|
30086
30204
|
};
|
|
30087
30205
|
|
|
30088
|
-
const LLMTestCaseRow = ({ testCase, onRun, onDelete,
|
|
30206
|
+
const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
|
|
30089
30207
|
const questionConfig = {
|
|
30090
30208
|
name: 'question',
|
|
30091
30209
|
fieldType: FormFieldType.TEXT_AREA,
|
|
@@ -30095,26 +30213,17 @@ const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTes
|
|
|
30095
30213
|
required: true,
|
|
30096
30214
|
rows: 3,
|
|
30097
30215
|
};
|
|
30098
|
-
const evaluationConfig = {
|
|
30099
|
-
name: 'EvaluationApproach',
|
|
30100
|
-
fieldType: FormFieldType.SELECT,
|
|
30101
|
-
label: 'Evaluation',
|
|
30102
|
-
placeholder: 'Select evaluation approach…',
|
|
30103
|
-
required: true,
|
|
30104
|
-
optionList: EvaluationApproachValues,
|
|
30105
|
-
defaultValue: EvaluationApproach.EXACT,
|
|
30106
|
-
};
|
|
30107
30216
|
return (index.h("div", { class: "test-case-row", key: testCase.id }, index.h("div", { class: "test-case-row__input-column" }, index.h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
|
|
30108
30217
|
detail: {
|
|
30109
30218
|
testCaseId: testCase.id,
|
|
30110
30219
|
key: 'question',
|
|
30111
30220
|
value: e.detail.value,
|
|
30112
30221
|
},
|
|
30113
|
-
}) }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })
|
|
30222
|
+
}) }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), index.h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), index.h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), index.h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
30114
30223
|
};
|
|
30115
30224
|
|
|
30116
|
-
const LLMTestCases = ({ testCases, onRun, onDelete,
|
|
30117
|
-
return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete,
|
|
30225
|
+
/**
 * Functional component for the test-case table: a row of column headers, one
 * `LLMTestCaseRow` per test case, and a trailing "+ Add Question" button.
 *
 * @param {object} props
 * @param {Array} props.testCases - Test cases to list.
 * @param {Function} props.onRun - Forwarded to every row.
 * @param {Function} props.onDelete - Forwarded to every row.
 * @param {Function} props.onAddTestCase - Click handler of the add button.
 * @param {Function} props.handleTestCaseChange - Forwarded to every row.
 * @param {Function} props.onExpectedOutcomeChange - Forwarded to every row.
 * @returns The vnode produced by `index.h`.
 */
const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
    const headerCells = ["Input", "Output", "Evaluation", "Actions"].map(title => index.h("div", { class: "test-cases__column-header" }, title));
    // Spread so the header row receives the cells as individual children.
    const columnHeaders = index.h("div", { class: "test-cases__column-headers" }, ...headerCells);
    const rows = testCases.map(testCase => (index.h(LLMTestCaseRow, {
        testCase: testCase,
        onRun: onRun,
        onDelete: onDelete,
        handleTestCaseChange: handleTestCaseChange,
        onExpectedOutcomeChange: onExpectedOutcomeChange,
    })));
    const addSection = index.h("div", { class: "test-cases__add-section" },
        index.h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"));
    return index.h("div", { class: "test-cases" }, columnHeaders, rows, addSection);
};
|
|
30119
30228
|
|
|
30120
30229
|
const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 
0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
|
|
@@ -30125,11 +30234,11 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
|
|
|
30125
30234
|
|
|
30126
30235
|
const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
|
|
30127
30236
|
|
|
30128
|
-
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30237
|
+
/** Returns the stylesheet for a test-case row: the four-column row grid, its input column, and the expected-outcome renderer with its bordered per-field groups. The row collapses to a single column at 1200px and below, and padding and minimum height are reduced at 768px and below. */
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30129
30238
|
|
|
30130
30239
|
const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
|
|
30131
30240
|
|
|
30132
|
-
const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-
|
|
30241
|
+
/** Returns the stylesheet for the evaluation summary cell: the cell container, the per-field result cards (header, label, approach, details), the passed/failed status badges, the error message text, the dashed placeholder and the result wrapper, plus responsive tweaks at 1200px and 768px. */
const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__field-results{display:flex;flex-direction:column;gap:var(--spacing-2);margin-top:var(--spacing-2)}.evaluation-summary__field-result{border:var(--border-width) solid var(--border);border-radius:var(--radius-md);padding:var(--spacing-2);display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-header{display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-label{font-weight:var(--font-weight-semibold);font-size:var(--font-size-xs)}.evaluation-summary__field-approach{color:var(--muted-foreground);font-size:11px}.evaluation-summary__field-details{display:flex;flex-direction:column;gap:var(--spacing-1);font-size:var(--font-size-xs)}.evaluation-summary__field-status{width:fit-content;padding:2px var(--spacing-2);border-radius:var(--radius-sm);font-size:11px;font-weight:var(--font-weight-semibold);border:var(--border-width) solid transparent}.evaluation-summary__field-status--passed{background:var(--success);color:var(--success-foreground);border-color:var(--success)}.evaluation-summary__field-status--failed{background:var(--destructive);color:var(--destructive-foreground);border-color:var(--destructive)}.evaluation-summary__error-message{color:var(--destructive);font-size:var(--font-size-xs)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
|
|
30133
30242
|
|
|
30134
30243
|
/** Returns the stylesheet for the response output cell: the cell container, the scrollable pre-wrapped content box (capped at 250px height) and the dashed placeholder, plus responsive tweaks at 1200px and 768px. */
const responseOutputCss = () => `.response-output{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.response-output__content{background:var(--muted);border:var(--border-width) solid var(--border);border-radius:var(--radius);padding:var(--spacing-4);font-size:var(--font-size-sm);line-height:var(--line-height-relaxed);color:var(--foreground);white-space:pre-wrap;word-wrap:break-word;flex:1;overflow-y:auto;max-height:250px;overflow-x:scroll}.response-output__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}@media (max-width: 1200px){.response-output{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.response-output{padding:var(--spacing-4)}}`;
|
|
30135
30244
|
|
|
@@ -30149,6 +30258,7 @@ const LLMTestRunner = class {
|
|
|
30149
30258
|
save;
|
|
30150
30259
|
delayMs = 500;
|
|
30151
30260
|
useSave = false;
|
|
30261
|
+
usePromptEditor = false;
|
|
30152
30262
|
initialTestCases;
|
|
30153
30263
|
defaultExpectedOutcomeSchema;
|
|
30154
30264
|
testCases = [
|
|
@@ -30162,9 +30272,6 @@ const LLMTestRunner = class {
|
|
|
30162
30272
|
value: '',
|
|
30163
30273
|
},
|
|
30164
30274
|
],
|
|
30165
|
-
evaluationParameters: {
|
|
30166
|
-
approach: EvaluationApproach.EXACT,
|
|
30167
|
-
},
|
|
30168
30275
|
isRunning: false,
|
|
30169
30276
|
},
|
|
30170
30277
|
];
|
|
@@ -30271,52 +30378,13 @@ const LLMTestRunner = class {
|
|
|
30271
30378
|
deleteTestCase(id) {
|
|
30272
30379
|
this.testCases = this.testCases.filter(tc => tc.id !== id);
|
|
30273
30380
|
}
|
|
30274
|
-
updateApproach(testCase, approach) {
|
|
30275
|
-
if (testCase) {
|
|
30276
|
-
const updated = updateApproach(testCase, approach);
|
|
30277
|
-
this.updateTestCase(testCase.id, {
|
|
30278
|
-
evaluationParameters: updated.evaluationParameters,
|
|
30279
|
-
});
|
|
30280
|
-
}
|
|
30281
|
-
}
|
|
30282
30381
|
/**
 * Applies an expected-outcome change event to the matching test case.
 *
 * `event.detail` carries `testCaseId` plus the change itself. The remaining
 * detail properties are passed to `applyExpectedOutcomeChange` for the test
 * case with that id; every other test case is kept as is. `this.testCases` is
 * reassigned to the resulting array.
 *
 * @param {{ detail: object }} event - Change event from the expected-outcome renderer.
 */
handleExpectedOutcomeChange = (event) => {
    const { testCaseId, ...change } = event.detail;
    this.testCases = this.testCases.map(tc => (tc.id === testCaseId ? applyExpectedOutcomeChange(tc, change) : tc));
};
|
|
30322
30390
|
async evaluateResponse(testCase) {
|
|
@@ -30416,7 +30484,7 @@ const LLMTestRunner = class {
|
|
|
30416
30484
|
}
|
|
30417
30485
|
}
|
|
30418
30486
|
render() {
|
|
30419
|
-
return (index.h("div", { key: '
|
|
30487
|
+
return (index.h("div", { key: '323b5e140740bb72d4767c0763c382a6b125caa2', class: "test-runner-container" }, index.h(LLMTestRunnerHeader, { key: 'e1e2efdf6cfe5f406de7e26e745b5775f307d294', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, usePromptEditor: this.usePromptEditor, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), index.h(ErrorMessage, { key: 'c6a34b81f66c6cd835eb8bc253f7a28d68c49874', message: this.error, onClear: () => (this.error = '') }), index.h("div", { key: '674daad8a2754afc8144463e9a173690a3d1d589', class: "test-runner-container__content" }, index.h(LLMTestCases, { key: '96c1aeae37f56378b7a9b5d54be73c5df48ae448', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
|
|
30420
30488
|
}
|
|
30421
30489
|
};
|
|
30422
30490
|
LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));
|