llm-testrunner-components 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +1 -1
  2. package/README.md +165 -242
  3. package/dist/cjs/index.cjs.js +298 -232
  4. package/dist/cjs/index.cjs.js.map +1 -1
  5. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
  6. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
  7. package/dist/collection/components/llm-test-runner/llm-test-runner.js +6 -49
  8. package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
  9. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
  10. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
  11. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
  12. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
  13. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
  14. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
  15. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
  16. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
  17. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
  18. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
  19. package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
  20. package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
  21. package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
  22. package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
  23. package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
  24. package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
  25. package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
  26. package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
  27. package/dist/collection/lib/evaluation/index.js +0 -4
  28. package/dist/collection/lib/evaluation/index.js.map +1 -1
  29. package/dist/collection/lib/evaluation/types.js.map +1 -1
  30. package/dist/collection/lib/import-export/test-results-csv.js +47 -33
  31. package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
  32. package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
  33. package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
  34. package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
  35. package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
  36. package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
  37. package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
  38. package/dist/collection/schemas/expected-outcome.js +20 -2
  39. package/dist/collection/schemas/expected-outcome.js.map +1 -1
  40. package/dist/collection/schemas/test-case.js +2 -20
  41. package/dist/collection/schemas/test-case.js.map +1 -1
  42. package/dist/collection/types/llm-test-runner.js.map +1 -1
  43. package/dist/collection/types/test-case.js.map +1 -1
  44. package/dist/components/index.js +1 -1
  45. package/dist/components/llm-test-runner.js +1 -1
  46. package/dist/components/p-Bb89MYYu.js +7 -0
  47. package/dist/components/p-Bb89MYYu.js.map +1 -0
  48. package/dist/esm/index.js +298 -232
  49. package/dist/esm/index.js.map +1 -1
  50. package/dist/llm-testrunner/index.esm.js +2 -2
  51. package/dist/llm-testrunner/index.esm.js.map +1 -1
  52. package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +0 -1
  53. package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
  54. package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
  55. package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
  56. package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
  57. package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
  58. package/dist/types/lib/evaluation/index.d.ts +0 -1
  59. package/dist/types/lib/evaluation/types.d.ts +26 -0
  60. package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
  61. package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
  62. package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
  63. package/dist/types/schemas/expected-outcome.d.ts +65 -17
  64. package/dist/types/schemas/test-case.d.ts +51 -95
  65. package/dist/types/types/llm-test-runner.d.ts +1 -1
  66. package/dist/types/types/test-case.d.ts +1 -1
  67. package/package.json +9 -2
  68. package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
  69. package/dist/components/p-BF90yb1z.js +0 -7
  70. package/dist/components/p-BF90yb1z.js.map +0 -1
  71. package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
@@ -64,20 +64,6 @@ class RateLimitedFetcher {
64
64
  }
65
65
  }
66
66
 
67
- var EvaluationApproach;
68
- (function (EvaluationApproach) {
69
- EvaluationApproach["EXACT"] = "exact";
70
- EvaluationApproach["SEMANTIC"] = "semantic";
71
- EvaluationApproach["ROUGE_1"] = "rouge-1";
72
- EvaluationApproach["ROUGE_L"] = "rouge-L";
73
- EvaluationApproach["BLEU"] = "bleu";
74
- })(EvaluationApproach || (EvaluationApproach = {}));
75
- // Array of all evaluation approach values for UI components
76
- const EvaluationApproachValues = Object.values(EvaluationApproach);
77
- const DEFAULT_ROUGE_PASS_SCORE = 0.7;
78
- const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
79
- const DEFAULT_BLEU_PASS_SCORE = 0.7;
80
-
81
67
  /**
82
68
  * Reads a file asynchronously and returns its content as a string
83
69
  * @param file - The File object to read
@@ -120,23 +106,10 @@ function formatTestSuiteAsJson(testCases) {
120
106
  id: testCase.id,
121
107
  question: testCase.question,
122
108
  expectedOutcome: testCase.expectedOutcome,
123
- evaluationParameters: testCase.evaluationParameters,
124
109
  }));
125
110
  return JSON.stringify(exportData, null, 2);
126
111
  }
127
112
 
128
- function serializeExpectedOutcome(expectedOutcome, joinWith = '\n') {
129
- return (expectedOutcome || [])
130
- .map(field => {
131
- if (field.type === 'chips-input') {
132
- return field.value.join(', ');
133
- }
134
- return field.value;
135
- })
136
- .join(joinWith)
137
- .trim();
138
- }
139
-
140
113
  /**
141
114
  * Escapes a CSV field by wrapping it in quotes if it contains special characters
142
115
  * @param field - The field to escape
@@ -155,48 +128,63 @@ function escapeCsvField(field) {
155
128
  */
156
129
  function exportTestResultsToCsv(testCases) {
157
130
  const csvRows = [];
131
+ const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
158
132
  // Add header row
159
133
  const headers = [
160
134
  'Question',
161
- 'Expected Keywords',
162
- 'Generated Keywords',
163
- 'Keywords Match',
164
135
  'Response Time (s)',
165
- 'Evaluation Approach',
166
- 'Evaluation Score',
167
136
  ];
137
+ for (let i = 1; i <= maxFieldCount; i++) {
138
+ headers.push('Field Name');
139
+ headers.push('Expected Keywords');
140
+ headers.push('Generated Keywords');
141
+ headers.push('Evaluation Strategy');
142
+ headers.push('Passed Evaluation');
143
+ headers.push('Keyword Match');
144
+ headers.push('Score');
145
+ if (i < maxFieldCount) {
146
+ headers.push('');
147
+ }
148
+ }
168
149
  csvRows.push(headers.join(','));
169
- // Add data rows
150
+ // Add data rows (one row per test case)
170
151
  testCases.forEach(testCase => {
171
- const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
172
- const evaluationApproach = testCase.evaluationParameters?.approach || '';
173
- const score = testCase.evaluationResult?.evaluationApproachResult?.score;
174
- const evaluationScore = score !== undefined ? score.toString() : '';
175
- let generatedKeywords = '';
176
- let keywordsMatch = '';
177
- if (testCase.evaluationResult) {
178
- const foundKeywords = testCase.evaluationResult.keywordMatches
179
- .filter(match => match.found)
180
- .map(match => match.keyword);
181
- generatedKeywords = foundKeywords.join('; ');
182
- // Calculate match percentages
183
- const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
184
- const totalKeywords = testCase.evaluationResult.keywordMatches.length;
185
- keywordsMatch =
186
- totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
187
- }
188
152
  const responseTime = testCase.responseTime
189
153
  ? (testCase.responseTime / 1000).toFixed(3)
190
154
  : 'N/A';
191
- const row = [
192
- escapeCsvField(testCase.question),
193
- escapeCsvField(expectedOutcome),
194
- escapeCsvField(generatedKeywords),
195
- keywordsMatch,
196
- responseTime,
197
- escapeCsvField(evaluationApproach),
198
- escapeCsvField(evaluationScore),
199
- ];
155
+ const row = [escapeCsvField(testCase.question), responseTime];
156
+ for (let i = 0; i < maxFieldCount; i++) {
157
+ const field = testCase.expectedOutcome?.[i];
158
+ const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
159
+ const expectedKeywords = fieldResult?.expectedValue ??
160
+ (field
161
+ ? field.type === 'chips-input'
162
+ ? field.value.join(', ')
163
+ : field.value
164
+ : '');
165
+ const generatedKeywords = (fieldResult?.keywordMatches || [])
166
+ .filter(match => match.found)
167
+ .map(match => match.keyword)
168
+ .join('; ');
169
+ const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
170
+ const totalMatches = fieldResult?.keywordMatches?.length || 0;
171
+ const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
172
+ const score = fieldResult?.evaluationApproachResult?.score !== undefined
173
+ ? fieldResult.evaluationApproachResult.score.toFixed(2)
174
+ : '';
175
+ row.push(escapeCsvField(field?.label || ''));
176
+ row.push(escapeCsvField(expectedKeywords || ''));
177
+ row.push(escapeCsvField(generatedKeywords));
178
+ row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
179
+ field?.evaluationParameters?.approach ||
180
+ ''));
181
+ row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
182
+ row.push(keywordMatch);
183
+ row.push(score);
184
+ if (i < maxFieldCount - 1) {
185
+ row.push('');
186
+ }
187
+ }
200
188
  csvRows.push(row.join(','));
201
189
  });
202
190
  return csvRows.join('\n');
@@ -255,6 +243,43 @@ function v4(options, buf, offset) {
255
243
  return unsafeStringify(rnds);
256
244
  }
257
245
 
246
+ var EvaluationApproach;
247
+ (function (EvaluationApproach) {
248
+ EvaluationApproach["EXACT"] = "exact";
249
+ EvaluationApproach["SEMANTIC"] = "semantic";
250
+ EvaluationApproach["ROUGE_1"] = "rouge-1";
251
+ EvaluationApproach["ROUGE_L"] = "rouge-L";
252
+ EvaluationApproach["BLEU"] = "bleu";
253
+ })(EvaluationApproach || (EvaluationApproach = {}));
254
+ // Array of all evaluation approach values for UI components
255
+ const EvaluationApproachValues = Object.values(EvaluationApproach);
256
+ const DEFAULT_ROUGE_PASS_SCORE = 0.7;
257
+ const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
258
+ const DEFAULT_BLEU_PASS_SCORE = 0.7;
259
+
260
+ const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
261
+ function getAllowedApproachesForFieldType(fieldType) {
262
+ if (fieldType === 'select') {
263
+ return SELECT_ONLY_APPROACHES;
264
+ }
265
+ return EvaluationApproachValues;
266
+ }
267
+ function isApproachAllowedForFieldType(fieldType, approach) {
268
+ return getAllowedApproachesForFieldType(fieldType).includes(approach);
269
+ }
270
+ function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
271
+ const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
272
+ const fallbackApproach = allowedApproaches[0];
273
+ const rawApproach = evaluationParameters?.approach;
274
+ const approach = rawApproach && allowedApproaches.includes(rawApproach)
275
+ ? rawApproach
276
+ : fallbackApproach;
277
+ return {
278
+ ...evaluationParameters,
279
+ approach,
280
+ };
281
+ }
282
+
258
283
  const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
259
284
  {
260
285
  type: 'textarea',
@@ -263,6 +288,12 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
263
288
  rows: 2,
264
289
  },
265
290
  ];
291
+ function normalizeExpectedOutcomeField(field) {
292
+ return {
293
+ ...field,
294
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
295
+ };
296
+ }
266
297
  /**
267
298
  * Creates a new test case with default values
268
299
  * @returns A new TestCase object with a unique ID
@@ -272,9 +303,6 @@ function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA)
272
303
  id: v4(),
273
304
  question: '',
274
305
  expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
275
- evaluationParameters: {
276
- approach: EvaluationApproach.EXACT,
277
- },
278
306
  isRunning: false,
279
307
  };
280
308
  }
@@ -284,35 +312,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
284
312
  return {
285
313
  type: 'text',
286
314
  label: schemaField.label,
287
- required: schemaField.required,
288
315
  placeholder: schemaField.placeholder,
289
316
  value: '',
317
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
290
318
  };
291
319
  case 'textarea':
292
320
  return {
293
321
  type: 'textarea',
294
322
  label: schemaField.label,
295
- required: schemaField.required,
296
323
  placeholder: schemaField.placeholder,
297
324
  rows: schemaField.rows,
298
325
  value: '',
326
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
299
327
  };
300
328
  case 'chips-input':
301
329
  return {
302
330
  type: 'chips-input',
303
331
  label: schemaField.label,
304
- required: schemaField.required,
305
332
  placeholder: schemaField.placeholder,
306
333
  value: [],
334
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
307
335
  };
308
336
  case 'select':
309
337
  return {
310
338
  type: 'select',
311
339
  label: schemaField.label,
312
- required: schemaField.required,
313
340
  placeholder: schemaField.placeholder,
314
341
  value: '',
315
342
  options: schemaField.options,
343
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
316
344
  };
317
345
  default: {
318
346
  const _exhaustiveCheck = schemaField;
@@ -323,32 +351,19 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
323
351
  function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
324
352
  return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
325
353
  }
326
- function migrateLegacyExpectedOutcomeString(value) {
327
- return [
328
- {
329
- type: 'textarea',
330
- label: 'Expected Outcome',
331
- value,
332
- },
333
- ];
334
- }
335
354
  /**
336
355
  * Creates a runtime test case from validated input data.
337
- * The input is expected to already satisfy `TestCaseInput` (legacy string or v2 shape),
338
- * and this function only performs normalization/defaulting (including legacy migration).
356
+ * The input is expected to already satisfy `TestCaseInput`,
357
+ * and this function only performs normalization/defaulting.
339
358
  *
340
359
  * @param data - Validated test case input
341
360
  * @returns A normalized TestCase object with runtime defaults applied
342
361
  */
343
362
  function createTestCaseFromInput(data) {
344
- let expectedOutcome;
345
- if (typeof data.expectedOutcome === 'string') {
346
- expectedOutcome = migrateLegacyExpectedOutcomeString(data.expectedOutcome);
347
- }
348
- else {
349
- expectedOutcome = data.expectedOutcome;
350
- }
351
- return { ...data, expectedOutcome };
363
+ return {
364
+ ...data,
365
+ expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
366
+ };
352
367
  }
353
368
 
354
369
  /** A special constant with type `never` */
@@ -4938,27 +4953,43 @@ function superRefine(fn) {
4938
4953
  const nonEmptyString = string().trim().min(1);
4939
4954
  const optionalPositiveInt = number().int().positive().optional();
4940
4955
  const optionalString = string().optional();
4941
- const optionalBoolean = boolean().optional();
4942
4956
  const selectOptionsSchema = array(nonEmptyString).min(1);
4957
+ const optionalNumber = number().optional();
4958
+ const evaluationParametersSchema = object({
4959
+ approach: _enum(EvaluationApproach),
4960
+ threshold: optionalNumber,
4961
+ });
4962
+ const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
4963
+ if (!isApproachAllowedForFieldType('select', parameters.approach)) {
4964
+ ctx.addIssue({
4965
+ code: 'custom',
4966
+ path: ['approach'],
4967
+ message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
4968
+ });
4969
+ }
4970
+ });
4943
4971
  const defaultExpectedOutcomeBaseSchema = object({
4944
4972
  label: nonEmptyString,
4945
- required: optionalBoolean,
4946
4973
  placeholder: optionalString,
4947
4974
  });
4948
4975
  const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
4949
4976
  text: baseSchema.extend({
4950
4977
  type: literal('text'),
4978
+ evaluationParameters: evaluationParametersSchema.optional(),
4951
4979
  }),
4952
4980
  textarea: baseSchema.extend({
4953
4981
  type: literal('textarea'),
4954
4982
  rows: optionalPositiveInt,
4983
+ evaluationParameters: evaluationParametersSchema.optional(),
4955
4984
  }),
4956
4985
  chipsInput: baseSchema.extend({
4957
4986
  type: literal('chips-input'),
4987
+ evaluationParameters: evaluationParametersSchema.optional(),
4958
4988
  }),
4959
4989
  select: baseSchema.extend({
4960
4990
  type: literal('select'),
4961
4991
  options: selectOptionsSchema,
4992
+ evaluationParameters: selectEvaluationParametersSchema.optional(),
4962
4993
  }),
4963
4994
  });
4964
4995
  function hasDuplicateChips(values) {
@@ -5020,33 +5051,16 @@ function validateExpectedOutcomeSchema(schema) {
5020
5051
  }
5021
5052
  }
5022
5053
 
5023
- const evaluationParametersSchema = object({
5024
- approach: _enum(EvaluationApproach),
5025
- threshold: number().optional(),
5026
- });
5027
- const baseTestCaseInputSchema = object({
5054
+ const testCaseInputSchema = object({
5028
5055
  id: string(),
5029
5056
  question: string(),
5030
- evaluationParameters: evaluationParametersSchema.optional(),
5031
- });
5032
- const legacyTestCaseInputSchema = baseTestCaseInputSchema.extend({
5033
- expectedOutcome: string(),
5034
- });
5035
- const v2TestCaseInputSchema = baseTestCaseInputSchema.extend({
5036
5057
  expectedOutcome: expectedOutcomeArraySchema,
5037
5058
  });
5038
- const testCaseInputSchema = union([
5039
- legacyTestCaseInputSchema,
5040
- v2TestCaseInputSchema,
5041
- ]);
5042
- const testCaseInputArraySchema = array(testCaseInputSchema).min(1, {
5043
- message: 'The test suite is empty. Please provide at least one test case.',
5044
- });
5059
+ const testCaseInputArraySchema = array(testCaseInputSchema);
5045
5060
  object({
5046
5061
  id: string(),
5047
5062
  question: string(),
5048
5063
  expectedOutcome: expectedOutcomeArraySchema,
5049
- evaluationParameters: evaluationParametersSchema.optional(),
5050
5064
  output: string().optional(),
5051
5065
  isRunning: boolean().optional(),
5052
5066
  error: string().optional(),
@@ -5097,19 +5111,69 @@ function importTestSuite(jsonContent) {
5097
5111
  }
5098
5112
  }
5099
5113
 
5114
+ function applyExpectedOutcomeChange(testCase, change) {
5115
+ const { index } = change;
5116
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5117
+ const target = expectedOutcome[index];
5118
+ if (!target) {
5119
+ return testCase;
5120
+ }
5121
+ switch (change.operation) {
5122
+ case 'set-value': {
5123
+ if (target.type === 'chips-input') {
5124
+ return testCase;
5125
+ }
5126
+ expectedOutcome[index] = {
5127
+ ...target,
5128
+ value: change.value,
5129
+ };
5130
+ return { ...testCase, expectedOutcome };
5131
+ }
5132
+ case 'add-chip': {
5133
+ if (target.type !== 'chips-input') {
5134
+ return testCase;
5135
+ }
5136
+ expectedOutcome[index] = {
5137
+ ...target,
5138
+ value: [...target.value, change.value],
5139
+ };
5140
+ return { ...testCase, expectedOutcome };
5141
+ }
5142
+ case 'remove-chip': {
5143
+ if (target.type !== 'chips-input') {
5144
+ return testCase;
5145
+ }
5146
+ expectedOutcome[index] = {
5147
+ ...target,
5148
+ value: target.value.filter(chip => chip !== change.value),
5149
+ };
5150
+ return { ...testCase, expectedOutcome };
5151
+ }
5152
+ case 'set-evaluation-approach':
5153
+ return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
5154
+ }
5155
+ }
5100
5156
  /**
5101
- * Updates the evaluation approach for a test case
5102
- * @param testCase - The test case to update
5103
- * @param approach - The new evaluation approach
5104
- * @returns Updated test case with the new evaluation approach
5157
+ * Updates the evaluation approach for a specific expected outcome field.
5158
+ * Select fields always use exact matching.
5105
5159
  */
5106
- function updateApproach(testCase, approach) {
5160
+ function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
5161
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5162
+ const target = expectedOutcome[fieldIndex];
5163
+ if (!target) {
5164
+ return testCase;
5165
+ }
5166
+ const currentEvaluationParameters = target.evaluationParameters;
5167
+ expectedOutcome[fieldIndex] = {
5168
+ ...target,
5169
+ evaluationParameters: normalizeEvaluationParametersForField(target.type, {
5170
+ ...currentEvaluationParameters,
5171
+ approach,
5172
+ }),
5173
+ };
5107
5174
  return {
5108
5175
  ...testCase,
5109
- evaluationParameters: {
5110
- ...testCase.evaluationParameters,
5111
- approach: approach,
5112
- },
5176
+ expectedOutcome,
5113
5177
  };
5114
5178
  }
5115
5179
 
@@ -29861,57 +29925,78 @@ function performBleuEvaluation(request) {
29861
29925
 
29862
29926
  class LLMEvaluationEngine {
29863
29927
  async evaluateResponse(request, callback) {
29864
- try {
29865
- const approach = request.evaluationParameters.approach;
29866
- switch (approach) {
29867
- case EvaluationApproach.BLEU: {
29868
- const bleuResult = performBleuEvaluation(request);
29869
- callback(bleuResult);
29870
- break;
29871
- }
29872
- case EvaluationApproach.EXACT: {
29873
- const exactResult = await performEvaluation(request);
29874
- callback(exactResult);
29875
- break;
29876
- }
29877
- case EvaluationApproach.ROUGE_1: {
29878
- const rougeResult = await performRouge1Evaluation(request);
29879
- callback(rougeResult);
29880
- break;
29881
- }
29882
- case EvaluationApproach.ROUGE_L: {
29883
- const rougeLResult = await performRougeLEvaluation(request);
29884
- callback(rougeLResult);
29885
- break;
29886
- }
29887
- case EvaluationApproach.SEMANTIC: {
29888
- const semanticResult = await performSemanticEvaluation(request);
29889
- callback(semanticResult);
29890
- break;
29891
- }
29892
- default: {
29893
- console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
29894
- const fallbackResult = await performEvaluation(request);
29895
- callback(fallbackResult);
29896
- }
29897
- }
29898
- }
29899
- catch (error) {
29900
- console.error('Evaluation failed:', error);
29901
- const errorResult = {
29928
+ const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
29929
+ const fieldRequest = {
29902
29930
  testCaseId: request.testCaseId,
29931
+ question: request.question,
29932
+ actualResponse: request.actualResponse,
29933
+ expectedOutcome: field.expectedValue,
29934
+ evaluationParameters: field.evaluationParameters,
29935
+ };
29936
+ const result = await this.evaluateField(fieldRequest);
29937
+ const fieldResult = {
29938
+ index: field.index,
29939
+ label: field.label,
29940
+ type: field.type,
29941
+ expectedValue: field.expectedValue,
29942
+ passed: result.passed,
29943
+ keywordMatches: result.keywordMatches,
29944
+ evaluationParameters: result.evaluationParameters,
29945
+ evaluationApproachResult: result.evaluationApproachResult,
29946
+ };
29947
+ return fieldResult;
29948
+ }));
29949
+ const fieldResults = settledResults.map((settledResult, index) => {
29950
+ const field = request.fields[index];
29951
+ if (settledResult.status === 'fulfilled') {
29952
+ return settledResult.value;
29953
+ }
29954
+ return {
29955
+ index: field.index,
29956
+ label: field.label,
29957
+ type: field.type,
29958
+ expectedValue: field.expectedValue,
29903
29959
  passed: false,
29904
29960
  keywordMatches: [],
29905
- timestamp: new Date().toISOString(),
29906
- evaluationParameters: request.evaluationParameters,
29961
+ evaluationParameters: field.evaluationParameters,
29907
29962
  evaluationApproachResult: {
29908
29963
  score: 0,
29909
- approachUsed: EvaluationApproach.EXACT,
29964
+ approachUsed: field.evaluationParameters.approach,
29910
29965
  },
29966
+ error: this.getSafeErrorMessage(settledResult.reason),
29911
29967
  };
29912
- callback(errorResult);
29968
+ });
29969
+ const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
29970
+ const passed = fieldResults.every(field => field.passed && !field.error);
29971
+ callback({
29972
+ testCaseId: request.testCaseId,
29973
+ passed,
29974
+ keywordMatches,
29975
+ fieldResults,
29976
+ timestamp: new Date().toISOString(),
29977
+ });
29978
+ }
29979
+ async evaluateField(request) {
29980
+ const approach = request.evaluationParameters.approach;
29981
+ switch (approach) {
29982
+ case EvaluationApproach.BLEU:
29983
+ return performBleuEvaluation(request);
29984
+ case EvaluationApproach.EXACT:
29985
+ return performEvaluation(request);
29986
+ case EvaluationApproach.ROUGE_1:
29987
+ return performRouge1Evaluation(request);
29988
+ case EvaluationApproach.ROUGE_L:
29989
+ return performRougeLEvaluation(request);
29990
+ case EvaluationApproach.SEMANTIC:
29991
+ return performSemanticEvaluation(request);
29992
+ default:
29993
+ console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
29994
+ return performEvaluation(request);
29913
29995
  }
29914
29996
  }
29997
+ getSafeErrorMessage(error) {
29998
+ return error instanceof Error ? error.message : 'Field evaluation failed.';
29999
+ }
29915
30000
  }
29916
30001
 
29917
30002
  /**
@@ -29932,12 +30017,18 @@ class EvaluationService {
29932
30017
  console.warn('⚠️ No output to evaluate for test case:', testCase.id);
29933
30018
  return;
29934
30019
  }
30020
+ const fields = (testCase.expectedOutcome || []).map((field, index) => ({
30021
+ index,
30022
+ label: field.label,
30023
+ type: field.type,
30024
+ expectedValue: getFieldExpectedValue(field),
30025
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
30026
+ }));
29935
30027
  const evaluationRequest = {
29936
30028
  testCaseId: testCase.id,
29937
30029
  question: testCase.question,
29938
- expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
29939
30030
  actualResponse: testCase.output,
29940
- evaluationParameters: testCase.evaluationParameters,
30031
+ fields,
29941
30032
  };
29942
30033
  await this.engine.evaluateResponse(evaluationRequest, (result) => {
29943
30034
  console.log('📊 Evaluation result received:', result);
@@ -29945,6 +30036,12 @@ class EvaluationService {
29945
30036
  });
29946
30037
  }
29947
30038
  }
30039
+ function getFieldExpectedValue(field) {
30040
+ if (field.type === 'chips-input') {
30041
+ return field.value.join(', ');
30042
+ }
30043
+ return field.value;
30044
+ }
29948
30045
 
29949
30046
  const Button = (props, children) => {
29950
30047
  const { variant = 'primary', size = 'md', disabled = false, loading = false, onClick, type = 'button', 'class': className = '', icon, 'aria-label': ariaLabel, } = props;
@@ -29987,7 +30084,9 @@ const ResponseOutput = ({ output, isRunning, }) => {
29987
30084
  };
29988
30085
 
29989
30086
  const EvaluationSummary = ({ result, isRunning, }) => {
29990
- return (index.h("div", { class: "evaluation-summary" }, result ? (index.h("div", { class: "evaluation-summary__result" }, index.h("div", { class: `evaluation-summary__result-status evaluation-summary__result-status--${result.passed ? 'passed' : 'failed'}` }, result.passed ? '✅ PASSED' : '❌ FAILED'), index.h("div", { class: "evaluation-summary__details" }, "Keywords: ", result.keywordMatches.filter(m => m.found).length, "/", result.keywordMatches.length, " found"))) : (index.h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
30087
+ const fieldResults = result?.fieldResults || [];
30088
+ const hasFieldResults = fieldResults.length > 0;
30089
+ return (index.h("div", { class: "evaluation-summary" }, result ? (index.h("div", { class: "evaluation-summary__result" }, hasFieldResults ? (index.h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(fieldResult => (index.h("div", { class: "evaluation-summary__field-result" }, index.h("div", { class: "evaluation-summary__field-header" }, index.h("span", { class: "evaluation-summary__field-label" }, fieldResult.label), index.h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", fieldResult.evaluationParameters.approach)), index.h("div", { class: "evaluation-summary__field-details" }, index.h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'), fieldResult.error && (index.h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)), index.h("span", null, "Score: ", fieldResult.evaluationApproachResult.score.toFixed(2)), index.h("span", null, "Matches:", ' ', fieldResult.keywordMatches.filter(match => match.found).length, "/", fieldResult.keywordMatches.length))))))) : null)) : (index.h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
29991
30090
  };
29992
30091
 
29993
30092
  const IconButton = (props, children) => {
@@ -30023,6 +30122,24 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30023
30122
  const emit = (detail) => onExpectedOutcomeChange({
30024
30123
  detail,
30025
30124
  });
30125
+ const buildEvaluationConfig = (index, optionList) => ({
30126
+ name: `expectedOutcomeEvaluation-${index}`,
30127
+ fieldType: FormFieldType.SELECT,
30128
+ label: 'Evaluation Approach',
30129
+ placeholder: 'Select evaluation approach…',
30130
+ required: true,
30131
+ optionList,
30132
+ defaultValue: EvaluationApproach.EXACT,
30133
+ });
30134
+ const renderEvaluationSelector = (field, index$1) => {
30135
+ const optionList = getAllowedApproachesForFieldType(field.type);
30136
+ return (index.h("app-select", { config: buildEvaluationConfig(index$1, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
30137
+ testCaseId,
30138
+ index: index$1,
30139
+ operation: 'set-evaluation-approach',
30140
+ value: e.detail.value,
30141
+ }) }));
30142
+ };
30026
30143
  return (index.h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index$1) => {
30027
30144
  if (field.type === 'textarea') {
30028
30145
  const config = {
@@ -30030,15 +30147,15 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30030
30147
  fieldType: FormFieldType.TEXT_AREA,
30031
30148
  label: field.label,
30032
30149
  placeholder: field.placeholder,
30033
- required: field.required,
30150
+ required: true,
30034
30151
  rows: field.rows || 2,
30035
30152
  };
30036
- return (index.h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
30153
+ return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
30037
30154
  testCaseId,
30038
30155
  index: index$1,
30039
30156
  operation: 'set-value',
30040
30157
  value: e.detail.value,
30041
- }) }));
30158
+ }) }), renderEvaluationSelector(field, index$1)));
30042
30159
  }
30043
30160
  if (field.type === 'chips-input') {
30044
30161
  const config = {
@@ -30046,9 +30163,9 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30046
30163
  fieldType: FormFieldType.CHIPS,
30047
30164
  label: field.label,
30048
30165
  placeholder: field.placeholder,
30049
- required: field.required,
30166
+ required: true,
30050
30167
  };
30051
- return (index.h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
30168
+ return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
30052
30169
  testCaseId,
30053
30170
  index: index$1,
30054
30171
  operation: 'add-chip',
@@ -30058,7 +30175,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30058
30175
  index: index$1,
30059
30176
  operation: 'remove-chip',
30060
30177
  value: e.detail.value,
30061
- }) }));
30178
+ }) }), renderEvaluationSelector(field, index$1)));
30062
30179
  }
30063
30180
  if (field.type === 'select') {
30064
30181
  const config = {
@@ -30066,26 +30183,26 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30066
30183
  fieldType: FormFieldType.SELECT,
30067
30184
  label: field.label,
30068
30185
  placeholder: field.placeholder,
30069
- required: field.required,
30186
+ required: true,
30070
30187
  optionList: field.options,
30071
30188
  };
30072
- return (index.h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
30189
+ return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
30073
30190
  testCaseId,
30074
30191
  index: index$1,
30075
30192
  operation: 'set-value',
30076
30193
  value: e.detail.value,
30077
- }) }));
30194
+ }) }), renderEvaluationSelector(field, index$1)));
30078
30195
  }
30079
- return (index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
30196
+ return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
30080
30197
  testCaseId,
30081
30198
  index: index$1,
30082
30199
  operation: 'set-value',
30083
30200
  value: e.target.value,
30084
- }) })));
30201
+ }) })), renderEvaluationSelector(field, index$1)));
30085
30202
  })));
30086
30203
  };
30087
30204
 
30088
- const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30205
+ const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30089
30206
  const questionConfig = {
30090
30207
  name: 'question',
30091
30208
  fieldType: FormFieldType.TEXT_AREA,
@@ -30095,26 +30212,17 @@ const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTes
30095
30212
  required: true,
30096
30213
  rows: 3,
30097
30214
  };
30098
- const evaluationConfig = {
30099
- name: 'EvaluationApproach',
30100
- fieldType: FormFieldType.SELECT,
30101
- label: 'Evaluation',
30102
- placeholder: 'Select evaluation approach…',
30103
- required: true,
30104
- optionList: EvaluationApproachValues,
30105
- defaultValue: EvaluationApproach.EXACT,
30106
- };
30107
30215
  return (index.h("div", { class: "test-case-row", key: testCase.id }, index.h("div", { class: "test-case-row__input-column" }, index.h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
30108
30216
  detail: {
30109
30217
  testCaseId: testCase.id,
30110
30218
  key: 'question',
30111
30219
  value: e.detail.value,
30112
30220
  },
30113
- }) }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange }), index.h("app-select", { config: evaluationConfig, value: testCase.evaluationParameters?.approach, onValueChange: (e) => onUpdateApproach(testCase, e.detail.value) })), index.h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), index.h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), index.h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
30221
+ }) }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), index.h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), index.h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), index.h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
30114
30222
  };
30115
30223
 
30116
- const LLMTestCases = ({ testCases, onRun, onDelete, onUpdateApproach, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30117
- return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, onUpdateApproach: onUpdateApproach, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), index.h("div", { class: "test-cases__add-section" }, index.h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
30224
+ const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30225
+ return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), index.h("div", { class: "test-cases__add-section" }, index.h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
30118
30226
  };
30119
30227
 
30120
30228
  const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 
0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
@@ -30125,11 +30233,11 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
30125
30233
 
30126
30234
  const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
30127
30235
 
30128
- const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
30236
+ const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
30129
30237
 
30130
30238
  const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
30131
30239
 
30132
- const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__details{display:flex;flex-direction:column;gap:var(--spacing-2)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}.evaluation-summary__result-status{font-weight:var(--font-weight-semibold);font-size:var(--font-size-sm);padding:var(--spacing-2) var(--spacing-3);border-radius:var(--radius-md);text-align:center}.evaluation-summary__result-status--passed{background:var(--success);color:var(--success-foreground);border:var(--border-width) solid var(--success)}.evaluation-summary__result-status--failed{background:var(--destructive);color:var(--destructive-foreground);border:var(--border-width) solid var(--destructive)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
30240
+ const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__field-results{display:flex;flex-direction:column;gap:var(--spacing-2);margin-top:var(--spacing-2)}.evaluation-summary__field-result{border:var(--border-width) solid var(--border);border-radius:var(--radius-md);padding:var(--spacing-2);display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-header{display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-label{font-weight:var(--font-weight-semibold);font-size:var(--font-size-xs)}.evaluation-summary__field-approach{color:var(--muted-foreground);font-size:11px}.evaluation-summary__field-details{display:flex;flex-direction:column;gap:var(--spacing-1);font-size:var(--font-size-xs)}.evaluation-summary__field-status{width:fit-content;padding:2px var(--spacing-2);border-radius:var(--radius-sm);font-size:11px;font-weight:var(--font-weight-semibold);border:var(--border-width) solid transparent}.evaluation-summary__field-status--passed{background:var(--success);color:var(--success-foreground);border-color:var(--success)}.evaluation-summary__field-status--failed{background:var(--destructive);color:var(--destructive-foreground);border-color:var(--destructive)}.evaluation-summary__error-message{color:var(--destructive);font-size:var(--font-size-xs)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
30133
30241
 
30134
30242
  const responseOutputCss = () => `.response-output{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.response-output__content{background:var(--muted);border:var(--border-width) solid var(--border);border-radius:var(--radius);padding:var(--spacing-4);font-size:var(--font-size-sm);line-height:var(--line-height-relaxed);color:var(--foreground);white-space:pre-wrap;word-wrap:break-word;flex:1;overflow-y:auto;max-height:250px;overflow-x:scroll}.response-output__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}@media (max-width: 1200px){.response-output{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.response-output{padding:var(--spacing-4)}}`;
30135
30243
 
@@ -30162,9 +30270,6 @@ const LLMTestRunner = class {
30162
30270
  value: '',
30163
30271
  },
30164
30272
  ],
30165
- evaluationParameters: {
30166
- approach: EvaluationApproach.EXACT,
30167
- },
30168
30273
  isRunning: false,
30169
30274
  },
30170
30275
  ];
@@ -30271,52 +30376,13 @@ const LLMTestRunner = class {
30271
30376
  deleteTestCase(id) {
30272
30377
  this.testCases = this.testCases.filter(tc => tc.id !== id);
30273
30378
  }
30274
- updateApproach(testCase, approach) {
30275
- if (testCase) {
30276
- const updated = updateApproach(testCase, approach);
30277
- this.updateTestCase(testCase.id, {
30278
- evaluationParameters: updated.evaluationParameters,
30279
- });
30280
- }
30281
- }
30282
30379
  handleExpectedOutcomeChange = (event) => {
30283
- const { testCaseId, index, operation, value } = event.detail;
30380
+ const { testCaseId, ...change } = event.detail;
30284
30381
  this.testCases = this.testCases.map(tc => {
30285
- if (tc.id !== testCaseId)
30286
- return tc;
30287
- const expectedOutcome = [...(tc.expectedOutcome || [])];
30288
- const target = expectedOutcome[index];
30289
- if (!target)
30382
+ if (tc.id !== testCaseId) {
30290
30383
  return tc;
30291
- if (operation === 'set-value') {
30292
- if (target.type === 'chips-input') {
30293
- return tc;
30294
- }
30295
- expectedOutcome[index] = { ...target, value: value || '' };
30296
- return { ...tc, expectedOutcome };
30297
- }
30298
- if (operation === 'add-chip') {
30299
- if (target.type !== 'chips-input' || !value) {
30300
- return tc;
30301
- }
30302
- expectedOutcome[index] = {
30303
- ...target,
30304
- value: [...target.value, value],
30305
- };
30306
- return { ...tc, expectedOutcome };
30307
- }
30308
- if (operation === 'remove-chip') {
30309
- if (target.type !== 'chips-input' ||
30310
- !value) {
30311
- return tc;
30312
- }
30313
- expectedOutcome[index] = {
30314
- ...target,
30315
- value: target.value.filter(chip => chip !== value),
30316
- };
30317
- return { ...tc, expectedOutcome };
30318
30384
  }
30319
- return tc;
30385
+ return applyExpectedOutcomeChange(tc, change);
30320
30386
  });
30321
30387
  };
30322
30388
  async evaluateResponse(testCase) {
@@ -30416,7 +30482,7 @@ const LLMTestRunner = class {
30416
30482
  }
30417
30483
  }
30418
30484
  render() {
30419
- return (index.h("div", { key: '5cbdc388678929c271fd2a040aca8118344024c3', class: "test-runner-container" }, index.h(LLMTestRunnerHeader, { key: '92533803732fc5ec28da802ac9d367f9fbbffe72', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), index.h(ErrorMessage, { key: 'c16a0334b1a71d676a128de18a83991c2625a075', message: this.error, onClear: () => (this.error = '') }), index.h("div", { key: 'e757f49052a9516c12af858b46b32a957707524c', class: "test-runner-container__content" }, index.h(LLMTestCases, { key: 'e9a9f6553a3ce97aeb80924b116e1b73c2397b15', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onUpdateApproach: (testCase, approach) => this.updateApproach(testCase, approach), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
30485
+ return (index.h("div", { key: 'e3d007b453f770fcb59c29f8ee83bd8a35e82a34', class: "test-runner-container" }, index.h(LLMTestRunnerHeader, { key: 'b7c44bf4807fe8d9e5de514818420d67d2e0dbfb', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), index.h(ErrorMessage, { key: '697237ec0f8d2e704609fd0b240629f22c2a3ef6', message: this.error, onClear: () => (this.error = '') }), index.h("div", { key: '64a623f897dfb96d922ddc0cbdfcf529c52bef76', class: "test-runner-container__content" }, index.h(LLMTestCases, { key: '017da41567c5c13933d9cf31d1a972743bd9b100', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
30420
30486
  }
30421
30487
  };
30422
30488
  LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));