llm-testrunner-components 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +1 -1
  2. package/README.md +165 -242
  3. package/dist/cjs/index.cjs.js +298 -232
  4. package/dist/cjs/index.cjs.js.map +1 -1
  5. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
  6. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
  7. package/dist/collection/components/llm-test-runner/llm-test-runner.js +6 -49
  8. package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
  9. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
  10. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
  11. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
  12. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
  13. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
  14. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
  15. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
  16. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
  17. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
  18. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
  19. package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
  20. package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
  21. package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
  22. package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
  23. package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
  24. package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
  25. package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
  26. package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
  27. package/dist/collection/lib/evaluation/index.js +0 -4
  28. package/dist/collection/lib/evaluation/index.js.map +1 -1
  29. package/dist/collection/lib/evaluation/types.js.map +1 -1
  30. package/dist/collection/lib/import-export/test-results-csv.js +47 -33
  31. package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
  32. package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
  33. package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
  34. package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
  35. package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
  36. package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
  37. package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
  38. package/dist/collection/schemas/expected-outcome.js +20 -2
  39. package/dist/collection/schemas/expected-outcome.js.map +1 -1
  40. package/dist/collection/schemas/test-case.js +2 -20
  41. package/dist/collection/schemas/test-case.js.map +1 -1
  42. package/dist/collection/types/llm-test-runner.js.map +1 -1
  43. package/dist/collection/types/test-case.js.map +1 -1
  44. package/dist/components/index.js +1 -1
  45. package/dist/components/llm-test-runner.js +1 -1
  46. package/dist/components/p-Bb89MYYu.js +7 -0
  47. package/dist/components/p-Bb89MYYu.js.map +1 -0
  48. package/dist/esm/index.js +298 -232
  49. package/dist/esm/index.js.map +1 -1
  50. package/dist/llm-testrunner/index.esm.js +2 -2
  51. package/dist/llm-testrunner/index.esm.js.map +1 -1
  52. package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +0 -1
  53. package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
  54. package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
  55. package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
  56. package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
  57. package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
  58. package/dist/types/lib/evaluation/index.d.ts +0 -1
  59. package/dist/types/lib/evaluation/types.d.ts +26 -0
  60. package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
  61. package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
  62. package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
  63. package/dist/types/schemas/expected-outcome.d.ts +65 -17
  64. package/dist/types/schemas/test-case.d.ts +51 -95
  65. package/dist/types/types/llm-test-runner.d.ts +1 -1
  66. package/dist/types/types/test-case.d.ts +1 -1
  67. package/package.json +9 -2
  68. package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
  69. package/dist/components/p-BF90yb1z.js +0 -7
  70. package/dist/components/p-BF90yb1z.js.map +0 -1
  71. package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
package/dist/esm/index.js CHANGED
@@ -61,20 +61,6 @@ class RateLimitedFetcher {
61
61
  }
62
62
  }
63
63
 
64
- var EvaluationApproach;
65
- (function (EvaluationApproach) {
66
- EvaluationApproach["EXACT"] = "exact";
67
- EvaluationApproach["SEMANTIC"] = "semantic";
68
- EvaluationApproach["ROUGE_1"] = "rouge-1";
69
- EvaluationApproach["ROUGE_L"] = "rouge-L";
70
- EvaluationApproach["BLEU"] = "bleu";
71
- })(EvaluationApproach || (EvaluationApproach = {}));
72
- // Array of all evaluation approach values for UI components
73
- const EvaluationApproachValues = Object.values(EvaluationApproach);
74
- const DEFAULT_ROUGE_PASS_SCORE = 0.7;
75
- const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
76
- const DEFAULT_BLEU_PASS_SCORE = 0.7;
77
-
78
64
  /**
79
65
  * Reads a file asynchronously and returns its content as a string
80
66
  * @param file - The File object to read
@@ -117,23 +103,10 @@ function formatTestSuiteAsJson(testCases) {
117
103
  id: testCase.id,
118
104
  question: testCase.question,
119
105
  expectedOutcome: testCase.expectedOutcome,
120
- evaluationParameters: testCase.evaluationParameters,
121
106
  }));
122
107
  return JSON.stringify(exportData, null, 2);
123
108
  }
124
109
 
125
- function serializeExpectedOutcome(expectedOutcome, joinWith = '\n') {
126
- return (expectedOutcome || [])
127
- .map(field => {
128
- if (field.type === 'chips-input') {
129
- return field.value.join(', ');
130
- }
131
- return field.value;
132
- })
133
- .join(joinWith)
134
- .trim();
135
- }
136
-
137
110
  /**
138
111
  * Escapes a CSV field by wrapping it in quotes if it contains special characters
139
112
  * @param field - The field to escape
@@ -152,48 +125,63 @@ function escapeCsvField(field) {
152
125
  */
153
126
  function exportTestResultsToCsv(testCases) {
154
127
  const csvRows = [];
128
+ const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
155
129
  // Add header row
156
130
  const headers = [
157
131
  'Question',
158
- 'Expected Keywords',
159
- 'Generated Keywords',
160
- 'Keywords Match',
161
132
  'Response Time (s)',
162
- 'Evaluation Approach',
163
- 'Evaluation Score',
164
133
  ];
134
+ for (let i = 1; i <= maxFieldCount; i++) {
135
+ headers.push('Field Name');
136
+ headers.push('Expected Keywords');
137
+ headers.push('Generated Keywords');
138
+ headers.push('Evaluation Strategy');
139
+ headers.push('Passed Evaluation');
140
+ headers.push('Keyword Match');
141
+ headers.push('Score');
142
+ if (i < maxFieldCount) {
143
+ headers.push('');
144
+ }
145
+ }
165
146
  csvRows.push(headers.join(','));
166
- // Add data rows
147
+ // Add data rows (one row per test case)
167
148
  testCases.forEach(testCase => {
168
- const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
169
- const evaluationApproach = testCase.evaluationParameters?.approach || '';
170
- const score = testCase.evaluationResult?.evaluationApproachResult?.score;
171
- const evaluationScore = score !== undefined ? score.toString() : '';
172
- let generatedKeywords = '';
173
- let keywordsMatch = '';
174
- if (testCase.evaluationResult) {
175
- const foundKeywords = testCase.evaluationResult.keywordMatches
176
- .filter(match => match.found)
177
- .map(match => match.keyword);
178
- generatedKeywords = foundKeywords.join('; ');
179
- // Calculate match percentages
180
- const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
181
- const totalKeywords = testCase.evaluationResult.keywordMatches.length;
182
- keywordsMatch =
183
- totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
184
- }
185
149
  const responseTime = testCase.responseTime
186
150
  ? (testCase.responseTime / 1000).toFixed(3)
187
151
  : 'N/A';
188
- const row = [
189
- escapeCsvField(testCase.question),
190
- escapeCsvField(expectedOutcome),
191
- escapeCsvField(generatedKeywords),
192
- keywordsMatch,
193
- responseTime,
194
- escapeCsvField(evaluationApproach),
195
- escapeCsvField(evaluationScore),
196
- ];
152
+ const row = [escapeCsvField(testCase.question), responseTime];
153
+ for (let i = 0; i < maxFieldCount; i++) {
154
+ const field = testCase.expectedOutcome?.[i];
155
+ const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
156
+ const expectedKeywords = fieldResult?.expectedValue ??
157
+ (field
158
+ ? field.type === 'chips-input'
159
+ ? field.value.join(', ')
160
+ : field.value
161
+ : '');
162
+ const generatedKeywords = (fieldResult?.keywordMatches || [])
163
+ .filter(match => match.found)
164
+ .map(match => match.keyword)
165
+ .join('; ');
166
+ const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
167
+ const totalMatches = fieldResult?.keywordMatches?.length || 0;
168
+ const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
169
+ const score = fieldResult?.evaluationApproachResult?.score !== undefined
170
+ ? fieldResult.evaluationApproachResult.score.toFixed(2)
171
+ : '';
172
+ row.push(escapeCsvField(field?.label || ''));
173
+ row.push(escapeCsvField(expectedKeywords || ''));
174
+ row.push(escapeCsvField(generatedKeywords));
175
+ row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
176
+ field?.evaluationParameters?.approach ||
177
+ ''));
178
+ row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
179
+ row.push(keywordMatch);
180
+ row.push(score);
181
+ if (i < maxFieldCount - 1) {
182
+ row.push('');
183
+ }
184
+ }
197
185
  csvRows.push(row.join(','));
198
186
  });
199
187
  return csvRows.join('\n');
@@ -252,6 +240,43 @@ function v4(options, buf, offset) {
252
240
  return unsafeStringify(rnds);
253
241
  }
254
242
 
243
+ var EvaluationApproach;
244
+ (function (EvaluationApproach) {
245
+ EvaluationApproach["EXACT"] = "exact";
246
+ EvaluationApproach["SEMANTIC"] = "semantic";
247
+ EvaluationApproach["ROUGE_1"] = "rouge-1";
248
+ EvaluationApproach["ROUGE_L"] = "rouge-L";
249
+ EvaluationApproach["BLEU"] = "bleu";
250
+ })(EvaluationApproach || (EvaluationApproach = {}));
251
+ // Array of all evaluation approach values for UI components
252
+ const EvaluationApproachValues = Object.values(EvaluationApproach);
253
+ const DEFAULT_ROUGE_PASS_SCORE = 0.7;
254
+ const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
255
+ const DEFAULT_BLEU_PASS_SCORE = 0.7;
256
+
257
+ const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
258
+ function getAllowedApproachesForFieldType(fieldType) {
259
+ if (fieldType === 'select') {
260
+ return SELECT_ONLY_APPROACHES;
261
+ }
262
+ return EvaluationApproachValues;
263
+ }
264
+ function isApproachAllowedForFieldType(fieldType, approach) {
265
+ return getAllowedApproachesForFieldType(fieldType).includes(approach);
266
+ }
267
+ function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
268
+ const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
269
+ const fallbackApproach = allowedApproaches[0];
270
+ const rawApproach = evaluationParameters?.approach;
271
+ const approach = rawApproach && allowedApproaches.includes(rawApproach)
272
+ ? rawApproach
273
+ : fallbackApproach;
274
+ return {
275
+ ...evaluationParameters,
276
+ approach,
277
+ };
278
+ }
279
+
255
280
  const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
256
281
  {
257
282
  type: 'textarea',
@@ -260,6 +285,12 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
260
285
  rows: 2,
261
286
  },
262
287
  ];
288
+ function normalizeExpectedOutcomeField(field) {
289
+ return {
290
+ ...field,
291
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
292
+ };
293
+ }
263
294
  /**
264
295
  * Creates a new test case with default values
265
296
  * @returns A new TestCase object with a unique ID
@@ -269,9 +300,6 @@ function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA)
269
300
  id: v4(),
270
301
  question: '',
271
302
  expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
272
- evaluationParameters: {
273
- approach: EvaluationApproach.EXACT,
274
- },
275
303
  isRunning: false,
276
304
  };
277
305
  }
@@ -281,35 +309,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
281
309
  return {
282
310
  type: 'text',
283
311
  label: schemaField.label,
284
- required: schemaField.required,
285
312
  placeholder: schemaField.placeholder,
286
313
  value: '',
314
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
287
315
  };
288
316
  case 'textarea':
289
317
  return {
290
318
  type: 'textarea',
291
319
  label: schemaField.label,
292
- required: schemaField.required,
293
320
  placeholder: schemaField.placeholder,
294
321
  rows: schemaField.rows,
295
322
  value: '',
323
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
296
324
  };
297
325
  case 'chips-input':
298
326
  return {
299
327
  type: 'chips-input',
300
328
  label: schemaField.label,
301
- required: schemaField.required,
302
329
  placeholder: schemaField.placeholder,
303
330
  value: [],
331
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
304
332
  };
305
333
  case 'select':
306
334
  return {
307
335
  type: 'select',
308
336
  label: schemaField.label,
309
- required: schemaField.required,
310
337
  placeholder: schemaField.placeholder,
311
338
  value: '',
312
339
  options: schemaField.options,
340
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
313
341
  };
314
342
  default: {
315
343
  const _exhaustiveCheck = schemaField;
@@ -320,32 +348,19 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
320
348
  function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
321
349
  return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
322
350
  }
323
- function migrateLegacyExpectedOutcomeString(value) {
324
- return [
325
- {
326
- type: 'textarea',
327
- label: 'Expected Outcome',
328
- value,
329
- },
330
- ];
331
- }
332
351
  /**
333
352
  * Creates a runtime test case from validated input data.
334
- * The input is expected to already satisfy `TestCaseInput` (legacy string or v2 shape),
335
- * and this function only performs normalization/defaulting (including legacy migration).
353
+ * The input is expected to already satisfy `TestCaseInput`,
354
+ * and this function only performs normalization/defaulting.
336
355
  *
337
356
  * @param data - Validated test case input
338
357
  * @returns A normalized TestCase object with runtime defaults applied
339
358
  */
340
359
  function createTestCaseFromInput(data) {
341
- let expectedOutcome;
342
- if (typeof data.expectedOutcome === 'string') {
343
- expectedOutcome = migrateLegacyExpectedOutcomeString(data.expectedOutcome);
344
- }
345
- else {
346
- expectedOutcome = data.expectedOutcome;
347
- }
348
- return { ...data, expectedOutcome };
360
+ return {
361
+ ...data,
362
+ expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
363
+ };
349
364
  }
350
365
 
351
366
  /** A special constant with type `never` */
@@ -4935,27 +4950,43 @@ function superRefine(fn) {
4935
4950
  const nonEmptyString = string().trim().min(1);
4936
4951
  const optionalPositiveInt = number().int().positive().optional();
4937
4952
  const optionalString = string().optional();
4938
- const optionalBoolean = boolean().optional();
4939
4953
  const selectOptionsSchema = array(nonEmptyString).min(1);
4954
+ const optionalNumber = number().optional();
4955
+ const evaluationParametersSchema = object({
4956
+ approach: _enum(EvaluationApproach),
4957
+ threshold: optionalNumber,
4958
+ });
4959
+ const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
4960
+ if (!isApproachAllowedForFieldType('select', parameters.approach)) {
4961
+ ctx.addIssue({
4962
+ code: 'custom',
4963
+ path: ['approach'],
4964
+ message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
4965
+ });
4966
+ }
4967
+ });
4940
4968
  const defaultExpectedOutcomeBaseSchema = object({
4941
4969
  label: nonEmptyString,
4942
- required: optionalBoolean,
4943
4970
  placeholder: optionalString,
4944
4971
  });
4945
4972
  const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
4946
4973
  text: baseSchema.extend({
4947
4974
  type: literal('text'),
4975
+ evaluationParameters: evaluationParametersSchema.optional(),
4948
4976
  }),
4949
4977
  textarea: baseSchema.extend({
4950
4978
  type: literal('textarea'),
4951
4979
  rows: optionalPositiveInt,
4980
+ evaluationParameters: evaluationParametersSchema.optional(),
4952
4981
  }),
4953
4982
  chipsInput: baseSchema.extend({
4954
4983
  type: literal('chips-input'),
4984
+ evaluationParameters: evaluationParametersSchema.optional(),
4955
4985
  }),
4956
4986
  select: baseSchema.extend({
4957
4987
  type: literal('select'),
4958
4988
  options: selectOptionsSchema,
4989
+ evaluationParameters: selectEvaluationParametersSchema.optional(),
4959
4990
  }),
4960
4991
  });
4961
4992
  function hasDuplicateChips(values) {
@@ -5017,33 +5048,16 @@ function validateExpectedOutcomeSchema(schema) {
5017
5048
  }
5018
5049
  }
5019
5050
 
5020
- const evaluationParametersSchema = object({
5021
- approach: _enum(EvaluationApproach),
5022
- threshold: number().optional(),
5023
- });
5024
- const baseTestCaseInputSchema = object({
5051
+ const testCaseInputSchema = object({
5025
5052
  id: string(),
5026
5053
  question: string(),
5027
- evaluationParameters: evaluationParametersSchema.optional(),
5028
- });
5029
- const legacyTestCaseInputSchema = baseTestCaseInputSchema.extend({
5030
- expectedOutcome: string(),
5031
- });
5032
- const v2TestCaseInputSchema = baseTestCaseInputSchema.extend({
5033
5054
  expectedOutcome: expectedOutcomeArraySchema,
5034
5055
  });
5035
- const testCaseInputSchema = union([
5036
- legacyTestCaseInputSchema,
5037
- v2TestCaseInputSchema,
5038
- ]);
5039
- const testCaseInputArraySchema = array(testCaseInputSchema).min(1, {
5040
- message: 'The test suite is empty. Please provide at least one test case.',
5041
- });
5056
+ const testCaseInputArraySchema = array(testCaseInputSchema);
5042
5057
  object({
5043
5058
  id: string(),
5044
5059
  question: string(),
5045
5060
  expectedOutcome: expectedOutcomeArraySchema,
5046
- evaluationParameters: evaluationParametersSchema.optional(),
5047
5061
  output: string().optional(),
5048
5062
  isRunning: boolean().optional(),
5049
5063
  error: string().optional(),
@@ -5094,19 +5108,69 @@ function importTestSuite(jsonContent) {
5094
5108
  }
5095
5109
  }
5096
5110
 
5111
+ function applyExpectedOutcomeChange(testCase, change) {
5112
+ const { index } = change;
5113
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5114
+ const target = expectedOutcome[index];
5115
+ if (!target) {
5116
+ return testCase;
5117
+ }
5118
+ switch (change.operation) {
5119
+ case 'set-value': {
5120
+ if (target.type === 'chips-input') {
5121
+ return testCase;
5122
+ }
5123
+ expectedOutcome[index] = {
5124
+ ...target,
5125
+ value: change.value,
5126
+ };
5127
+ return { ...testCase, expectedOutcome };
5128
+ }
5129
+ case 'add-chip': {
5130
+ if (target.type !== 'chips-input') {
5131
+ return testCase;
5132
+ }
5133
+ expectedOutcome[index] = {
5134
+ ...target,
5135
+ value: [...target.value, change.value],
5136
+ };
5137
+ return { ...testCase, expectedOutcome };
5138
+ }
5139
+ case 'remove-chip': {
5140
+ if (target.type !== 'chips-input') {
5141
+ return testCase;
5142
+ }
5143
+ expectedOutcome[index] = {
5144
+ ...target,
5145
+ value: target.value.filter(chip => chip !== change.value),
5146
+ };
5147
+ return { ...testCase, expectedOutcome };
5148
+ }
5149
+ case 'set-evaluation-approach':
5150
+ return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
5151
+ }
5152
+ }
5097
5153
  /**
5098
- * Updates the evaluation approach for a test case
5099
- * @param testCase - The test case to update
5100
- * @param approach - The new evaluation approach
5101
- * @returns Updated test case with the new evaluation approach
5154
+ * Updates the evaluation approach for a specific expected outcome field.
5155
+ * Select fields always use exact matching.
5102
5156
  */
5103
- function updateApproach(testCase, approach) {
5157
+ function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
5158
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5159
+ const target = expectedOutcome[fieldIndex];
5160
+ if (!target) {
5161
+ return testCase;
5162
+ }
5163
+ const currentEvaluationParameters = target.evaluationParameters;
5164
+ expectedOutcome[fieldIndex] = {
5165
+ ...target,
5166
+ evaluationParameters: normalizeEvaluationParametersForField(target.type, {
5167
+ ...currentEvaluationParameters,
5168
+ approach,
5169
+ }),
5170
+ };
5104
5171
  return {
5105
5172
  ...testCase,
5106
- evaluationParameters: {
5107
- ...testCase.evaluationParameters,
5108
- approach: approach,
5109
- },
5173
+ expectedOutcome,
5110
5174
  };
5111
5175
  }
5112
5176
 
@@ -29858,57 +29922,78 @@ function performBleuEvaluation(request) {
29858
29922
 
29859
29923
  class LLMEvaluationEngine {
29860
29924
  async evaluateResponse(request, callback) {
29861
- try {
29862
- const approach = request.evaluationParameters.approach;
29863
- switch (approach) {
29864
- case EvaluationApproach.BLEU: {
29865
- const bleuResult = performBleuEvaluation(request);
29866
- callback(bleuResult);
29867
- break;
29868
- }
29869
- case EvaluationApproach.EXACT: {
29870
- const exactResult = await performEvaluation(request);
29871
- callback(exactResult);
29872
- break;
29873
- }
29874
- case EvaluationApproach.ROUGE_1: {
29875
- const rougeResult = await performRouge1Evaluation(request);
29876
- callback(rougeResult);
29877
- break;
29878
- }
29879
- case EvaluationApproach.ROUGE_L: {
29880
- const rougeLResult = await performRougeLEvaluation(request);
29881
- callback(rougeLResult);
29882
- break;
29883
- }
29884
- case EvaluationApproach.SEMANTIC: {
29885
- const semanticResult = await performSemanticEvaluation(request);
29886
- callback(semanticResult);
29887
- break;
29888
- }
29889
- default: {
29890
- console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
29891
- const fallbackResult = await performEvaluation(request);
29892
- callback(fallbackResult);
29893
- }
29894
- }
29895
- }
29896
- catch (error) {
29897
- console.error('Evaluation failed:', error);
29898
- const errorResult = {
29925
+ const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
29926
+ const fieldRequest = {
29899
29927
  testCaseId: request.testCaseId,
29928
+ question: request.question,
29929
+ actualResponse: request.actualResponse,
29930
+ expectedOutcome: field.expectedValue,
29931
+ evaluationParameters: field.evaluationParameters,
29932
+ };
29933
+ const result = await this.evaluateField(fieldRequest);
29934
+ const fieldResult = {
29935
+ index: field.index,
29936
+ label: field.label,
29937
+ type: field.type,
29938
+ expectedValue: field.expectedValue,
29939
+ passed: result.passed,
29940
+ keywordMatches: result.keywordMatches,
29941
+ evaluationParameters: result.evaluationParameters,
29942
+ evaluationApproachResult: result.evaluationApproachResult,
29943
+ };
29944
+ return fieldResult;
29945
+ }));
29946
+ const fieldResults = settledResults.map((settledResult, index) => {
29947
+ const field = request.fields[index];
29948
+ if (settledResult.status === 'fulfilled') {
29949
+ return settledResult.value;
29950
+ }
29951
+ return {
29952
+ index: field.index,
29953
+ label: field.label,
29954
+ type: field.type,
29955
+ expectedValue: field.expectedValue,
29900
29956
  passed: false,
29901
29957
  keywordMatches: [],
29902
- timestamp: new Date().toISOString(),
29903
- evaluationParameters: request.evaluationParameters,
29958
+ evaluationParameters: field.evaluationParameters,
29904
29959
  evaluationApproachResult: {
29905
29960
  score: 0,
29906
- approachUsed: EvaluationApproach.EXACT,
29961
+ approachUsed: field.evaluationParameters.approach,
29907
29962
  },
29963
+ error: this.getSafeErrorMessage(settledResult.reason),
29908
29964
  };
29909
- callback(errorResult);
29965
+ });
29966
+ const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
29967
+ const passed = fieldResults.every(field => field.passed && !field.error);
29968
+ callback({
29969
+ testCaseId: request.testCaseId,
29970
+ passed,
29971
+ keywordMatches,
29972
+ fieldResults,
29973
+ timestamp: new Date().toISOString(),
29974
+ });
29975
+ }
29976
+ async evaluateField(request) {
29977
+ const approach = request.evaluationParameters.approach;
29978
+ switch (approach) {
29979
+ case EvaluationApproach.BLEU:
29980
+ return performBleuEvaluation(request);
29981
+ case EvaluationApproach.EXACT:
29982
+ return performEvaluation(request);
29983
+ case EvaluationApproach.ROUGE_1:
29984
+ return performRouge1Evaluation(request);
29985
+ case EvaluationApproach.ROUGE_L:
29986
+ return performRougeLEvaluation(request);
29987
+ case EvaluationApproach.SEMANTIC:
29988
+ return performSemanticEvaluation(request);
29989
+ default:
29990
+ console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
29991
+ return performEvaluation(request);
29910
29992
  }
29911
29993
  }
29994
+ getSafeErrorMessage(error) {
29995
+ return error instanceof Error ? error.message : 'Field evaluation failed.';
29996
+ }
29912
29997
  }
29913
29998
 
29914
29999
  /**
@@ -29929,12 +30014,18 @@ class EvaluationService {
29929
30014
  console.warn('⚠️ No output to evaluate for test case:', testCase.id);
29930
30015
  return;
29931
30016
  }
30017
+ const fields = (testCase.expectedOutcome || []).map((field, index) => ({
30018
+ index,
30019
+ label: field.label,
30020
+ type: field.type,
30021
+ expectedValue: getFieldExpectedValue(field),
30022
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
30023
+ }));
29932
30024
  const evaluationRequest = {
29933
30025
  testCaseId: testCase.id,
29934
30026
  question: testCase.question,
29935
- expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
29936
30027
  actualResponse: testCase.output,
29937
- evaluationParameters: testCase.evaluationParameters,
30028
+ fields,
29938
30029
  };
29939
30030
  await this.engine.evaluateResponse(evaluationRequest, (result) => {
29940
30031
  console.log('📊 Evaluation result received:', result);
@@ -29942,6 +30033,12 @@ class EvaluationService {
29942
30033
  });
29943
30034
  }
29944
30035
  }
30036
+ function getFieldExpectedValue(field) {
30037
+ if (field.type === 'chips-input') {
30038
+ return field.value.join(', ');
30039
+ }
30040
+ return field.value;
30041
+ }
29945
30042
 
29946
30043
  const Button = (props, children) => {
29947
30044
  const { variant = 'primary', size = 'md', disabled = false, loading = false, onClick, type = 'button', 'class': className = '', icon, 'aria-label': ariaLabel, } = props;
@@ -29984,7 +30081,9 @@ const ResponseOutput = ({ output, isRunning, }) => {
29984
30081
  };
29985
30082
 
29986
30083
  const EvaluationSummary = ({ result, isRunning, }) => {
29987
- return (h("div", { class: "evaluation-summary" }, result ? (h("div", { class: "evaluation-summary__result" }, h("div", { class: `evaluation-summary__result-status evaluation-summary__result-status--${result.passed ? 'passed' : 'failed'}` }, result.passed ? '✅ PASSED' : '❌ FAILED'), h("div", { class: "evaluation-summary__details" }, "Keywords: ", result.keywordMatches.filter(m => m.found).length, "/", result.keywordMatches.length, " found"))) : (h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
30084
+ const fieldResults = result?.fieldResults || [];
30085
+ const hasFieldResults = fieldResults.length > 0;
30086
+ return (h("div", { class: "evaluation-summary" }, result ? (h("div", { class: "evaluation-summary__result" }, hasFieldResults ? (h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(fieldResult => (h("div", { class: "evaluation-summary__field-result" }, h("div", { class: "evaluation-summary__field-header" }, h("span", { class: "evaluation-summary__field-label" }, fieldResult.label), h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", fieldResult.evaluationParameters.approach)), h("div", { class: "evaluation-summary__field-details" }, h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'), fieldResult.error && (h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)), h("span", null, "Score: ", fieldResult.evaluationApproachResult.score.toFixed(2)), h("span", null, "Matches:", ' ', fieldResult.keywordMatches.filter(match => match.found).length, "/", fieldResult.keywordMatches.length))))))) : null)) : (h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
29988
30087
  };
29989
30088
 
29990
30089
  const IconButton = (props, children) => {
@@ -30020,6 +30119,24 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30020
30119
  const emit = (detail) => onExpectedOutcomeChange({
30021
30120
  detail,
30022
30121
  });
30122
+ const buildEvaluationConfig = (index, optionList) => ({
30123
+ name: `expectedOutcomeEvaluation-${index}`,
30124
+ fieldType: FormFieldType.SELECT,
30125
+ label: 'Evaluation Approach',
30126
+ placeholder: 'Select evaluation approach…',
30127
+ required: true,
30128
+ optionList,
30129
+ defaultValue: EvaluationApproach.EXACT,
30130
+ });
30131
+ const renderEvaluationSelector = (field, index) => {
30132
+ const optionList = getAllowedApproachesForFieldType(field.type);
30133
+ return (h("app-select", { config: buildEvaluationConfig(index, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
30134
+ testCaseId,
30135
+ index,
30136
+ operation: 'set-evaluation-approach',
30137
+ value: e.detail.value,
30138
+ }) }));
30139
+ };
30023
30140
  return (h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index) => {
30024
30141
  if (field.type === 'textarea') {
30025
30142
  const config = {
@@ -30027,15 +30144,15 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30027
30144
  fieldType: FormFieldType.TEXT_AREA,
30028
30145
  label: field.label,
30029
30146
  placeholder: field.placeholder,
30030
- required: field.required,
30147
+ required: true,
30031
30148
  rows: field.rows || 2,
30032
30149
  };
30033
- return (h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
30150
+ return (h("div", { class: "expected-outcome-renderer__group" }, h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
30034
30151
  testCaseId,
30035
30152
  index,
30036
30153
  operation: 'set-value',
30037
30154
  value: e.detail.value,
30038
- }) }));
30155
+ }) }), renderEvaluationSelector(field, index)));
30039
30156
  }
30040
30157
  if (field.type === 'chips-input') {
30041
30158
  const config = {
@@ -30043,9 +30160,9 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30043
30160
  fieldType: FormFieldType.CHIPS,
30044
30161
  label: field.label,
30045
30162
  placeholder: field.placeholder,
30046
- required: field.required,
30163
+ required: true,
30047
30164
  };
30048
- return (h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
30165
+ return (h("div", { class: "expected-outcome-renderer__group" }, h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
30049
30166
  testCaseId,
30050
30167
  index,
30051
30168
  operation: 'add-chip',
@@ -30055,7 +30172,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30055
30172
  index,
30056
30173
  operation: 'remove-chip',
30057
30174
  value: e.detail.value,
30058
- }) }));
30175
+ }) }), renderEvaluationSelector(field, index)));
30059
30176
  }
30060
30177
  if (field.type === 'select') {
30061
30178
  const config = {
@@ -30063,26 +30180,26 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30063
30180
  fieldType: FormFieldType.SELECT,
30064
30181
  label: field.label,
30065
30182
  placeholder: field.placeholder,
30066
- required: field.required,
30183
+ required: true,
30067
30184
  optionList: field.options,
30068
30185
  };
30069
- return (h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
30186
+ return (h("div", { class: "expected-outcome-renderer__group" }, h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
30070
30187
  testCaseId,
30071
30188
  index,
30072
30189
  operation: 'set-value',
30073
30190
  value: e.detail.value,
30074
- }) }));
30191
+ }) }), renderEvaluationSelector(field, index)));
30075
30192
  }
30076
- return (h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
30193
+ return (h("div", { class: "expected-outcome-renderer__group" }, h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
30077
30194
  testCaseId,
30078
30195
  index,
30079
30196
  operation: 'set-value',
30080
30197
  value: e.target.value,
30081
- }) })));
30198
+ }) })), renderEvaluationSelector(field, index)));
30082
30199
  })));
30083
30200
  };
30084
30201
 
30085
- const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30202
+ const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30086
30203
  const questionConfig = {
30087
30204
  name: 'question',
30088
30205
  fieldType: FormFieldType.TEXT_AREA,
@@ -30092,26 +30209,17 @@ const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTes
30092
30209
  required: true,
30093
30210
  rows: 3,
30094
30211
  };
30095
- const evaluationConfig = {
30096
- name: 'EvaluationApproach',
30097
- fieldType: FormFieldType.SELECT,
30098
- label: 'Evaluation',
30099
- placeholder: 'Select evaluation approach…',
30100
- required: true,
30101
- optionList: EvaluationApproachValues,
30102
- defaultValue: EvaluationApproach.EXACT,
30103
- };
30104
30212
  return (h("div", { class: "test-case-row", key: testCase.id }, h("div", { class: "test-case-row__input-column" }, h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
30105
30213
  detail: {
30106
30214
  testCaseId: testCase.id,
30107
30215
  key: 'question',
30108
30216
  value: e.detail.value,
30109
30217
  },
30110
- }) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange }), h("app-select", { config: evaluationConfig, value: testCase.evaluationParameters?.approach, onValueChange: (e) => onUpdateApproach(testCase, e.detail.value) })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
30218
+ }) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
30111
30219
  };
30112
30220
 
30113
- const LLMTestCases = ({ testCases, onRun, onDelete, onUpdateApproach, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30114
- return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, onUpdateApproach: onUpdateApproach, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
30221
+ const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30222
+ return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
30115
30223
  };
30116
30224
 
30117
30225
  const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 
0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
@@ -30122,11 +30230,11 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
30122
30230
 
30123
30231
  const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
30124
30232
 
30125
- const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
30233
+ const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
30126
30234
 
30127
30235
  const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
30128
30236
 
30129
- const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__details{display:flex;flex-direction:column;gap:var(--spacing-2)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}.evaluation-summary__result-status{font-weight:var(--font-weight-semibold);font-size:var(--font-size-sm);padding:var(--spacing-2) var(--spacing-3);border-radius:var(--radius-md);text-align:center}.evaluation-summary__result-status--passed{background:var(--success);color:var(--success-foreground);border:var(--border-width) solid var(--success)}.evaluation-summary__result-status--failed{background:var(--destructive);color:var(--destructive-foreground);border:var(--border-width) solid var(--destructive)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
30237
+ const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__field-results{display:flex;flex-direction:column;gap:var(--spacing-2);margin-top:var(--spacing-2)}.evaluation-summary__field-result{border:var(--border-width) solid var(--border);border-radius:var(--radius-md);padding:var(--spacing-2);display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-header{display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-label{font-weight:var(--font-weight-semibold);font-size:var(--font-size-xs)}.evaluation-summary__field-approach{color:var(--muted-foreground);font-size:11px}.evaluation-summary__field-details{display:flex;flex-direction:column;gap:var(--spacing-1);font-size:var(--font-size-xs)}.evaluation-summary__field-status{width:fit-content;padding:2px var(--spacing-2);border-radius:var(--radius-sm);font-size:11px;font-weight:var(--font-weight-semibold);border:var(--border-width) solid transparent}.evaluation-summary__field-status--passed{background:var(--success);color:var(--success-foreground);border-color:var(--success)}.evaluation-summary__field-status--failed{background:var(--destructive);color:var(--destructive-foreground);border-color:var(--destructive)}.evaluation-summary__error-message{color:var(--destructive);font-size:var(--font-size-xs)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
30130
30238
 
30131
30239
  const responseOutputCss = () => `.response-output{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.response-output__content{background:var(--muted);border:var(--border-width) solid var(--border);border-radius:var(--radius);padding:var(--spacing-4);font-size:var(--font-size-sm);line-height:var(--line-height-relaxed);color:var(--foreground);white-space:pre-wrap;word-wrap:break-word;flex:1;overflow-y:auto;max-height:250px;overflow-x:scroll}.response-output__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}@media (max-width: 1200px){.response-output{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.response-output{padding:var(--spacing-4)}}`;
30132
30240
 
@@ -30159,9 +30267,6 @@ const LLMTestRunner = class {
30159
30267
  value: '',
30160
30268
  },
30161
30269
  ],
30162
- evaluationParameters: {
30163
- approach: EvaluationApproach.EXACT,
30164
- },
30165
30270
  isRunning: false,
30166
30271
  },
30167
30272
  ];
@@ -30268,52 +30373,13 @@ const LLMTestRunner = class {
30268
30373
  deleteTestCase(id) {
30269
30374
  this.testCases = this.testCases.filter(tc => tc.id !== id);
30270
30375
  }
30271
- updateApproach(testCase, approach) {
30272
- if (testCase) {
30273
- const updated = updateApproach(testCase, approach);
30274
- this.updateTestCase(testCase.id, {
30275
- evaluationParameters: updated.evaluationParameters,
30276
- });
30277
- }
30278
- }
30279
30376
  handleExpectedOutcomeChange = (event) => {
30280
- const { testCaseId, index, operation, value } = event.detail;
30377
+ const { testCaseId, ...change } = event.detail;
30281
30378
  this.testCases = this.testCases.map(tc => {
30282
- if (tc.id !== testCaseId)
30283
- return tc;
30284
- const expectedOutcome = [...(tc.expectedOutcome || [])];
30285
- const target = expectedOutcome[index];
30286
- if (!target)
30379
+ if (tc.id !== testCaseId) {
30287
30380
  return tc;
30288
- if (operation === 'set-value') {
30289
- if (target.type === 'chips-input') {
30290
- return tc;
30291
- }
30292
- expectedOutcome[index] = { ...target, value: value || '' };
30293
- return { ...tc, expectedOutcome };
30294
- }
30295
- if (operation === 'add-chip') {
30296
- if (target.type !== 'chips-input' || !value) {
30297
- return tc;
30298
- }
30299
- expectedOutcome[index] = {
30300
- ...target,
30301
- value: [...target.value, value],
30302
- };
30303
- return { ...tc, expectedOutcome };
30304
- }
30305
- if (operation === 'remove-chip') {
30306
- if (target.type !== 'chips-input' ||
30307
- !value) {
30308
- return tc;
30309
- }
30310
- expectedOutcome[index] = {
30311
- ...target,
30312
- value: target.value.filter(chip => chip !== value),
30313
- };
30314
- return { ...tc, expectedOutcome };
30315
30381
  }
30316
- return tc;
30382
+ return applyExpectedOutcomeChange(tc, change);
30317
30383
  });
30318
30384
  };
30319
30385
  async evaluateResponse(testCase) {
@@ -30413,7 +30479,7 @@ const LLMTestRunner = class {
30413
30479
  }
30414
30480
  }
30415
30481
  render() {
30416
- return (h("div", { key: '5cbdc388678929c271fd2a040aca8118344024c3', class: "test-runner-container" }, h(LLMTestRunnerHeader, { key: '92533803732fc5ec28da802ac9d367f9fbbffe72', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), h(ErrorMessage, { key: 'c16a0334b1a71d676a128de18a83991c2625a075', message: this.error, onClear: () => (this.error = '') }), h("div", { key: 'e757f49052a9516c12af858b46b32a957707524c', class: "test-runner-container__content" }, h(LLMTestCases, { key: 'e9a9f6553a3ce97aeb80924b116e1b73c2397b15', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onUpdateApproach: (testCase, approach) => this.updateApproach(testCase, approach), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
30482
+ return (h("div", { key: 'e3d007b453f770fcb59c29f8ee83bd8a35e82a34', class: "test-runner-container" }, h(LLMTestRunnerHeader, { key: 'b7c44bf4807fe8d9e5de514818420d67d2e0dbfb', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), h(ErrorMessage, { key: '697237ec0f8d2e704609fd0b240629f22c2a3ef6', message: this.error, onClear: () => (this.error = '') }), h("div", { key: '64a623f897dfb96d922ddc0cbdfcf529c52bef76', class: "test-runner-container__content" }, h(LLMTestCases, { key: '017da41567c5c13933d9cf31d1a972743bd9b100', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
30417
30483
  }
30418
30484
  };
30419
30485
  LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));