llm-testrunner-components 1.1.0 → 1.2.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (82)
  1. package/LICENSE +1 -1
  2. package/README.md +165 -242
  3. package/dist/cjs/index.cjs.js +305 -237
  4. package/dist/cjs/index.cjs.js.map +1 -1
  5. package/dist/cjs/llm-testrunner.cjs.js +1 -1
  6. package/dist/cjs/loader.cjs.js +1 -1
  7. package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +2 -2
  8. package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -1
  9. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
  10. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
  11. package/dist/collection/components/llm-test-runner/llm-test-runner.js +27 -49
  12. package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
  13. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
  14. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
  15. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
  16. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
  17. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
  18. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
  19. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
  20. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
  21. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
  22. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
  23. package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
  24. package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
  25. package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
  26. package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
  27. package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
  28. package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
  29. package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +4 -3
  30. package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -1
  31. package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
  32. package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
  33. package/dist/collection/lib/evaluation/index.js +0 -4
  34. package/dist/collection/lib/evaluation/index.js.map +1 -1
  35. package/dist/collection/lib/evaluation/types.js.map +1 -1
  36. package/dist/collection/lib/import-export/test-results-csv.js +47 -33
  37. package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
  38. package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
  39. package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
  40. package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
  41. package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
  42. package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
  43. package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
  44. package/dist/collection/schemas/expected-outcome.js +20 -2
  45. package/dist/collection/schemas/expected-outcome.js.map +1 -1
  46. package/dist/collection/schemas/test-case.js +2 -20
  47. package/dist/collection/schemas/test-case.js.map +1 -1
  48. package/dist/collection/types/llm-test-runner.js.map +1 -1
  49. package/dist/collection/types/test-case.js.map +1 -1
  50. package/dist/components/index.js +1 -1
  51. package/dist/components/llm-test-runner.js +1 -1
  52. package/dist/components/p-JPMPoOC8.js +7 -0
  53. package/dist/components/p-JPMPoOC8.js.map +1 -0
  54. package/dist/esm/index.js +305 -237
  55. package/dist/esm/index.js.map +1 -1
  56. package/dist/esm/llm-testrunner.js +1 -1
  57. package/dist/esm/loader.js +1 -1
  58. package/dist/llm-testrunner/index.esm.js +2 -2
  59. package/dist/llm-testrunner/index.esm.js.map +1 -1
  60. package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
  61. package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +1 -0
  62. package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +1 -1
  63. package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
  64. package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
  65. package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
  66. package/dist/types/components.d.ts +9 -0
  67. package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
  68. package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
  69. package/dist/types/lib/evaluation/index.d.ts +0 -1
  70. package/dist/types/lib/evaluation/types.d.ts +26 -0
  71. package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
  72. package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
  73. package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
  74. package/dist/types/schemas/expected-outcome.d.ts +65 -17
  75. package/dist/types/schemas/test-case.d.ts +51 -95
  76. package/dist/types/types/llm-test-runner.d.ts +1 -1
  77. package/dist/types/types/test-case.d.ts +1 -1
  78. package/package.json +9 -2
  79. package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
  80. package/dist/components/p-BF90yb1z.js +0 -7
  81. package/dist/components/p-BF90yb1z.js.map +0 -1
  82. /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
@@ -64,20 +64,6 @@ class RateLimitedFetcher {
64
64
  }
65
65
  }
66
66
 
67
- var EvaluationApproach;
68
- (function (EvaluationApproach) {
69
- EvaluationApproach["EXACT"] = "exact";
70
- EvaluationApproach["SEMANTIC"] = "semantic";
71
- EvaluationApproach["ROUGE_1"] = "rouge-1";
72
- EvaluationApproach["ROUGE_L"] = "rouge-L";
73
- EvaluationApproach["BLEU"] = "bleu";
74
- })(EvaluationApproach || (EvaluationApproach = {}));
75
- // Array of all evaluation approach values for UI components
76
- const EvaluationApproachValues = Object.values(EvaluationApproach);
77
- const DEFAULT_ROUGE_PASS_SCORE = 0.7;
78
- const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
79
- const DEFAULT_BLEU_PASS_SCORE = 0.7;
80
-
81
67
  /**
82
68
  * Reads a file asynchronously and returns its content as a string
83
69
  * @param file - The File object to read
@@ -120,23 +106,10 @@ function formatTestSuiteAsJson(testCases) {
120
106
  id: testCase.id,
121
107
  question: testCase.question,
122
108
  expectedOutcome: testCase.expectedOutcome,
123
- evaluationParameters: testCase.evaluationParameters,
124
109
  }));
125
110
  return JSON.stringify(exportData, null, 2);
126
111
  }
127
112
 
128
- function serializeExpectedOutcome(expectedOutcome, joinWith = '\n') {
129
- return (expectedOutcome || [])
130
- .map(field => {
131
- if (field.type === 'chips-input') {
132
- return field.value.join(', ');
133
- }
134
- return field.value;
135
- })
136
- .join(joinWith)
137
- .trim();
138
- }
139
-
140
113
  /**
141
114
  * Escapes a CSV field by wrapping it in quotes if it contains special characters
142
115
  * @param field - The field to escape
@@ -155,48 +128,63 @@ function escapeCsvField(field) {
155
128
  */
156
129
  function exportTestResultsToCsv(testCases) {
157
130
  const csvRows = [];
131
+ const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
158
132
  // Add header row
159
133
  const headers = [
160
134
  'Question',
161
- 'Expected Keywords',
162
- 'Generated Keywords',
163
- 'Keywords Match',
164
135
  'Response Time (s)',
165
- 'Evaluation Approach',
166
- 'Evaluation Score',
167
136
  ];
137
+ for (let i = 1; i <= maxFieldCount; i++) {
138
+ headers.push('Field Name');
139
+ headers.push('Expected Keywords');
140
+ headers.push('Generated Keywords');
141
+ headers.push('Evaluation Strategy');
142
+ headers.push('Passed Evaluation');
143
+ headers.push('Keyword Match');
144
+ headers.push('Score');
145
+ if (i < maxFieldCount) {
146
+ headers.push('');
147
+ }
148
+ }
168
149
  csvRows.push(headers.join(','));
169
- // Add data rows
150
+ // Add data rows (one row per test case)
170
151
  testCases.forEach(testCase => {
171
- const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
172
- const evaluationApproach = testCase.evaluationParameters?.approach || '';
173
- const score = testCase.evaluationResult?.evaluationApproachResult?.score;
174
- const evaluationScore = score !== undefined ? score.toString() : '';
175
- let generatedKeywords = '';
176
- let keywordsMatch = '';
177
- if (testCase.evaluationResult) {
178
- const foundKeywords = testCase.evaluationResult.keywordMatches
179
- .filter(match => match.found)
180
- .map(match => match.keyword);
181
- generatedKeywords = foundKeywords.join('; ');
182
- // Calculate match percentages
183
- const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
184
- const totalKeywords = testCase.evaluationResult.keywordMatches.length;
185
- keywordsMatch =
186
- totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
187
- }
188
152
  const responseTime = testCase.responseTime
189
153
  ? (testCase.responseTime / 1000).toFixed(3)
190
154
  : 'N/A';
191
- const row = [
192
- escapeCsvField(testCase.question),
193
- escapeCsvField(expectedOutcome),
194
- escapeCsvField(generatedKeywords),
195
- keywordsMatch,
196
- responseTime,
197
- escapeCsvField(evaluationApproach),
198
- escapeCsvField(evaluationScore),
199
- ];
155
+ const row = [escapeCsvField(testCase.question), responseTime];
156
+ for (let i = 0; i < maxFieldCount; i++) {
157
+ const field = testCase.expectedOutcome?.[i];
158
+ const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
159
+ const expectedKeywords = fieldResult?.expectedValue ??
160
+ (field
161
+ ? field.type === 'chips-input'
162
+ ? field.value.join(', ')
163
+ : field.value
164
+ : '');
165
+ const generatedKeywords = (fieldResult?.keywordMatches || [])
166
+ .filter(match => match.found)
167
+ .map(match => match.keyword)
168
+ .join('; ');
169
+ const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
170
+ const totalMatches = fieldResult?.keywordMatches?.length || 0;
171
+ const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
172
+ const score = fieldResult?.evaluationApproachResult?.score !== undefined
173
+ ? fieldResult.evaluationApproachResult.score.toFixed(2)
174
+ : '';
175
+ row.push(escapeCsvField(field?.label || ''));
176
+ row.push(escapeCsvField(expectedKeywords || ''));
177
+ row.push(escapeCsvField(generatedKeywords));
178
+ row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
179
+ field?.evaluationParameters?.approach ||
180
+ ''));
181
+ row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
182
+ row.push(keywordMatch);
183
+ row.push(score);
184
+ if (i < maxFieldCount - 1) {
185
+ row.push('');
186
+ }
187
+ }
200
188
  csvRows.push(row.join(','));
201
189
  });
202
190
  return csvRows.join('\n');
@@ -255,6 +243,43 @@ function v4(options, buf, offset) {
255
243
  return unsafeStringify(rnds);
256
244
  }
257
245
 
246
+ var EvaluationApproach;
247
+ (function (EvaluationApproach) {
248
+ EvaluationApproach["EXACT"] = "exact";
249
+ EvaluationApproach["SEMANTIC"] = "semantic";
250
+ EvaluationApproach["ROUGE_1"] = "rouge-1";
251
+ EvaluationApproach["ROUGE_L"] = "rouge-L";
252
+ EvaluationApproach["BLEU"] = "bleu";
253
+ })(EvaluationApproach || (EvaluationApproach = {}));
254
+ // Array of all evaluation approach values for UI components
255
+ const EvaluationApproachValues = Object.values(EvaluationApproach);
256
+ const DEFAULT_ROUGE_PASS_SCORE = 0.7;
257
+ const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
258
+ const DEFAULT_BLEU_PASS_SCORE = 0.7;
259
+
260
+ const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
261
+ function getAllowedApproachesForFieldType(fieldType) {
262
+ if (fieldType === 'select') {
263
+ return SELECT_ONLY_APPROACHES;
264
+ }
265
+ return EvaluationApproachValues;
266
+ }
267
+ function isApproachAllowedForFieldType(fieldType, approach) {
268
+ return getAllowedApproachesForFieldType(fieldType).includes(approach);
269
+ }
270
+ function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
271
+ const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
272
+ const fallbackApproach = allowedApproaches[0];
273
+ const rawApproach = evaluationParameters?.approach;
274
+ const approach = rawApproach && allowedApproaches.includes(rawApproach)
275
+ ? rawApproach
276
+ : fallbackApproach;
277
+ return {
278
+ ...evaluationParameters,
279
+ approach,
280
+ };
281
+ }
282
+
258
283
  const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
259
284
  {
260
285
  type: 'textarea',
@@ -263,6 +288,12 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
263
288
  rows: 2,
264
289
  },
265
290
  ];
291
+ function normalizeExpectedOutcomeField(field) {
292
+ return {
293
+ ...field,
294
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
295
+ };
296
+ }
266
297
  /**
267
298
  * Creates a new test case with default values
268
299
  * @returns A new TestCase object with a unique ID
@@ -272,9 +303,6 @@ function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA)
272
303
  id: v4(),
273
304
  question: '',
274
305
  expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
275
- evaluationParameters: {
276
- approach: EvaluationApproach.EXACT,
277
- },
278
306
  isRunning: false,
279
307
  };
280
308
  }
@@ -284,35 +312,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
284
312
  return {
285
313
  type: 'text',
286
314
  label: schemaField.label,
287
- required: schemaField.required,
288
315
  placeholder: schemaField.placeholder,
289
316
  value: '',
317
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
290
318
  };
291
319
  case 'textarea':
292
320
  return {
293
321
  type: 'textarea',
294
322
  label: schemaField.label,
295
- required: schemaField.required,
296
323
  placeholder: schemaField.placeholder,
297
324
  rows: schemaField.rows,
298
325
  value: '',
326
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
299
327
  };
300
328
  case 'chips-input':
301
329
  return {
302
330
  type: 'chips-input',
303
331
  label: schemaField.label,
304
- required: schemaField.required,
305
332
  placeholder: schemaField.placeholder,
306
333
  value: [],
334
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
307
335
  };
308
336
  case 'select':
309
337
  return {
310
338
  type: 'select',
311
339
  label: schemaField.label,
312
- required: schemaField.required,
313
340
  placeholder: schemaField.placeholder,
314
341
  value: '',
315
342
  options: schemaField.options,
343
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
316
344
  };
317
345
  default: {
318
346
  const _exhaustiveCheck = schemaField;
@@ -323,32 +351,19 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
323
351
  function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
324
352
  return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
325
353
  }
326
- function migrateLegacyExpectedOutcomeString(value) {
327
- return [
328
- {
329
- type: 'textarea',
330
- label: 'Expected Outcome',
331
- value,
332
- },
333
- ];
334
- }
335
354
  /**
336
355
  * Creates a runtime test case from validated input data.
337
- * The input is expected to already satisfy `TestCaseInput` (legacy string or v2 shape),
338
- * and this function only performs normalization/defaulting (including legacy migration).
356
+ * The input is expected to already satisfy `TestCaseInput`,
357
+ * and this function only performs normalization/defaulting.
339
358
  *
340
359
  * @param data - Validated test case input
341
360
  * @returns A normalized TestCase object with runtime defaults applied
342
361
  */
343
362
  function createTestCaseFromInput(data) {
344
- let expectedOutcome;
345
- if (typeof data.expectedOutcome === 'string') {
346
- expectedOutcome = migrateLegacyExpectedOutcomeString(data.expectedOutcome);
347
- }
348
- else {
349
- expectedOutcome = data.expectedOutcome;
350
- }
351
- return { ...data, expectedOutcome };
363
+ return {
364
+ ...data,
365
+ expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
366
+ };
352
367
  }
353
368
 
354
369
  /** A special constant with type `never` */
@@ -4938,27 +4953,43 @@ function superRefine(fn) {
4938
4953
  const nonEmptyString = string().trim().min(1);
4939
4954
  const optionalPositiveInt = number().int().positive().optional();
4940
4955
  const optionalString = string().optional();
4941
- const optionalBoolean = boolean().optional();
4942
4956
  const selectOptionsSchema = array(nonEmptyString).min(1);
4957
+ const optionalNumber = number().optional();
4958
+ const evaluationParametersSchema = object({
4959
+ approach: _enum(EvaluationApproach),
4960
+ threshold: optionalNumber,
4961
+ });
4962
+ const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
4963
+ if (!isApproachAllowedForFieldType('select', parameters.approach)) {
4964
+ ctx.addIssue({
4965
+ code: 'custom',
4966
+ path: ['approach'],
4967
+ message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
4968
+ });
4969
+ }
4970
+ });
4943
4971
  const defaultExpectedOutcomeBaseSchema = object({
4944
4972
  label: nonEmptyString,
4945
- required: optionalBoolean,
4946
4973
  placeholder: optionalString,
4947
4974
  });
4948
4975
  const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
4949
4976
  text: baseSchema.extend({
4950
4977
  type: literal('text'),
4978
+ evaluationParameters: evaluationParametersSchema.optional(),
4951
4979
  }),
4952
4980
  textarea: baseSchema.extend({
4953
4981
  type: literal('textarea'),
4954
4982
  rows: optionalPositiveInt,
4983
+ evaluationParameters: evaluationParametersSchema.optional(),
4955
4984
  }),
4956
4985
  chipsInput: baseSchema.extend({
4957
4986
  type: literal('chips-input'),
4987
+ evaluationParameters: evaluationParametersSchema.optional(),
4958
4988
  }),
4959
4989
  select: baseSchema.extend({
4960
4990
  type: literal('select'),
4961
4991
  options: selectOptionsSchema,
4992
+ evaluationParameters: selectEvaluationParametersSchema.optional(),
4962
4993
  }),
4963
4994
  });
4964
4995
  function hasDuplicateChips(values) {
@@ -5020,33 +5051,16 @@ function validateExpectedOutcomeSchema(schema) {
5020
5051
  }
5021
5052
  }
5022
5053
 
5023
- const evaluationParametersSchema = object({
5024
- approach: _enum(EvaluationApproach),
5025
- threshold: number().optional(),
5026
- });
5027
- const baseTestCaseInputSchema = object({
5054
+ const testCaseInputSchema = object({
5028
5055
  id: string(),
5029
5056
  question: string(),
5030
- evaluationParameters: evaluationParametersSchema.optional(),
5031
- });
5032
- const legacyTestCaseInputSchema = baseTestCaseInputSchema.extend({
5033
- expectedOutcome: string(),
5034
- });
5035
- const v2TestCaseInputSchema = baseTestCaseInputSchema.extend({
5036
5057
  expectedOutcome: expectedOutcomeArraySchema,
5037
5058
  });
5038
- const testCaseInputSchema = union([
5039
- legacyTestCaseInputSchema,
5040
- v2TestCaseInputSchema,
5041
- ]);
5042
- const testCaseInputArraySchema = array(testCaseInputSchema).min(1, {
5043
- message: 'The test suite is empty. Please provide at least one test case.',
5044
- });
5059
+ const testCaseInputArraySchema = array(testCaseInputSchema);
5045
5060
  object({
5046
5061
  id: string(),
5047
5062
  question: string(),
5048
5063
  expectedOutcome: expectedOutcomeArraySchema,
5049
- evaluationParameters: evaluationParametersSchema.optional(),
5050
5064
  output: string().optional(),
5051
5065
  isRunning: boolean().optional(),
5052
5066
  error: string().optional(),
@@ -5097,19 +5111,69 @@ function importTestSuite(jsonContent) {
5097
5111
  }
5098
5112
  }
5099
5113
 
5114
+ function applyExpectedOutcomeChange(testCase, change) {
5115
+ const { index } = change;
5116
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5117
+ const target = expectedOutcome[index];
5118
+ if (!target) {
5119
+ return testCase;
5120
+ }
5121
+ switch (change.operation) {
5122
+ case 'set-value': {
5123
+ if (target.type === 'chips-input') {
5124
+ return testCase;
5125
+ }
5126
+ expectedOutcome[index] = {
5127
+ ...target,
5128
+ value: change.value,
5129
+ };
5130
+ return { ...testCase, expectedOutcome };
5131
+ }
5132
+ case 'add-chip': {
5133
+ if (target.type !== 'chips-input') {
5134
+ return testCase;
5135
+ }
5136
+ expectedOutcome[index] = {
5137
+ ...target,
5138
+ value: [...target.value, change.value],
5139
+ };
5140
+ return { ...testCase, expectedOutcome };
5141
+ }
5142
+ case 'remove-chip': {
5143
+ if (target.type !== 'chips-input') {
5144
+ return testCase;
5145
+ }
5146
+ expectedOutcome[index] = {
5147
+ ...target,
5148
+ value: target.value.filter(chip => chip !== change.value),
5149
+ };
5150
+ return { ...testCase, expectedOutcome };
5151
+ }
5152
+ case 'set-evaluation-approach':
5153
+ return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
5154
+ }
5155
+ }
5100
5156
  /**
5101
- * Updates the evaluation approach for a test case
5102
- * @param testCase - The test case to update
5103
- * @param approach - The new evaluation approach
5104
- * @returns Updated test case with the new evaluation approach
5157
+ * Updates the evaluation approach for a specific expected outcome field.
5158
+ * Select fields always use exact matching.
5105
5159
  */
5106
- function updateApproach(testCase, approach) {
5160
+ function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
5161
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5162
+ const target = expectedOutcome[fieldIndex];
5163
+ if (!target) {
5164
+ return testCase;
5165
+ }
5166
+ const currentEvaluationParameters = target.evaluationParameters;
5167
+ expectedOutcome[fieldIndex] = {
5168
+ ...target,
5169
+ evaluationParameters: normalizeEvaluationParametersForField(target.type, {
5170
+ ...currentEvaluationParameters,
5171
+ approach,
5172
+ }),
5173
+ };
5107
5174
  return {
5108
5175
  ...testCase,
5109
- evaluationParameters: {
5110
- ...testCase.evaluationParameters,
5111
- approach: approach,
5112
- },
5176
+ expectedOutcome,
5113
5177
  };
5114
5178
  }
5115
5179
 
@@ -29555,6 +29619,7 @@ class SemanticEvaluator {
29555
29619
  }
29556
29620
  }
29557
29621
  async performEvaluation(request) {
29622
+ const threshold = request.evaluationParameters?.threshold ?? DEFAULT_SEMANTIC_PASS_SCORE;
29558
29623
  try {
29559
29624
  await this.initialize();
29560
29625
  // Split expectedOutcome by newlines to create keywords array
@@ -29564,7 +29629,7 @@ class SemanticEvaluator {
29564
29629
  .map(k => k.trim())
29565
29630
  .filter(k => k.length > 0)
29566
29631
  : [];
29567
- const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, DEFAULT_SEMANTIC_PASS_SCORE);
29632
+ const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, threshold);
29568
29633
  const totalItems = keywordMatches.length;
29569
29634
  // calculate the overall score by averaging the score of the keyword matches
29570
29635
  const keywordScore = keywordMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);
@@ -29572,7 +29637,7 @@ class SemanticEvaluator {
29572
29637
  const passed = keywordMatches.every(match => match.found);
29573
29638
  const evaluationParameters = {
29574
29639
  approach: EvaluationApproach.SEMANTIC,
29575
- threshold: DEFAULT_SEMANTIC_PASS_SCORE,
29640
+ threshold,
29576
29641
  };
29577
29642
  return {
29578
29643
  testCaseId: request.testCaseId,
@@ -29594,7 +29659,7 @@ class SemanticEvaluator {
29594
29659
  keywordMatches: [],
29595
29660
  evaluationParameters: {
29596
29661
  approach: EvaluationApproach.SEMANTIC,
29597
- threshold: DEFAULT_SEMANTIC_PASS_SCORE,
29662
+ threshold,
29598
29663
  },
29599
29664
  evaluationApproachResult: {
29600
29665
  score: 0,
@@ -29861,57 +29926,78 @@ function performBleuEvaluation(request) {
29861
29926
 
29862
29927
  class LLMEvaluationEngine {
29863
29928
  async evaluateResponse(request, callback) {
29864
- try {
29865
- const approach = request.evaluationParameters.approach;
29866
- switch (approach) {
29867
- case EvaluationApproach.BLEU: {
29868
- const bleuResult = performBleuEvaluation(request);
29869
- callback(bleuResult);
29870
- break;
29871
- }
29872
- case EvaluationApproach.EXACT: {
29873
- const exactResult = await performEvaluation(request);
29874
- callback(exactResult);
29875
- break;
29876
- }
29877
- case EvaluationApproach.ROUGE_1: {
29878
- const rougeResult = await performRouge1Evaluation(request);
29879
- callback(rougeResult);
29880
- break;
29881
- }
29882
- case EvaluationApproach.ROUGE_L: {
29883
- const rougeLResult = await performRougeLEvaluation(request);
29884
- callback(rougeLResult);
29885
- break;
29886
- }
29887
- case EvaluationApproach.SEMANTIC: {
29888
- const semanticResult = await performSemanticEvaluation(request);
29889
- callback(semanticResult);
29890
- break;
29891
- }
29892
- default: {
29893
- console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
29894
- const fallbackResult = await performEvaluation(request);
29895
- callback(fallbackResult);
29896
- }
29897
- }
29898
- }
29899
- catch (error) {
29900
- console.error('Evaluation failed:', error);
29901
- const errorResult = {
29929
+ const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
29930
+ const fieldRequest = {
29902
29931
  testCaseId: request.testCaseId,
29932
+ question: request.question,
29933
+ actualResponse: request.actualResponse,
29934
+ expectedOutcome: field.expectedValue,
29935
+ evaluationParameters: field.evaluationParameters,
29936
+ };
29937
+ const result = await this.evaluateField(fieldRequest);
29938
+ const fieldResult = {
29939
+ index: field.index,
29940
+ label: field.label,
29941
+ type: field.type,
29942
+ expectedValue: field.expectedValue,
29943
+ passed: result.passed,
29944
+ keywordMatches: result.keywordMatches,
29945
+ evaluationParameters: result.evaluationParameters,
29946
+ evaluationApproachResult: result.evaluationApproachResult,
29947
+ };
29948
+ return fieldResult;
29949
+ }));
29950
+ const fieldResults = settledResults.map((settledResult, index) => {
29951
+ const field = request.fields[index];
29952
+ if (settledResult.status === 'fulfilled') {
29953
+ return settledResult.value;
29954
+ }
29955
+ return {
29956
+ index: field.index,
29957
+ label: field.label,
29958
+ type: field.type,
29959
+ expectedValue: field.expectedValue,
29903
29960
  passed: false,
29904
29961
  keywordMatches: [],
29905
- timestamp: new Date().toISOString(),
29906
- evaluationParameters: request.evaluationParameters,
29962
+ evaluationParameters: field.evaluationParameters,
29907
29963
  evaluationApproachResult: {
29908
29964
  score: 0,
29909
- approachUsed: EvaluationApproach.EXACT,
29965
+ approachUsed: field.evaluationParameters.approach,
29910
29966
  },
29967
+ error: this.getSafeErrorMessage(settledResult.reason),
29911
29968
  };
29912
- callback(errorResult);
29969
+ });
29970
+ const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
29971
+ const passed = fieldResults.every(field => field.passed && !field.error);
29972
+ callback({
29973
+ testCaseId: request.testCaseId,
29974
+ passed,
29975
+ keywordMatches,
29976
+ fieldResults,
29977
+ timestamp: new Date().toISOString(),
29978
+ });
29979
+ }
29980
+ async evaluateField(request) {
29981
+ const approach = request.evaluationParameters.approach;
29982
+ switch (approach) {
29983
+ case EvaluationApproach.BLEU:
29984
+ return performBleuEvaluation(request);
29985
+ case EvaluationApproach.EXACT:
29986
+ return performEvaluation(request);
29987
+ case EvaluationApproach.ROUGE_1:
29988
+ return performRouge1Evaluation(request);
29989
+ case EvaluationApproach.ROUGE_L:
29990
+ return performRougeLEvaluation(request);
29991
+ case EvaluationApproach.SEMANTIC:
29992
+ return performSemanticEvaluation(request);
29993
+ default:
29994
+ console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
29995
+ return performEvaluation(request);
29913
29996
  }
29914
29997
  }
29998
+ getSafeErrorMessage(error) {
29999
+ return error instanceof Error ? error.message : 'Field evaluation failed.';
30000
+ }
29915
30001
  }
29916
30002
 
29917
30003
  /**
@@ -29932,12 +30018,18 @@ class EvaluationService {
29932
30018
  console.warn('⚠️ No output to evaluate for test case:', testCase.id);
29933
30019
  return;
29934
30020
  }
30021
+ const fields = (testCase.expectedOutcome || []).map((field, index) => ({
30022
+ index,
30023
+ label: field.label,
30024
+ type: field.type,
30025
+ expectedValue: getFieldExpectedValue(field),
30026
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
30027
+ }));
29935
30028
  const evaluationRequest = {
29936
30029
  testCaseId: testCase.id,
29937
30030
  question: testCase.question,
29938
- expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
29939
30031
  actualResponse: testCase.output,
29940
- evaluationParameters: testCase.evaluationParameters,
30032
+ fields,
29941
30033
  };
29942
30034
  await this.engine.evaluateResponse(evaluationRequest, (result) => {
29943
30035
  console.log('📊 Evaluation result received:', result);
@@ -29945,6 +30037,12 @@ class EvaluationService {
29945
30037
  });
29946
30038
  }
29947
30039
  }
30040
+ function getFieldExpectedValue(field) {
30041
+ if (field.type === 'chips-input') {
30042
+ return field.value.join(', ');
30043
+ }
30044
+ return field.value;
30045
+ }
29948
30046
 
29949
30047
  const Button = (props, children) => {
29950
30048
  const { variant = 'primary', size = 'md', disabled = false, loading = false, onClick, type = 'button', 'class': className = '', icon, 'aria-label': ariaLabel, } = props;
@@ -29966,7 +30064,7 @@ const Button = (props, children) => {
29966
30064
  return (index.h("button", { type: type, class: classes, disabled: disabled || loading, onClick: onClick, "aria-busy": loading, "aria-label": ariaLabel }, icon && index.h("span", { class: "icon" }, icon), children));
29967
30065
  };
29968
30066
 
29969
- const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isRunningAll, useSave = false, isSaving = false, onImport, onExportSuite, onExportResults, onRunAll, onSave, }) => {
30067
+ const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isRunningAll, useSave = false, isSaving = false, usePromptEditor = false, onImport, onExportSuite, onExportResults, onRunAll, onSave, }) => {
29970
30068
  let fileInputRef;
29971
30069
  const handleFileSelect = () => {
29972
30070
  fileInputRef?.click();
@@ -29979,7 +30077,7 @@ const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isR
29979
30077
  onImport(file);
29980
30078
  }
29981
30079
  };
29982
- return (index.h("header", { class: "test-runner-header" }, index.h("div", { class: "test-runner-header__left" }, index.h("input", { class: "test-runner-header--hidden", type: "file", ref: el => (fileInputRef = el), onChange: handleFileChange, accept: ".json,application/json" }), index.h(Button, { variant: "secondary", size: "md", onClick: handleFileSelect, icon: "\u2191" }, "Import Test Suite"), index.h(Button, { variant: "secondary", size: "md", onClick: onExportSuite, disabled: isExportingTestSuite, loading: isExportingTestSuite, icon: isExportingTestSuite ? '⏳' : '↓' }, isExportingTestSuite ? 'Exporting...' : 'Export Test Suite')), index.h("div", { class: "test-runner-header__right" }, index.h(Button, { variant: "secondary", size: "md", icon: "\u2699\uFE0F" }, "Prompt Editor"), index.h(Button, { variant: "secondary", size: "md", onClick: onExportResults, disabled: isExportingTestResults, loading: isExportingTestResults, icon: isExportingTestResults ? '⏳' : '↓' }, isExportingTestResults ? 'Exporting...' : 'Export Test Results'), useSave && (index.h(Button, { variant: "secondary", size: "md", onClick: onSave, disabled: isSaving, loading: isSaving, icon: isSaving ? '⏳' : '💾' }, isSaving ? 'Saving...' : 'Save')), index.h(Button, { "aria-label": "Run All", variant: "primary", size: "md", onClick: onRunAll, disabled: isRunningAll, loading: isRunningAll }, isRunningAll ? 'Running...' : 'Run All'))));
30080
+ return (index.h("header", { class: "test-runner-header" }, index.h("div", { class: "test-runner-header__left" }, index.h("input", { class: "test-runner-header--hidden", type: "file", ref: el => (fileInputRef = el), onChange: handleFileChange, accept: ".json,application/json" }), index.h(Button, { variant: "secondary", size: "md", onClick: handleFileSelect, icon: "\u2191" }, "Import Test Suite"), index.h(Button, { variant: "secondary", size: "md", onClick: onExportSuite, disabled: isExportingTestSuite, loading: isExportingTestSuite, icon: isExportingTestSuite ? '⏳' : '↓' }, isExportingTestSuite ? 'Exporting...' : 'Export Test Suite')), index.h("div", { class: "test-runner-header__right" }, usePromptEditor && (index.h(Button, { variant: "secondary", size: "md", icon: "\u2699\uFE0F" }, "Prompt Editor")), index.h(Button, { variant: "secondary", size: "md", onClick: onExportResults, disabled: isExportingTestResults, loading: isExportingTestResults, icon: isExportingTestResults ? '⏳' : '↓' }, isExportingTestResults ? 'Exporting...' : 'Export Test Results'), useSave && (index.h(Button, { variant: "secondary", size: "md", onClick: onSave, disabled: isSaving, loading: isSaving, icon: isSaving ? '⏳' : '💾' }, isSaving ? 'Saving...' : 'Save')), index.h(Button, { "aria-label": "Run All", variant: "primary", size: "md", onClick: onRunAll, disabled: isRunningAll, loading: isRunningAll }, isRunningAll ? 'Running...' : 'Run All'))));
29983
30081
  };
29984
30082
 
29985
30083
  const ResponseOutput = ({ output, isRunning, }) => {
@@ -29987,7 +30085,9 @@ const ResponseOutput = ({ output, isRunning, }) => {
29987
30085
  };
29988
30086
 
29989
30087
  const EvaluationSummary = ({ result, isRunning, }) => {
29990
- return (index.h("div", { class: "evaluation-summary" }, result ? (index.h("div", { class: "evaluation-summary__result" }, index.h("div", { class: `evaluation-summary__result-status evaluation-summary__result-status--${result.passed ? 'passed' : 'failed'}` }, result.passed ? '✅ PASSED' : '❌ FAILED'), index.h("div", { class: "evaluation-summary__details" }, "Keywords: ", result.keywordMatches.filter(m => m.found).length, "/", result.keywordMatches.length, " found"))) : (index.h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
30088
+ const fieldResults = result?.fieldResults || [];
30089
+ const hasFieldResults = fieldResults.length > 0;
30090
+ return (index.h("div", { class: "evaluation-summary" }, result ? (index.h("div", { class: "evaluation-summary__result" }, hasFieldResults ? (index.h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(fieldResult => (index.h("div", { class: "evaluation-summary__field-result" }, index.h("div", { class: "evaluation-summary__field-header" }, index.h("span", { class: "evaluation-summary__field-label" }, fieldResult.label), index.h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", fieldResult.evaluationParameters.approach)), index.h("div", { class: "evaluation-summary__field-details" }, index.h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'), fieldResult.error && (index.h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)), index.h("span", null, "Score: ", fieldResult.evaluationApproachResult.score.toFixed(2)), index.h("span", null, "Matches:", ' ', fieldResult.keywordMatches.filter(match => match.found).length, "/", fieldResult.keywordMatches.length))))))) : null)) : (index.h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
29991
30091
  };
29992
30092
 
29993
30093
  const IconButton = (props, children) => {
@@ -30023,6 +30123,24 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30023
30123
  const emit = (detail) => onExpectedOutcomeChange({
30024
30124
  detail,
30025
30125
  });
30126
+ const buildEvaluationConfig = (index, optionList) => ({
30127
+ name: `expectedOutcomeEvaluation-${index}`,
30128
+ fieldType: FormFieldType.SELECT,
30129
+ label: 'Evaluation Approach',
30130
+ placeholder: 'Select evaluation approach…',
30131
+ required: true,
30132
+ optionList,
30133
+ defaultValue: EvaluationApproach.EXACT,
30134
+ });
30135
+ const renderEvaluationSelector = (field, index$1) => {
30136
+ const optionList = getAllowedApproachesForFieldType(field.type);
30137
+ return (index.h("app-select", { config: buildEvaluationConfig(index$1, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
30138
+ testCaseId,
30139
+ index: index$1,
30140
+ operation: 'set-evaluation-approach',
30141
+ value: e.detail.value,
30142
+ }) }));
30143
+ };
30026
30144
  return (index.h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index$1) => {
30027
30145
  if (field.type === 'textarea') {
30028
30146
  const config = {
@@ -30030,15 +30148,15 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30030
30148
  fieldType: FormFieldType.TEXT_AREA,
30031
30149
  label: field.label,
30032
30150
  placeholder: field.placeholder,
30033
- required: field.required,
30151
+ required: true,
30034
30152
  rows: field.rows || 2,
30035
30153
  };
30036
- return (index.h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
30154
+ return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
30037
30155
  testCaseId,
30038
30156
  index: index$1,
30039
30157
  operation: 'set-value',
30040
30158
  value: e.detail.value,
30041
- }) }));
30159
+ }) }), renderEvaluationSelector(field, index$1)));
30042
30160
  }
30043
30161
  if (field.type === 'chips-input') {
30044
30162
  const config = {
@@ -30046,9 +30164,9 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30046
30164
  fieldType: FormFieldType.CHIPS,
30047
30165
  label: field.label,
30048
30166
  placeholder: field.placeholder,
30049
- required: field.required,
30167
+ required: true,
30050
30168
  };
30051
- return (index.h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
30169
+ return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
30052
30170
  testCaseId,
30053
30171
  index: index$1,
30054
30172
  operation: 'add-chip',
@@ -30058,7 +30176,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30058
30176
  index: index$1,
30059
30177
  operation: 'remove-chip',
30060
30178
  value: e.detail.value,
30061
- }) }));
30179
+ }) }), renderEvaluationSelector(field, index$1)));
30062
30180
  }
30063
30181
  if (field.type === 'select') {
30064
30182
  const config = {
@@ -30066,26 +30184,26 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30066
30184
  fieldType: FormFieldType.SELECT,
30067
30185
  label: field.label,
30068
30186
  placeholder: field.placeholder,
30069
- required: field.required,
30187
+ required: true,
30070
30188
  optionList: field.options,
30071
30189
  };
30072
- return (index.h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
30190
+ return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
30073
30191
  testCaseId,
30074
30192
  index: index$1,
30075
30193
  operation: 'set-value',
30076
30194
  value: e.detail.value,
30077
- }) }));
30195
+ }) }), renderEvaluationSelector(field, index$1)));
30078
30196
  }
30079
- return (index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
30197
+ return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
30080
30198
  testCaseId,
30081
30199
  index: index$1,
30082
30200
  operation: 'set-value',
30083
30201
  value: e.target.value,
30084
- }) })));
30202
+ }) })), renderEvaluationSelector(field, index$1)));
30085
30203
  })));
30086
30204
  };
30087
30205
 
30088
- const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30206
+ const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30089
30207
  const questionConfig = {
30090
30208
  name: 'question',
30091
30209
  fieldType: FormFieldType.TEXT_AREA,
@@ -30095,26 +30213,17 @@ const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTes
30095
30213
  required: true,
30096
30214
  rows: 3,
30097
30215
  };
30098
- const evaluationConfig = {
30099
- name: 'EvaluationApproach',
30100
- fieldType: FormFieldType.SELECT,
30101
- label: 'Evaluation',
30102
- placeholder: 'Select evaluation approach…',
30103
- required: true,
30104
- optionList: EvaluationApproachValues,
30105
- defaultValue: EvaluationApproach.EXACT,
30106
- };
30107
30216
  return (index.h("div", { class: "test-case-row", key: testCase.id }, index.h("div", { class: "test-case-row__input-column" }, index.h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
30108
30217
  detail: {
30109
30218
  testCaseId: testCase.id,
30110
30219
  key: 'question',
30111
30220
  value: e.detail.value,
30112
30221
  },
30113
- }) }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange }), index.h("app-select", { config: evaluationConfig, value: testCase.evaluationParameters?.approach, onValueChange: (e) => onUpdateApproach(testCase, e.detail.value) })), index.h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), index.h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), index.h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
30222
+ }) }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), index.h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), index.h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), index.h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
30114
30223
  };
30115
30224
 
30116
- const LLMTestCases = ({ testCases, onRun, onDelete, onUpdateApproach, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30117
- return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, onUpdateApproach: onUpdateApproach, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), index.h("div", { class: "test-cases__add-section" }, index.h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
30225
+ const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30226
+ return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), index.h("div", { class: "test-cases__add-section" }, index.h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
30118
30227
  };
30119
30228
 
30120
30229
  const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 
0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
@@ -30125,11 +30234,11 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
30125
30234
 
30126
30235
  const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
30127
30236
 
30128
- const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
30237
+ const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
30129
30238
 
30130
30239
  const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
30131
30240
 
30132
- const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__details{display:flex;flex-direction:column;gap:var(--spacing-2)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}.evaluation-summary__result-status{font-weight:var(--font-weight-semibold);font-size:var(--font-size-sm);padding:var(--spacing-2) var(--spacing-3);border-radius:var(--radius-md);text-align:center}.evaluation-summary__result-status--passed{background:var(--success);color:var(--success-foreground);border:var(--border-width) solid var(--success)}.evaluation-summary__result-status--failed{background:var(--destructive);color:var(--destructive-foreground);border:var(--border-width) solid var(--destructive)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
30241
+ const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__field-results{display:flex;flex-direction:column;gap:var(--spacing-2);margin-top:var(--spacing-2)}.evaluation-summary__field-result{border:var(--border-width) solid var(--border);border-radius:var(--radius-md);padding:var(--spacing-2);display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-header{display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-label{font-weight:var(--font-weight-semibold);font-size:var(--font-size-xs)}.evaluation-summary__field-approach{color:var(--muted-foreground);font-size:11px}.evaluation-summary__field-details{display:flex;flex-direction:column;gap:var(--spacing-1);font-size:var(--font-size-xs)}.evaluation-summary__field-status{width:fit-content;padding:2px var(--spacing-2);border-radius:var(--radius-sm);font-size:11px;font-weight:var(--font-weight-semibold);border:var(--border-width) solid transparent}.evaluation-summary__field-status--passed{background:var(--success);color:var(--success-foreground);border-color:var(--success)}.evaluation-summary__field-status--failed{background:var(--destructive);color:var(--destructive-foreground);border-color:var(--destructive)}.evaluation-summary__error-message{color:var(--destructive);font-size:var(--font-size-xs)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
30133
30242
 
30134
30243
  const responseOutputCss = () => `.response-output{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.response-output__content{background:var(--muted);border:var(--border-width) solid var(--border);border-radius:var(--radius);padding:var(--spacing-4);font-size:var(--font-size-sm);line-height:var(--line-height-relaxed);color:var(--foreground);white-space:pre-wrap;word-wrap:break-word;flex:1;overflow-y:auto;max-height:250px;overflow-x:scroll}.response-output__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}@media (max-width: 1200px){.response-output{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.response-output{padding:var(--spacing-4)}}`;
30135
30244
 
@@ -30149,6 +30258,7 @@ const LLMTestRunner = class {
30149
30258
  save;
30150
30259
  delayMs = 500;
30151
30260
  useSave = false;
30261
+ usePromptEditor = false;
30152
30262
  initialTestCases;
30153
30263
  defaultExpectedOutcomeSchema;
30154
30264
  testCases = [
@@ -30162,9 +30272,6 @@ const LLMTestRunner = class {
30162
30272
  value: '',
30163
30273
  },
30164
30274
  ],
30165
- evaluationParameters: {
30166
- approach: EvaluationApproach.EXACT,
30167
- },
30168
30275
  isRunning: false,
30169
30276
  },
30170
30277
  ];
@@ -30271,52 +30378,13 @@ const LLMTestRunner = class {
30271
30378
  deleteTestCase(id) {
30272
30379
  this.testCases = this.testCases.filter(tc => tc.id !== id);
30273
30380
  }
30274
- updateApproach(testCase, approach) {
30275
- if (testCase) {
30276
- const updated = updateApproach(testCase, approach);
30277
- this.updateTestCase(testCase.id, {
30278
- evaluationParameters: updated.evaluationParameters,
30279
- });
30280
- }
30281
- }
30282
30381
  handleExpectedOutcomeChange = (event) => {
30283
- const { testCaseId, index, operation, value } = event.detail;
30382
+ const { testCaseId, ...change } = event.detail;
30284
30383
  this.testCases = this.testCases.map(tc => {
30285
- if (tc.id !== testCaseId)
30286
- return tc;
30287
- const expectedOutcome = [...(tc.expectedOutcome || [])];
30288
- const target = expectedOutcome[index];
30289
- if (!target)
30384
+ if (tc.id !== testCaseId) {
30290
30385
  return tc;
30291
- if (operation === 'set-value') {
30292
- if (target.type === 'chips-input') {
30293
- return tc;
30294
- }
30295
- expectedOutcome[index] = { ...target, value: value || '' };
30296
- return { ...tc, expectedOutcome };
30297
- }
30298
- if (operation === 'add-chip') {
30299
- if (target.type !== 'chips-input' || !value) {
30300
- return tc;
30301
- }
30302
- expectedOutcome[index] = {
30303
- ...target,
30304
- value: [...target.value, value],
30305
- };
30306
- return { ...tc, expectedOutcome };
30307
- }
30308
- if (operation === 'remove-chip') {
30309
- if (target.type !== 'chips-input' ||
30310
- !value) {
30311
- return tc;
30312
- }
30313
- expectedOutcome[index] = {
30314
- ...target,
30315
- value: target.value.filter(chip => chip !== value),
30316
- };
30317
- return { ...tc, expectedOutcome };
30318
30386
  }
30319
- return tc;
30387
+ return applyExpectedOutcomeChange(tc, change);
30320
30388
  });
30321
30389
  };
30322
30390
  async evaluateResponse(testCase) {
@@ -30416,7 +30484,7 @@ const LLMTestRunner = class {
30416
30484
  }
30417
30485
  }
30418
30486
  render() {
30419
- return (index.h("div", { key: '5cbdc388678929c271fd2a040aca8118344024c3', class: "test-runner-container" }, index.h(LLMTestRunnerHeader, { key: '92533803732fc5ec28da802ac9d367f9fbbffe72', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), index.h(ErrorMessage, { key: 'c16a0334b1a71d676a128de18a83991c2625a075', message: this.error, onClear: () => (this.error = '') }), index.h("div", { key: 'e757f49052a9516c12af858b46b32a957707524c', class: "test-runner-container__content" }, index.h(LLMTestCases, { key: 'e9a9f6553a3ce97aeb80924b116e1b73c2397b15', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onUpdateApproach: (testCase, approach) => this.updateApproach(testCase, approach), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
30487
+ return (index.h("div", { key: '323b5e140740bb72d4767c0763c382a6b125caa2', class: "test-runner-container" }, index.h(LLMTestRunnerHeader, { key: 'e1e2efdf6cfe5f406de7e26e745b5775f307d294', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, usePromptEditor: this.usePromptEditor, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), index.h(ErrorMessage, { key: 'c6a34b81f66c6cd835eb8bc253f7a28d68c49874', message: this.error, onClear: () => (this.error = '') }), index.h("div", { key: '674daad8a2754afc8144463e9a173690a3d1d589', class: "test-runner-container__content" }, index.h(LLMTestCases, { key: '96c1aeae37f56378b7a9b5d54be73c5df48ae448', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
30420
30488
  }
30421
30489
  };
30422
30490
  LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));