llm-testrunner-components 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/LICENSE +1 -1
  2. package/README.md +165 -242
  3. package/dist/cjs/index.cjs.js +305 -237
  4. package/dist/cjs/index.cjs.js.map +1 -1
  5. package/dist/cjs/llm-testrunner.cjs.js +1 -1
  6. package/dist/cjs/loader.cjs.js +1 -1
  7. package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +2 -2
  8. package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -1
  9. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
  10. package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
  11. package/dist/collection/components/llm-test-runner/llm-test-runner.js +27 -49
  12. package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
  13. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
  14. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
  15. package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
  16. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
  17. package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
  18. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
  19. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
  20. package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
  21. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
  22. package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
  23. package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
  24. package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
  25. package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
  26. package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
  27. package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
  28. package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
  29. package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +4 -3
  30. package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -1
  31. package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
  32. package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
  33. package/dist/collection/lib/evaluation/index.js +0 -4
  34. package/dist/collection/lib/evaluation/index.js.map +1 -1
  35. package/dist/collection/lib/evaluation/types.js.map +1 -1
  36. package/dist/collection/lib/import-export/test-results-csv.js +47 -33
  37. package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
  38. package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
  39. package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
  40. package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
  41. package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
  42. package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
  43. package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
  44. package/dist/collection/schemas/expected-outcome.js +20 -2
  45. package/dist/collection/schemas/expected-outcome.js.map +1 -1
  46. package/dist/collection/schemas/test-case.js +2 -20
  47. package/dist/collection/schemas/test-case.js.map +1 -1
  48. package/dist/collection/types/llm-test-runner.js.map +1 -1
  49. package/dist/collection/types/test-case.js.map +1 -1
  50. package/dist/components/index.js +1 -1
  51. package/dist/components/llm-test-runner.js +1 -1
  52. package/dist/components/p-JPMPoOC8.js +7 -0
  53. package/dist/components/p-JPMPoOC8.js.map +1 -0
  54. package/dist/esm/index.js +305 -237
  55. package/dist/esm/index.js.map +1 -1
  56. package/dist/esm/llm-testrunner.js +1 -1
  57. package/dist/esm/loader.js +1 -1
  58. package/dist/llm-testrunner/index.esm.js +2 -2
  59. package/dist/llm-testrunner/index.esm.js.map +1 -1
  60. package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
  61. package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +1 -0
  62. package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +1 -1
  63. package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
  64. package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
  65. package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
  66. package/dist/types/components.d.ts +9 -0
  67. package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
  68. package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
  69. package/dist/types/lib/evaluation/index.d.ts +0 -1
  70. package/dist/types/lib/evaluation/types.d.ts +26 -0
  71. package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
  72. package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
  73. package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
  74. package/dist/types/schemas/expected-outcome.d.ts +65 -17
  75. package/dist/types/schemas/test-case.d.ts +51 -95
  76. package/dist/types/types/llm-test-runner.d.ts +1 -1
  77. package/dist/types/types/test-case.d.ts +1 -1
  78. package/package.json +9 -2
  79. package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
  80. package/dist/components/p-BF90yb1z.js +0 -7
  81. package/dist/components/p-BF90yb1z.js.map +0 -1
  82. package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
package/dist/esm/index.js CHANGED
@@ -61,20 +61,6 @@ class RateLimitedFetcher {
61
61
  }
62
62
  }
63
63
 
64
- var EvaluationApproach;
65
- (function (EvaluationApproach) {
66
- EvaluationApproach["EXACT"] = "exact";
67
- EvaluationApproach["SEMANTIC"] = "semantic";
68
- EvaluationApproach["ROUGE_1"] = "rouge-1";
69
- EvaluationApproach["ROUGE_L"] = "rouge-L";
70
- EvaluationApproach["BLEU"] = "bleu";
71
- })(EvaluationApproach || (EvaluationApproach = {}));
72
- // Array of all evaluation approach values for UI components
73
- const EvaluationApproachValues = Object.values(EvaluationApproach);
74
- const DEFAULT_ROUGE_PASS_SCORE = 0.7;
75
- const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
76
- const DEFAULT_BLEU_PASS_SCORE = 0.7;
77
-
78
64
  /**
79
65
  * Reads a file asynchronously and returns its content as a string
80
66
  * @param file - The File object to read
@@ -117,23 +103,10 @@ function formatTestSuiteAsJson(testCases) {
117
103
  id: testCase.id,
118
104
  question: testCase.question,
119
105
  expectedOutcome: testCase.expectedOutcome,
120
- evaluationParameters: testCase.evaluationParameters,
121
106
  }));
122
107
  return JSON.stringify(exportData, null, 2);
123
108
  }
124
109
 
125
- function serializeExpectedOutcome(expectedOutcome, joinWith = '\n') {
126
- return (expectedOutcome || [])
127
- .map(field => {
128
- if (field.type === 'chips-input') {
129
- return field.value.join(', ');
130
- }
131
- return field.value;
132
- })
133
- .join(joinWith)
134
- .trim();
135
- }
136
-
137
110
  /**
138
111
  * Escapes a CSV field by wrapping it in quotes if it contains special characters
139
112
  * @param field - The field to escape
@@ -152,48 +125,63 @@ function escapeCsvField(field) {
152
125
  */
153
126
  function exportTestResultsToCsv(testCases) {
154
127
  const csvRows = [];
128
+ const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
155
129
  // Add header row
156
130
  const headers = [
157
131
  'Question',
158
- 'Expected Keywords',
159
- 'Generated Keywords',
160
- 'Keywords Match',
161
132
  'Response Time (s)',
162
- 'Evaluation Approach',
163
- 'Evaluation Score',
164
133
  ];
134
+ for (let i = 1; i <= maxFieldCount; i++) {
135
+ headers.push('Field Name');
136
+ headers.push('Expected Keywords');
137
+ headers.push('Generated Keywords');
138
+ headers.push('Evaluation Strategy');
139
+ headers.push('Passed Evaluation');
140
+ headers.push('Keyword Match');
141
+ headers.push('Score');
142
+ if (i < maxFieldCount) {
143
+ headers.push('');
144
+ }
145
+ }
165
146
  csvRows.push(headers.join(','));
166
- // Add data rows
147
+ // Add data rows (one row per test case)
167
148
  testCases.forEach(testCase => {
168
- const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
169
- const evaluationApproach = testCase.evaluationParameters?.approach || '';
170
- const score = testCase.evaluationResult?.evaluationApproachResult?.score;
171
- const evaluationScore = score !== undefined ? score.toString() : '';
172
- let generatedKeywords = '';
173
- let keywordsMatch = '';
174
- if (testCase.evaluationResult) {
175
- const foundKeywords = testCase.evaluationResult.keywordMatches
176
- .filter(match => match.found)
177
- .map(match => match.keyword);
178
- generatedKeywords = foundKeywords.join('; ');
179
- // Calculate match percentages
180
- const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
181
- const totalKeywords = testCase.evaluationResult.keywordMatches.length;
182
- keywordsMatch =
183
- totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
184
- }
185
149
  const responseTime = testCase.responseTime
186
150
  ? (testCase.responseTime / 1000).toFixed(3)
187
151
  : 'N/A';
188
- const row = [
189
- escapeCsvField(testCase.question),
190
- escapeCsvField(expectedOutcome),
191
- escapeCsvField(generatedKeywords),
192
- keywordsMatch,
193
- responseTime,
194
- escapeCsvField(evaluationApproach),
195
- escapeCsvField(evaluationScore),
196
- ];
152
+ const row = [escapeCsvField(testCase.question), responseTime];
153
+ for (let i = 0; i < maxFieldCount; i++) {
154
+ const field = testCase.expectedOutcome?.[i];
155
+ const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
156
+ const expectedKeywords = fieldResult?.expectedValue ??
157
+ (field
158
+ ? field.type === 'chips-input'
159
+ ? field.value.join(', ')
160
+ : field.value
161
+ : '');
162
+ const generatedKeywords = (fieldResult?.keywordMatches || [])
163
+ .filter(match => match.found)
164
+ .map(match => match.keyword)
165
+ .join('; ');
166
+ const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
167
+ const totalMatches = fieldResult?.keywordMatches?.length || 0;
168
+ const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
169
+ const score = fieldResult?.evaluationApproachResult?.score !== undefined
170
+ ? fieldResult.evaluationApproachResult.score.toFixed(2)
171
+ : '';
172
+ row.push(escapeCsvField(field?.label || ''));
173
+ row.push(escapeCsvField(expectedKeywords || ''));
174
+ row.push(escapeCsvField(generatedKeywords));
175
+ row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
176
+ field?.evaluationParameters?.approach ||
177
+ ''));
178
+ row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
179
+ row.push(keywordMatch);
180
+ row.push(score);
181
+ if (i < maxFieldCount - 1) {
182
+ row.push('');
183
+ }
184
+ }
197
185
  csvRows.push(row.join(','));
198
186
  });
199
187
  return csvRows.join('\n');
@@ -252,6 +240,43 @@ function v4(options, buf, offset) {
252
240
  return unsafeStringify(rnds);
253
241
  }
254
242
 
243
+ var EvaluationApproach;
244
+ (function (EvaluationApproach) {
245
+ EvaluationApproach["EXACT"] = "exact";
246
+ EvaluationApproach["SEMANTIC"] = "semantic";
247
+ EvaluationApproach["ROUGE_1"] = "rouge-1";
248
+ EvaluationApproach["ROUGE_L"] = "rouge-L";
249
+ EvaluationApproach["BLEU"] = "bleu";
250
+ })(EvaluationApproach || (EvaluationApproach = {}));
251
+ // Array of all evaluation approach values for UI components
252
+ const EvaluationApproachValues = Object.values(EvaluationApproach);
253
+ const DEFAULT_ROUGE_PASS_SCORE = 0.7;
254
+ const DEFAULT_SEMANTIC_PASS_SCORE = 0.7;
255
+ const DEFAULT_BLEU_PASS_SCORE = 0.7;
256
+
257
+ const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
258
+ function getAllowedApproachesForFieldType(fieldType) {
259
+ if (fieldType === 'select') {
260
+ return SELECT_ONLY_APPROACHES;
261
+ }
262
+ return EvaluationApproachValues;
263
+ }
264
+ function isApproachAllowedForFieldType(fieldType, approach) {
265
+ return getAllowedApproachesForFieldType(fieldType).includes(approach);
266
+ }
267
+ function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
268
+ const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
269
+ const fallbackApproach = allowedApproaches[0];
270
+ const rawApproach = evaluationParameters?.approach;
271
+ const approach = rawApproach && allowedApproaches.includes(rawApproach)
272
+ ? rawApproach
273
+ : fallbackApproach;
274
+ return {
275
+ ...evaluationParameters,
276
+ approach,
277
+ };
278
+ }
279
+
255
280
  const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
256
281
  {
257
282
  type: 'textarea',
@@ -260,6 +285,12 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
260
285
  rows: 2,
261
286
  },
262
287
  ];
288
+ function normalizeExpectedOutcomeField(field) {
289
+ return {
290
+ ...field,
291
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
292
+ };
293
+ }
263
294
  /**
264
295
  * Creates a new test case with default values
265
296
  * @returns A new TestCase object with a unique ID
@@ -269,9 +300,6 @@ function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA)
269
300
  id: v4(),
270
301
  question: '',
271
302
  expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
272
- evaluationParameters: {
273
- approach: EvaluationApproach.EXACT,
274
- },
275
303
  isRunning: false,
276
304
  };
277
305
  }
@@ -281,35 +309,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
281
309
  return {
282
310
  type: 'text',
283
311
  label: schemaField.label,
284
- required: schemaField.required,
285
312
  placeholder: schemaField.placeholder,
286
313
  value: '',
314
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
287
315
  };
288
316
  case 'textarea':
289
317
  return {
290
318
  type: 'textarea',
291
319
  label: schemaField.label,
292
- required: schemaField.required,
293
320
  placeholder: schemaField.placeholder,
294
321
  rows: schemaField.rows,
295
322
  value: '',
323
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
296
324
  };
297
325
  case 'chips-input':
298
326
  return {
299
327
  type: 'chips-input',
300
328
  label: schemaField.label,
301
- required: schemaField.required,
302
329
  placeholder: schemaField.placeholder,
303
330
  value: [],
331
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
304
332
  };
305
333
  case 'select':
306
334
  return {
307
335
  type: 'select',
308
336
  label: schemaField.label,
309
- required: schemaField.required,
310
337
  placeholder: schemaField.placeholder,
311
338
  value: '',
312
339
  options: schemaField.options,
340
+ evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
313
341
  };
314
342
  default: {
315
343
  const _exhaustiveCheck = schemaField;
@@ -320,32 +348,19 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
320
348
  function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
321
349
  return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
322
350
  }
323
- function migrateLegacyExpectedOutcomeString(value) {
324
- return [
325
- {
326
- type: 'textarea',
327
- label: 'Expected Outcome',
328
- value,
329
- },
330
- ];
331
- }
332
351
  /**
333
352
  * Creates a runtime test case from validated input data.
334
- * The input is expected to already satisfy `TestCaseInput` (legacy string or v2 shape),
335
- * and this function only performs normalization/defaulting (including legacy migration).
353
+ * The input is expected to already satisfy `TestCaseInput`,
354
+ * and this function only performs normalization/defaulting.
336
355
  *
337
356
  * @param data - Validated test case input
338
357
  * @returns A normalized TestCase object with runtime defaults applied
339
358
  */
340
359
  function createTestCaseFromInput(data) {
341
- let expectedOutcome;
342
- if (typeof data.expectedOutcome === 'string') {
343
- expectedOutcome = migrateLegacyExpectedOutcomeString(data.expectedOutcome);
344
- }
345
- else {
346
- expectedOutcome = data.expectedOutcome;
347
- }
348
- return { ...data, expectedOutcome };
360
+ return {
361
+ ...data,
362
+ expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
363
+ };
349
364
  }
350
365
 
351
366
  /** A special constant with type `never` */
@@ -4935,27 +4950,43 @@ function superRefine(fn) {
4935
4950
  const nonEmptyString = string().trim().min(1);
4936
4951
  const optionalPositiveInt = number().int().positive().optional();
4937
4952
  const optionalString = string().optional();
4938
- const optionalBoolean = boolean().optional();
4939
4953
  const selectOptionsSchema = array(nonEmptyString).min(1);
4954
+ const optionalNumber = number().optional();
4955
+ const evaluationParametersSchema = object({
4956
+ approach: _enum(EvaluationApproach),
4957
+ threshold: optionalNumber,
4958
+ });
4959
+ const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
4960
+ if (!isApproachAllowedForFieldType('select', parameters.approach)) {
4961
+ ctx.addIssue({
4962
+ code: 'custom',
4963
+ path: ['approach'],
4964
+ message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
4965
+ });
4966
+ }
4967
+ });
4940
4968
  const defaultExpectedOutcomeBaseSchema = object({
4941
4969
  label: nonEmptyString,
4942
- required: optionalBoolean,
4943
4970
  placeholder: optionalString,
4944
4971
  });
4945
4972
  const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
4946
4973
  text: baseSchema.extend({
4947
4974
  type: literal('text'),
4975
+ evaluationParameters: evaluationParametersSchema.optional(),
4948
4976
  }),
4949
4977
  textarea: baseSchema.extend({
4950
4978
  type: literal('textarea'),
4951
4979
  rows: optionalPositiveInt,
4980
+ evaluationParameters: evaluationParametersSchema.optional(),
4952
4981
  }),
4953
4982
  chipsInput: baseSchema.extend({
4954
4983
  type: literal('chips-input'),
4984
+ evaluationParameters: evaluationParametersSchema.optional(),
4955
4985
  }),
4956
4986
  select: baseSchema.extend({
4957
4987
  type: literal('select'),
4958
4988
  options: selectOptionsSchema,
4989
+ evaluationParameters: selectEvaluationParametersSchema.optional(),
4959
4990
  }),
4960
4991
  });
4961
4992
  function hasDuplicateChips(values) {
@@ -5017,33 +5048,16 @@ function validateExpectedOutcomeSchema(schema) {
5017
5048
  }
5018
5049
  }
5019
5050
 
5020
- const evaluationParametersSchema = object({
5021
- approach: _enum(EvaluationApproach),
5022
- threshold: number().optional(),
5023
- });
5024
- const baseTestCaseInputSchema = object({
5051
+ const testCaseInputSchema = object({
5025
5052
  id: string(),
5026
5053
  question: string(),
5027
- evaluationParameters: evaluationParametersSchema.optional(),
5028
- });
5029
- const legacyTestCaseInputSchema = baseTestCaseInputSchema.extend({
5030
- expectedOutcome: string(),
5031
- });
5032
- const v2TestCaseInputSchema = baseTestCaseInputSchema.extend({
5033
5054
  expectedOutcome: expectedOutcomeArraySchema,
5034
5055
  });
5035
- const testCaseInputSchema = union([
5036
- legacyTestCaseInputSchema,
5037
- v2TestCaseInputSchema,
5038
- ]);
5039
- const testCaseInputArraySchema = array(testCaseInputSchema).min(1, {
5040
- message: 'The test suite is empty. Please provide at least one test case.',
5041
- });
5056
+ const testCaseInputArraySchema = array(testCaseInputSchema);
5042
5057
  object({
5043
5058
  id: string(),
5044
5059
  question: string(),
5045
5060
  expectedOutcome: expectedOutcomeArraySchema,
5046
- evaluationParameters: evaluationParametersSchema.optional(),
5047
5061
  output: string().optional(),
5048
5062
  isRunning: boolean().optional(),
5049
5063
  error: string().optional(),
@@ -5094,19 +5108,69 @@ function importTestSuite(jsonContent) {
5094
5108
  }
5095
5109
  }
5096
5110
 
5111
+ function applyExpectedOutcomeChange(testCase, change) {
5112
+ const { index } = change;
5113
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5114
+ const target = expectedOutcome[index];
5115
+ if (!target) {
5116
+ return testCase;
5117
+ }
5118
+ switch (change.operation) {
5119
+ case 'set-value': {
5120
+ if (target.type === 'chips-input') {
5121
+ return testCase;
5122
+ }
5123
+ expectedOutcome[index] = {
5124
+ ...target,
5125
+ value: change.value,
5126
+ };
5127
+ return { ...testCase, expectedOutcome };
5128
+ }
5129
+ case 'add-chip': {
5130
+ if (target.type !== 'chips-input') {
5131
+ return testCase;
5132
+ }
5133
+ expectedOutcome[index] = {
5134
+ ...target,
5135
+ value: [...target.value, change.value],
5136
+ };
5137
+ return { ...testCase, expectedOutcome };
5138
+ }
5139
+ case 'remove-chip': {
5140
+ if (target.type !== 'chips-input') {
5141
+ return testCase;
5142
+ }
5143
+ expectedOutcome[index] = {
5144
+ ...target,
5145
+ value: target.value.filter(chip => chip !== change.value),
5146
+ };
5147
+ return { ...testCase, expectedOutcome };
5148
+ }
5149
+ case 'set-evaluation-approach':
5150
+ return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
5151
+ }
5152
+ }
5097
5153
  /**
5098
- * Updates the evaluation approach for a test case
5099
- * @param testCase - The test case to update
5100
- * @param approach - The new evaluation approach
5101
- * @returns Updated test case with the new evaluation approach
5154
+ * Updates the evaluation approach for a specific expected outcome field.
5155
+ * Select fields always use exact matching.
5102
5156
  */
5103
- function updateApproach(testCase, approach) {
5157
+ function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
5158
+ const expectedOutcome = [...(testCase.expectedOutcome || [])];
5159
+ const target = expectedOutcome[fieldIndex];
5160
+ if (!target) {
5161
+ return testCase;
5162
+ }
5163
+ const currentEvaluationParameters = target.evaluationParameters;
5164
+ expectedOutcome[fieldIndex] = {
5165
+ ...target,
5166
+ evaluationParameters: normalizeEvaluationParametersForField(target.type, {
5167
+ ...currentEvaluationParameters,
5168
+ approach,
5169
+ }),
5170
+ };
5104
5171
  return {
5105
5172
  ...testCase,
5106
- evaluationParameters: {
5107
- ...testCase.evaluationParameters,
5108
- approach: approach,
5109
- },
5173
+ expectedOutcome,
5110
5174
  };
5111
5175
  }
5112
5176
 
@@ -29552,6 +29616,7 @@ class SemanticEvaluator {
29552
29616
  }
29553
29617
  }
29554
29618
  async performEvaluation(request) {
29619
+ const threshold = request.evaluationParameters?.threshold ?? DEFAULT_SEMANTIC_PASS_SCORE;
29555
29620
  try {
29556
29621
  await this.initialize();
29557
29622
  // Split expectedOutcome by newlines to create keywords array
@@ -29561,7 +29626,7 @@ class SemanticEvaluator {
29561
29626
  .map(k => k.trim())
29562
29627
  .filter(k => k.length > 0)
29563
29628
  : [];
29564
- const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, DEFAULT_SEMANTIC_PASS_SCORE);
29629
+ const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, threshold);
29565
29630
  const totalItems = keywordMatches.length;
29566
29631
  // calculate the overall score by averaging the score of the keyword matches
29567
29632
  const keywordScore = keywordMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);
@@ -29569,7 +29634,7 @@ class SemanticEvaluator {
29569
29634
  const passed = keywordMatches.every(match => match.found);
29570
29635
  const evaluationParameters = {
29571
29636
  approach: EvaluationApproach.SEMANTIC,
29572
- threshold: DEFAULT_SEMANTIC_PASS_SCORE,
29637
+ threshold,
29573
29638
  };
29574
29639
  return {
29575
29640
  testCaseId: request.testCaseId,
@@ -29591,7 +29656,7 @@ class SemanticEvaluator {
29591
29656
  keywordMatches: [],
29592
29657
  evaluationParameters: {
29593
29658
  approach: EvaluationApproach.SEMANTIC,
29594
- threshold: DEFAULT_SEMANTIC_PASS_SCORE,
29659
+ threshold,
29595
29660
  },
29596
29661
  evaluationApproachResult: {
29597
29662
  score: 0,
@@ -29858,57 +29923,78 @@ function performBleuEvaluation(request) {
29858
29923
 
29859
29924
  class LLMEvaluationEngine {
29860
29925
  async evaluateResponse(request, callback) {
29861
- try {
29862
- const approach = request.evaluationParameters.approach;
29863
- switch (approach) {
29864
- case EvaluationApproach.BLEU: {
29865
- const bleuResult = performBleuEvaluation(request);
29866
- callback(bleuResult);
29867
- break;
29868
- }
29869
- case EvaluationApproach.EXACT: {
29870
- const exactResult = await performEvaluation(request);
29871
- callback(exactResult);
29872
- break;
29873
- }
29874
- case EvaluationApproach.ROUGE_1: {
29875
- const rougeResult = await performRouge1Evaluation(request);
29876
- callback(rougeResult);
29877
- break;
29878
- }
29879
- case EvaluationApproach.ROUGE_L: {
29880
- const rougeLResult = await performRougeLEvaluation(request);
29881
- callback(rougeLResult);
29882
- break;
29883
- }
29884
- case EvaluationApproach.SEMANTIC: {
29885
- const semanticResult = await performSemanticEvaluation(request);
29886
- callback(semanticResult);
29887
- break;
29888
- }
29889
- default: {
29890
- console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
29891
- const fallbackResult = await performEvaluation(request);
29892
- callback(fallbackResult);
29893
- }
29894
- }
29895
- }
29896
- catch (error) {
29897
- console.error('Evaluation failed:', error);
29898
- const errorResult = {
29926
+ const settledResults = await Promise.allSettled(request.fields.map(async (field) => {
29927
+ const fieldRequest = {
29899
29928
  testCaseId: request.testCaseId,
29929
+ question: request.question,
29930
+ actualResponse: request.actualResponse,
29931
+ expectedOutcome: field.expectedValue,
29932
+ evaluationParameters: field.evaluationParameters,
29933
+ };
29934
+ const result = await this.evaluateField(fieldRequest);
29935
+ const fieldResult = {
29936
+ index: field.index,
29937
+ label: field.label,
29938
+ type: field.type,
29939
+ expectedValue: field.expectedValue,
29940
+ passed: result.passed,
29941
+ keywordMatches: result.keywordMatches,
29942
+ evaluationParameters: result.evaluationParameters,
29943
+ evaluationApproachResult: result.evaluationApproachResult,
29944
+ };
29945
+ return fieldResult;
29946
+ }));
29947
+ const fieldResults = settledResults.map((settledResult, index) => {
29948
+ const field = request.fields[index];
29949
+ if (settledResult.status === 'fulfilled') {
29950
+ return settledResult.value;
29951
+ }
29952
+ return {
29953
+ index: field.index,
29954
+ label: field.label,
29955
+ type: field.type,
29956
+ expectedValue: field.expectedValue,
29900
29957
  passed: false,
29901
29958
  keywordMatches: [],
29902
- timestamp: new Date().toISOString(),
29903
- evaluationParameters: request.evaluationParameters,
29959
+ evaluationParameters: field.evaluationParameters,
29904
29960
  evaluationApproachResult: {
29905
29961
  score: 0,
29906
- approachUsed: EvaluationApproach.EXACT,
29962
+ approachUsed: field.evaluationParameters.approach,
29907
29963
  },
29964
+ error: this.getSafeErrorMessage(settledResult.reason),
29908
29965
  };
29909
- callback(errorResult);
29966
+ });
29967
+ const keywordMatches = fieldResults.flatMap(field => field.keywordMatches);
29968
+ const passed = fieldResults.every(field => field.passed && !field.error);
29969
+ callback({
29970
+ testCaseId: request.testCaseId,
29971
+ passed,
29972
+ keywordMatches,
29973
+ fieldResults,
29974
+ timestamp: new Date().toISOString(),
29975
+ });
29976
+ }
29977
+ async evaluateField(request) {
29978
+ const approach = request.evaluationParameters.approach;
29979
+ switch (approach) {
29980
+ case EvaluationApproach.BLEU:
29981
+ return performBleuEvaluation(request);
29982
+ case EvaluationApproach.EXACT:
29983
+ return performEvaluation(request);
29984
+ case EvaluationApproach.ROUGE_1:
29985
+ return performRouge1Evaluation(request);
29986
+ case EvaluationApproach.ROUGE_L:
29987
+ return performRougeLEvaluation(request);
29988
+ case EvaluationApproach.SEMANTIC:
29989
+ return performSemanticEvaluation(request);
29990
+ default:
29991
+ console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
29992
+ return performEvaluation(request);
29910
29993
  }
29911
29994
  }
29995
+ getSafeErrorMessage(error) {
29996
+ return error instanceof Error ? error.message : 'Field evaluation failed.';
29997
+ }
29912
29998
  }
29913
29999
 
29914
30000
  /**
@@ -29929,12 +30015,18 @@ class EvaluationService {
29929
30015
  console.warn('⚠️ No output to evaluate for test case:', testCase.id);
29930
30016
  return;
29931
30017
  }
30018
+ const fields = (testCase.expectedOutcome || []).map((field, index) => ({
30019
+ index,
30020
+ label: field.label,
30021
+ type: field.type,
30022
+ expectedValue: getFieldExpectedValue(field),
30023
+ evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
30024
+ }));
29932
30025
  const evaluationRequest = {
29933
30026
  testCaseId: testCase.id,
29934
30027
  question: testCase.question,
29935
- expectedOutcome: serializeExpectedOutcome(testCase.expectedOutcome),
29936
30028
  actualResponse: testCase.output,
29937
- evaluationParameters: testCase.evaluationParameters,
30029
+ fields,
29938
30030
  };
29939
30031
  await this.engine.evaluateResponse(evaluationRequest, (result) => {
29940
30032
  console.log('📊 Evaluation result received:', result);
@@ -29942,6 +30034,12 @@ class EvaluationService {
29942
30034
  });
29943
30035
  }
29944
30036
  }
30037
+ function getFieldExpectedValue(field) {
30038
+ if (field.type === 'chips-input') {
30039
+ return field.value.join(', ');
30040
+ }
30041
+ return field.value;
30042
+ }
29945
30043
 
29946
30044
  const Button = (props, children) => {
29947
30045
  const { variant = 'primary', size = 'md', disabled = false, loading = false, onClick, type = 'button', 'class': className = '', icon, 'aria-label': ariaLabel, } = props;
@@ -29963,7 +30061,7 @@ const Button = (props, children) => {
29963
30061
  return (h("button", { type: type, class: classes, disabled: disabled || loading, onClick: onClick, "aria-busy": loading, "aria-label": ariaLabel }, icon && h("span", { class: "icon" }, icon), children));
29964
30062
  };
29965
30063
 
29966
- const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isRunningAll, useSave = false, isSaving = false, onImport, onExportSuite, onExportResults, onRunAll, onSave, }) => {
30064
+ const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isRunningAll, useSave = false, isSaving = false, usePromptEditor = false, onImport, onExportSuite, onExportResults, onRunAll, onSave, }) => {
29967
30065
  let fileInputRef;
29968
30066
  const handleFileSelect = () => {
29969
30067
  fileInputRef?.click();
@@ -29976,7 +30074,7 @@ const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isR
29976
30074
  onImport(file);
29977
30075
  }
29978
30076
  };
29979
- return (h("header", { class: "test-runner-header" }, h("div", { class: "test-runner-header__left" }, h("input", { class: "test-runner-header--hidden", type: "file", ref: el => (fileInputRef = el), onChange: handleFileChange, accept: ".json,application/json" }), h(Button, { variant: "secondary", size: "md", onClick: handleFileSelect, icon: "\u2191" }, "Import Test Suite"), h(Button, { variant: "secondary", size: "md", onClick: onExportSuite, disabled: isExportingTestSuite, loading: isExportingTestSuite, icon: isExportingTestSuite ? '⏳' : '↓' }, isExportingTestSuite ? 'Exporting...' : 'Export Test Suite')), h("div", { class: "test-runner-header__right" }, h(Button, { variant: "secondary", size: "md", icon: "\u2699\uFE0F" }, "Prompt Editor"), h(Button, { variant: "secondary", size: "md", onClick: onExportResults, disabled: isExportingTestResults, loading: isExportingTestResults, icon: isExportingTestResults ? '⏳' : '↓' }, isExportingTestResults ? 'Exporting...' : 'Export Test Results'), useSave && (h(Button, { variant: "secondary", size: "md", onClick: onSave, disabled: isSaving, loading: isSaving, icon: isSaving ? '⏳' : '💾' }, isSaving ? 'Saving...' : 'Save')), h(Button, { "aria-label": "Run All", variant: "primary", size: "md", onClick: onRunAll, disabled: isRunningAll, loading: isRunningAll }, isRunningAll ? 'Running...' : 'Run All'))));
30077
+ return (h("header", { class: "test-runner-header" }, h("div", { class: "test-runner-header__left" }, h("input", { class: "test-runner-header--hidden", type: "file", ref: el => (fileInputRef = el), onChange: handleFileChange, accept: ".json,application/json" }), h(Button, { variant: "secondary", size: "md", onClick: handleFileSelect, icon: "\u2191" }, "Import Test Suite"), h(Button, { variant: "secondary", size: "md", onClick: onExportSuite, disabled: isExportingTestSuite, loading: isExportingTestSuite, icon: isExportingTestSuite ? '⏳' : '↓' }, isExportingTestSuite ? 'Exporting...' : 'Export Test Suite')), h("div", { class: "test-runner-header__right" }, usePromptEditor && (h(Button, { variant: "secondary", size: "md", icon: "\u2699\uFE0F" }, "Prompt Editor")), h(Button, { variant: "secondary", size: "md", onClick: onExportResults, disabled: isExportingTestResults, loading: isExportingTestResults, icon: isExportingTestResults ? '⏳' : '↓' }, isExportingTestResults ? 'Exporting...' : 'Export Test Results'), useSave && (h(Button, { variant: "secondary", size: "md", onClick: onSave, disabled: isSaving, loading: isSaving, icon: isSaving ? '⏳' : '💾' }, isSaving ? 'Saving...' : 'Save')), h(Button, { "aria-label": "Run All", variant: "primary", size: "md", onClick: onRunAll, disabled: isRunningAll, loading: isRunningAll }, isRunningAll ? 'Running...' : 'Run All'))));
29980
30078
  };
29981
30079
 
29982
30080
  const ResponseOutput = ({ output, isRunning, }) => {
@@ -29984,7 +30082,9 @@ const ResponseOutput = ({ output, isRunning, }) => {
29984
30082
  };
29985
30083
 
29986
30084
  const EvaluationSummary = ({ result, isRunning, }) => {
29987
- return (h("div", { class: "evaluation-summary" }, result ? (h("div", { class: "evaluation-summary__result" }, h("div", { class: `evaluation-summary__result-status evaluation-summary__result-status--${result.passed ? 'passed' : 'failed'}` }, result.passed ? '✅ PASSED' : '❌ FAILED'), h("div", { class: "evaluation-summary__details" }, "Keywords: ", result.keywordMatches.filter(m => m.found).length, "/", result.keywordMatches.length, " found"))) : (h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
30085
+ const fieldResults = result?.fieldResults || [];
30086
+ const hasFieldResults = fieldResults.length > 0;
30087
+ return (h("div", { class: "evaluation-summary" }, result ? (h("div", { class: "evaluation-summary__result" }, hasFieldResults ? (h("div", { class: "evaluation-summary__field-results" }, fieldResults.map(fieldResult => (h("div", { class: "evaluation-summary__field-result" }, h("div", { class: "evaluation-summary__field-header" }, h("span", { class: "evaluation-summary__field-label" }, fieldResult.label), h("span", { class: "evaluation-summary__field-approach" }, "Strategy: ", fieldResult.evaluationParameters.approach)), h("div", { class: "evaluation-summary__field-details" }, h("span", { class: `evaluation-summary__field-status evaluation-summary__field-status--${fieldResult.passed ? 'passed' : 'failed'}` }, fieldResult.passed ? 'PASSED' : 'FAILED'), fieldResult.error && (h("span", { class: "evaluation-summary__error-message" }, fieldResult.error)), h("span", null, "Score: ", fieldResult.evaluationApproachResult.score.toFixed(2)), h("span", null, "Matches:", ' ', fieldResult.keywordMatches.filter(match => match.found).length, "/", fieldResult.keywordMatches.length))))))) : null)) : (h("div", { class: "evaluation-summary__placeholder" }, isRunning ? 'Evaluating...' : ''))));
29988
30088
  };
29989
30089
 
29990
30090
  const IconButton = (props, children) => {
@@ -30020,6 +30120,24 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30020
30120
  const emit = (detail) => onExpectedOutcomeChange({
30021
30121
  detail,
30022
30122
  });
30123
+ const buildEvaluationConfig = (index, optionList) => ({
30124
+ name: `expectedOutcomeEvaluation-${index}`,
30125
+ fieldType: FormFieldType.SELECT,
30126
+ label: 'Evaluation Approach',
30127
+ placeholder: 'Select evaluation approach…',
30128
+ required: true,
30129
+ optionList,
30130
+ defaultValue: EvaluationApproach.EXACT,
30131
+ });
30132
+ const renderEvaluationSelector = (field, index) => {
30133
+ const optionList = getAllowedApproachesForFieldType(field.type);
30134
+ return (h("app-select", { config: buildEvaluationConfig(index, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
30135
+ testCaseId,
30136
+ index,
30137
+ operation: 'set-evaluation-approach',
30138
+ value: e.detail.value,
30139
+ }) }));
30140
+ };
30023
30141
  return (h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index) => {
30024
30142
  if (field.type === 'textarea') {
30025
30143
  const config = {
@@ -30027,15 +30145,15 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30027
30145
  fieldType: FormFieldType.TEXT_AREA,
30028
30146
  label: field.label,
30029
30147
  placeholder: field.placeholder,
30030
- required: field.required,
30148
+ required: true,
30031
30149
  rows: field.rows || 2,
30032
30150
  };
30033
- return (h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
30151
+ return (h("div", { class: "expected-outcome-renderer__group" }, h("app-textarea", { config: config, value: field.value, onValueChange: (e) => emit({
30034
30152
  testCaseId,
30035
30153
  index,
30036
30154
  operation: 'set-value',
30037
30155
  value: e.detail.value,
30038
- }) }));
30156
+ }) }), renderEvaluationSelector(field, index)));
30039
30157
  }
30040
30158
  if (field.type === 'chips-input') {
30041
30159
  const config = {
@@ -30043,9 +30161,9 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30043
30161
  fieldType: FormFieldType.CHIPS,
30044
30162
  label: field.label,
30045
30163
  placeholder: field.placeholder,
30046
- required: field.required,
30164
+ required: true,
30047
30165
  };
30048
- return (h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
30166
+ return (h("div", { class: "expected-outcome-renderer__group" }, h("app-chips", { config: config, value: field.value, onAddChip: (e) => emit({
30049
30167
  testCaseId,
30050
30168
  index,
30051
30169
  operation: 'add-chip',
@@ -30055,7 +30173,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30055
30173
  index,
30056
30174
  operation: 'remove-chip',
30057
30175
  value: e.detail.value,
30058
- }) }));
30176
+ }) }), renderEvaluationSelector(field, index)));
30059
30177
  }
30060
30178
  if (field.type === 'select') {
30061
30179
  const config = {
@@ -30063,26 +30181,26 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, onExpectedOutcomeChange,
30063
30181
  fieldType: FormFieldType.SELECT,
30064
30182
  label: field.label,
30065
30183
  placeholder: field.placeholder,
30066
- required: field.required,
30184
+ required: true,
30067
30185
  optionList: field.options,
30068
30186
  };
30069
- return (h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
30187
+ return (h("div", { class: "expected-outcome-renderer__group" }, h("app-select", { config: config, value: field.value, onValueChange: (e) => emit({
30070
30188
  testCaseId,
30071
30189
  index,
30072
30190
  operation: 'set-value',
30073
30191
  value: e.detail.value,
30074
- }) }));
30192
+ }) }), renderEvaluationSelector(field, index)));
30075
30193
  }
30076
- return (h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
30194
+ return (h("div", { class: "expected-outcome-renderer__group" }, h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
30077
30195
  testCaseId,
30078
30196
  index,
30079
30197
  operation: 'set-value',
30080
30198
  value: e.target.value,
30081
- }) })));
30199
+ }) })), renderEvaluationSelector(field, index)));
30082
30200
  })));
30083
30201
  };
30084
30202
 
30085
- const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30203
+ const LLMTestCaseRow = ({ testCase, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30086
30204
  const questionConfig = {
30087
30205
  name: 'question',
30088
30206
  fieldType: FormFieldType.TEXT_AREA,
@@ -30092,26 +30210,17 @@ const LLMTestCaseRow = ({ testCase, onRun, onDelete, onUpdateApproach, handleTes
30092
30210
  required: true,
30093
30211
  rows: 3,
30094
30212
  };
30095
- const evaluationConfig = {
30096
- name: 'EvaluationApproach',
30097
- fieldType: FormFieldType.SELECT,
30098
- label: 'Evaluation',
30099
- placeholder: 'Select evaluation approach…',
30100
- required: true,
30101
- optionList: EvaluationApproachValues,
30102
- defaultValue: EvaluationApproach.EXACT,
30103
- };
30104
30213
  return (h("div", { class: "test-case-row", key: testCase.id }, h("div", { class: "test-case-row__input-column" }, h("app-textarea", { config: questionConfig, value: testCase.question, onValueChange: (e) => handleTestCaseChange({
30105
30214
  detail: {
30106
30215
  testCaseId: testCase.id,
30107
30216
  key: 'question',
30108
30217
  value: e.detail.value,
30109
30218
  },
30110
- }) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange }), h("app-select", { config: evaluationConfig, value: testCase.evaluationParameters?.approach, onValueChange: (e) => onUpdateApproach(testCase, e.detail.value) })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
30219
+ }) }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], onExpectedOutcomeChange: onExpectedOutcomeChange })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
30111
30220
  };
30112
30221
 
30113
- const LLMTestCases = ({ testCases, onRun, onDelete, onUpdateApproach, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30114
- return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, onUpdateApproach: onUpdateApproach, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
30222
+ const LLMTestCases = ({ testCases, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, }) => {
30223
+ return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
30115
30224
  };
30116
30225
 
30117
30226
  const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 
0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
@@ -30122,11 +30231,11 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
30122
30231
 
30123
30232
  const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
30124
30233
 
30125
- const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
30234
+ const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
30126
30235
 
30127
30236
  const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
30128
30237
 
30129
- const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__details{display:flex;flex-direction:column;gap:var(--spacing-2)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}.evaluation-summary__result-status{font-weight:var(--font-weight-semibold);font-size:var(--font-size-sm);padding:var(--spacing-2) var(--spacing-3);border-radius:var(--radius-md);text-align:center}.evaluation-summary__result-status--passed{background:var(--success);color:var(--success-foreground);border:var(--border-width) solid var(--success)}.evaluation-summary__result-status--failed{background:var(--destructive);color:var(--destructive-foreground);border:var(--border-width) solid var(--destructive)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
30238
+ const evaluationSummaryCss = () => `.evaluation-summary{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.evaluation-summary__field-results{display:flex;flex-direction:column;gap:var(--spacing-2);margin-top:var(--spacing-2)}.evaluation-summary__field-result{border:var(--border-width) solid var(--border);border-radius:var(--radius-md);padding:var(--spacing-2);display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-header{display:flex;flex-direction:column;gap:var(--spacing-1)}.evaluation-summary__field-label{font-weight:var(--font-weight-semibold);font-size:var(--font-size-xs)}.evaluation-summary__field-approach{color:var(--muted-foreground);font-size:11px}.evaluation-summary__field-details{display:flex;flex-direction:column;gap:var(--spacing-1);font-size:var(--font-size-xs)}.evaluation-summary__field-status{width:fit-content;padding:2px var(--spacing-2);border-radius:var(--radius-sm);font-size:11px;font-weight:var(--font-weight-semibold);border:var(--border-width) solid transparent}.evaluation-summary__field-status--passed{background:var(--success);color:var(--success-foreground);border-color:var(--success)}.evaluation-summary__field-status--failed{background:var(--destructive);color:var(--destructive-foreground);border-color:var(--destructive)}.evaluation-summary__error-message{color:var(--destructive);font-size:var(--font-size-xs)}.evaluation-summary__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}.evaluation-summary__result{display:flex;flex-direction:column;gap:var(--spacing-2)}@media (max-width: 1200px){.evaluation-summary{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.evaluation-summary{padding:var(--spacing-4)}}`;
30130
30239
 
30131
30240
  const responseOutputCss = () => `.response-output{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border);display:flex;flex-direction:column}.response-output__content{background:var(--muted);border:var(--border-width) solid var(--border);border-radius:var(--radius);padding:var(--spacing-4);font-size:var(--font-size-sm);line-height:var(--line-height-relaxed);color:var(--foreground);white-space:pre-wrap;word-wrap:break-word;flex:1;overflow-y:auto;max-height:250px;overflow-x:scroll}.response-output__placeholder{display:flex;align-items:center;justify-content:center;color:var(--muted-foreground);font-style:italic;flex:1;background:var(--muted);border:2px dashed var(--border);border-radius:var(--radius)}@media (max-width: 1200px){.response-output{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.response-output{padding:var(--spacing-4)}}`;
30132
30241
 
@@ -30146,6 +30255,7 @@ const LLMTestRunner = class {
30146
30255
  save;
30147
30256
  delayMs = 500;
30148
30257
  useSave = false;
30258
+ usePromptEditor = false;
30149
30259
  initialTestCases;
30150
30260
  defaultExpectedOutcomeSchema;
30151
30261
  testCases = [
@@ -30159,9 +30269,6 @@ const LLMTestRunner = class {
30159
30269
  value: '',
30160
30270
  },
30161
30271
  ],
30162
- evaluationParameters: {
30163
- approach: EvaluationApproach.EXACT,
30164
- },
30165
30272
  isRunning: false,
30166
30273
  },
30167
30274
  ];
@@ -30268,52 +30375,13 @@ const LLMTestRunner = class {
30268
30375
  deleteTestCase(id) {
30269
30376
  this.testCases = this.testCases.filter(tc => tc.id !== id);
30270
30377
  }
30271
- updateApproach(testCase, approach) {
30272
- if (testCase) {
30273
- const updated = updateApproach(testCase, approach);
30274
- this.updateTestCase(testCase.id, {
30275
- evaluationParameters: updated.evaluationParameters,
30276
- });
30277
- }
30278
- }
30279
30378
  handleExpectedOutcomeChange = (event) => {
30280
- const { testCaseId, index, operation, value } = event.detail;
30379
+ const { testCaseId, ...change } = event.detail;
30281
30380
  this.testCases = this.testCases.map(tc => {
30282
- if (tc.id !== testCaseId)
30283
- return tc;
30284
- const expectedOutcome = [...(tc.expectedOutcome || [])];
30285
- const target = expectedOutcome[index];
30286
- if (!target)
30381
+ if (tc.id !== testCaseId) {
30287
30382
  return tc;
30288
- if (operation === 'set-value') {
30289
- if (target.type === 'chips-input') {
30290
- return tc;
30291
- }
30292
- expectedOutcome[index] = { ...target, value: value || '' };
30293
- return { ...tc, expectedOutcome };
30294
- }
30295
- if (operation === 'add-chip') {
30296
- if (target.type !== 'chips-input' || !value) {
30297
- return tc;
30298
- }
30299
- expectedOutcome[index] = {
30300
- ...target,
30301
- value: [...target.value, value],
30302
- };
30303
- return { ...tc, expectedOutcome };
30304
- }
30305
- if (operation === 'remove-chip') {
30306
- if (target.type !== 'chips-input' ||
30307
- !value) {
30308
- return tc;
30309
- }
30310
- expectedOutcome[index] = {
30311
- ...target,
30312
- value: target.value.filter(chip => chip !== value),
30313
- };
30314
- return { ...tc, expectedOutcome };
30315
30383
  }
30316
- return tc;
30384
+ return applyExpectedOutcomeChange(tc, change);
30317
30385
  });
30318
30386
  };
30319
30387
  async evaluateResponse(testCase) {
@@ -30413,7 +30481,7 @@ const LLMTestRunner = class {
30413
30481
  }
30414
30482
  }
30415
30483
  render() {
30416
- return (h("div", { key: '5cbdc388678929c271fd2a040aca8118344024c3', class: "test-runner-container" }, h(LLMTestRunnerHeader, { key: '92533803732fc5ec28da802ac9d367f9fbbffe72', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), h(ErrorMessage, { key: 'c16a0334b1a71d676a128de18a83991c2625a075', message: this.error, onClear: () => (this.error = '') }), h("div", { key: 'e757f49052a9516c12af858b46b32a957707524c', class: "test-runner-container__content" }, h(LLMTestCases, { key: 'e9a9f6553a3ce97aeb80924b116e1b73c2397b15', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onUpdateApproach: (testCase, approach) => this.updateApproach(testCase, approach), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
30484
+ return (h("div", { key: '323b5e140740bb72d4767c0763c382a6b125caa2', class: "test-runner-container" }, h(LLMTestRunnerHeader, { key: 'e1e2efdf6cfe5f406de7e26e745b5775f307d294', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, usePromptEditor: this.usePromptEditor, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), h(ErrorMessage, { key: 'c6a34b81f66c6cd835eb8bc253f7a28d68c49874', message: this.error, onClear: () => (this.error = '') }), h("div", { key: '674daad8a2754afc8144463e9a173690a3d1d589', class: "test-runner-container__content" }, h(LLMTestCases, { key: '96c1aeae37f56378b7a9b5d54be73c5df48ae448', testCases: this.testCases, onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange }))));
30417
30485
  }
30418
30486
  };
30419
30487
  LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));