llm-testrunner-components 1.2.4 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/dist/cjs/app-chips_5.cjs.entry.js +1 -1
- package/dist/cjs/app-chips_5.cjs.entry.js.map +1 -1
- package/dist/cjs/index.cjs.js +464 -66
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-testrunner.cjs.js +1 -1
- package/dist/cjs/loader.cjs.js +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +46 -13
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/chat-history.css +5 -5
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +45 -5
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +21 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.js +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.js.map +1 -1
- package/dist/collection/lib/evaluation/actual-value-resolver.js +52 -0
- package/dist/collection/lib/evaluation/actual-value-resolver.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluation-engine.js +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +55 -17
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-importer.js +7 -1
- package/dist/collection/lib/import-export/test-suite-importer.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +5 -0
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +58 -23
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +39 -0
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/model-response.js +7 -0
- package/dist/collection/schemas/model-response.js.map +1 -0
- package/dist/collection/schemas/test-case.js +2 -1
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/expected-outcome.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/components/chat-history.js +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-kmtfMXcQ.js +2 -0
- package/dist/components/p-kmtfMXcQ.js.map +1 -0
- package/dist/components/{p-B87Lt3z4.js → p-wzA48RFK.js} +3 -3
- package/dist/components/p-wzA48RFK.js.map +1 -0
- package/dist/esm/app-chips_5.entry.js +1 -1
- package/dist/esm/app-chips_5.entry.js.map +1 -1
- package/dist/esm/index.js +464 -66
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-testrunner.js +1 -1
- package/dist/esm/loader.js +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/llm-testrunner/{p-21202f12.entry.js → p-5bf1fc78.entry.js} +2 -2
- package/dist/llm-testrunner/{p-21202f12.entry.js.map → p-5bf1fc78.entry.js.map} +1 -1
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +3 -4
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/output/response-output.d.ts +2 -1
- package/dist/types/components.d.ts +4 -2
- package/dist/types/lib/evaluation/actual-value-resolver.d.ts +9 -0
- package/dist/types/lib/evaluation/evaluation-service.d.ts +2 -2
- package/dist/types/lib/evaluation/types.d.ts +1 -1
- package/dist/types/lib/import-export/test-suite-importer.d.ts +1 -1
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +10 -1
- package/dist/types/schemas/expected-outcome.d.ts +116 -0
- package/dist/types/schemas/model-response.d.ts +7 -0
- package/dist/types/schemas/test-case.d.ts +76 -1
- package/dist/types/types/expected-outcome.d.ts +1 -1
- package/dist/types/types/llm-test-runner.d.ts +4 -2
- package/package.json +1 -1
- package/dist/components/p-B87Lt3z4.js.map +0 -1
- package/dist/components/p-Bx2jqguC.js +0 -2
- package/dist/components/p-Bx2jqguC.js.map +0 -1
package/dist/esm/index.js
CHANGED
|
@@ -292,6 +292,7 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
|
292
292
|
function normalizeExpectedOutcomeField(field) {
|
|
293
293
|
return {
|
|
294
294
|
...field,
|
|
295
|
+
evaluationSource: field.evaluationSource || { type: 'text' },
|
|
295
296
|
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
296
297
|
};
|
|
297
298
|
}
|
|
@@ -315,6 +316,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
315
316
|
type: 'text',
|
|
316
317
|
label: schemaField.label,
|
|
317
318
|
placeholder: schemaField.placeholder,
|
|
319
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
318
320
|
value: '',
|
|
319
321
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
320
322
|
};
|
|
@@ -323,6 +325,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
323
325
|
type: 'textarea',
|
|
324
326
|
label: schemaField.label,
|
|
325
327
|
placeholder: schemaField.placeholder,
|
|
328
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
326
329
|
rows: schemaField.rows,
|
|
327
330
|
value: '',
|
|
328
331
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
@@ -332,6 +335,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
332
335
|
type: 'chips-input',
|
|
333
336
|
label: schemaField.label,
|
|
334
337
|
placeholder: schemaField.placeholder,
|
|
338
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
335
339
|
value: [],
|
|
336
340
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
337
341
|
};
|
|
@@ -340,6 +344,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
340
344
|
type: 'select',
|
|
341
345
|
label: schemaField.label,
|
|
342
346
|
placeholder: schemaField.placeholder,
|
|
347
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
343
348
|
value: schemaField.options[0],
|
|
344
349
|
options: schemaField.options,
|
|
345
350
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
@@ -2569,6 +2574,122 @@ function handleIntersectionResults(result, left, right) {
|
|
|
2569
2574
|
result.value = merged.data;
|
|
2570
2575
|
return result;
|
|
2571
2576
|
}
|
|
2577
|
+
const $ZodRecord = /*@__PURE__*/ $constructor("$ZodRecord", (inst, def) => {
|
|
2578
|
+
$ZodType.init(inst, def);
|
|
2579
|
+
inst._zod.parse = (payload, ctx) => {
|
|
2580
|
+
const input = payload.value;
|
|
2581
|
+
if (!isPlainObject(input)) {
|
|
2582
|
+
payload.issues.push({
|
|
2583
|
+
expected: "record",
|
|
2584
|
+
code: "invalid_type",
|
|
2585
|
+
input,
|
|
2586
|
+
inst,
|
|
2587
|
+
});
|
|
2588
|
+
return payload;
|
|
2589
|
+
}
|
|
2590
|
+
const proms = [];
|
|
2591
|
+
const values = def.keyType._zod.values;
|
|
2592
|
+
if (values) {
|
|
2593
|
+
payload.value = {};
|
|
2594
|
+
const recordKeys = new Set();
|
|
2595
|
+
for (const key of values) {
|
|
2596
|
+
if (typeof key === "string" || typeof key === "number" || typeof key === "symbol") {
|
|
2597
|
+
recordKeys.add(typeof key === "number" ? key.toString() : key);
|
|
2598
|
+
const result = def.valueType._zod.run({ value: input[key], issues: [] }, ctx);
|
|
2599
|
+
if (result instanceof Promise) {
|
|
2600
|
+
proms.push(result.then((result) => {
|
|
2601
|
+
if (result.issues.length) {
|
|
2602
|
+
payload.issues.push(...prefixIssues(key, result.issues));
|
|
2603
|
+
}
|
|
2604
|
+
payload.value[key] = result.value;
|
|
2605
|
+
}));
|
|
2606
|
+
}
|
|
2607
|
+
else {
|
|
2608
|
+
if (result.issues.length) {
|
|
2609
|
+
payload.issues.push(...prefixIssues(key, result.issues));
|
|
2610
|
+
}
|
|
2611
|
+
payload.value[key] = result.value;
|
|
2612
|
+
}
|
|
2613
|
+
}
|
|
2614
|
+
}
|
|
2615
|
+
let unrecognized;
|
|
2616
|
+
for (const key in input) {
|
|
2617
|
+
if (!recordKeys.has(key)) {
|
|
2618
|
+
unrecognized = unrecognized ?? [];
|
|
2619
|
+
unrecognized.push(key);
|
|
2620
|
+
}
|
|
2621
|
+
}
|
|
2622
|
+
if (unrecognized && unrecognized.length > 0) {
|
|
2623
|
+
payload.issues.push({
|
|
2624
|
+
code: "unrecognized_keys",
|
|
2625
|
+
input,
|
|
2626
|
+
inst,
|
|
2627
|
+
keys: unrecognized,
|
|
2628
|
+
});
|
|
2629
|
+
}
|
|
2630
|
+
}
|
|
2631
|
+
else {
|
|
2632
|
+
payload.value = {};
|
|
2633
|
+
for (const key of Reflect.ownKeys(input)) {
|
|
2634
|
+
if (key === "__proto__")
|
|
2635
|
+
continue;
|
|
2636
|
+
let keyResult = def.keyType._zod.run({ value: key, issues: [] }, ctx);
|
|
2637
|
+
if (keyResult instanceof Promise) {
|
|
2638
|
+
throw new Error("Async schemas not supported in object keys currently");
|
|
2639
|
+
}
|
|
2640
|
+
// Numeric string fallback: if key is a numeric string and failed, retry with Number(key)
|
|
2641
|
+
// This handles z.number(), z.literal([1, 2, 3]), and unions containing numeric literals
|
|
2642
|
+
const checkNumericKey = typeof key === "string" && number$1.test(key) && keyResult.issues.length;
|
|
2643
|
+
if (checkNumericKey) {
|
|
2644
|
+
const retryResult = def.keyType._zod.run({ value: Number(key), issues: [] }, ctx);
|
|
2645
|
+
if (retryResult instanceof Promise) {
|
|
2646
|
+
throw new Error("Async schemas not supported in object keys currently");
|
|
2647
|
+
}
|
|
2648
|
+
if (retryResult.issues.length === 0) {
|
|
2649
|
+
keyResult = retryResult;
|
|
2650
|
+
}
|
|
2651
|
+
}
|
|
2652
|
+
if (keyResult.issues.length) {
|
|
2653
|
+
if (def.mode === "loose") {
|
|
2654
|
+
// Pass through unchanged
|
|
2655
|
+
payload.value[key] = input[key];
|
|
2656
|
+
}
|
|
2657
|
+
else {
|
|
2658
|
+
// Default "strict" behavior: error on invalid key
|
|
2659
|
+
payload.issues.push({
|
|
2660
|
+
code: "invalid_key",
|
|
2661
|
+
origin: "record",
|
|
2662
|
+
issues: keyResult.issues.map((iss) => finalizeIssue(iss, ctx, config())),
|
|
2663
|
+
input: key,
|
|
2664
|
+
path: [key],
|
|
2665
|
+
inst,
|
|
2666
|
+
});
|
|
2667
|
+
}
|
|
2668
|
+
continue;
|
|
2669
|
+
}
|
|
2670
|
+
const result = def.valueType._zod.run({ value: input[key], issues: [] }, ctx);
|
|
2671
|
+
if (result instanceof Promise) {
|
|
2672
|
+
proms.push(result.then((result) => {
|
|
2673
|
+
if (result.issues.length) {
|
|
2674
|
+
payload.issues.push(...prefixIssues(key, result.issues));
|
|
2675
|
+
}
|
|
2676
|
+
payload.value[keyResult.value] = result.value;
|
|
2677
|
+
}));
|
|
2678
|
+
}
|
|
2679
|
+
else {
|
|
2680
|
+
if (result.issues.length) {
|
|
2681
|
+
payload.issues.push(...prefixIssues(key, result.issues));
|
|
2682
|
+
}
|
|
2683
|
+
payload.value[keyResult.value] = result.value;
|
|
2684
|
+
}
|
|
2685
|
+
}
|
|
2686
|
+
}
|
|
2687
|
+
if (proms.length) {
|
|
2688
|
+
return Promise.all(proms).then(() => payload);
|
|
2689
|
+
}
|
|
2690
|
+
return payload;
|
|
2691
|
+
};
|
|
2692
|
+
});
|
|
2572
2693
|
const $ZodEnum = /*@__PURE__*/ $constructor("$ZodEnum", (inst, def) => {
|
|
2573
2694
|
$ZodType.init(inst, def);
|
|
2574
2695
|
const values = getEnumValues(def.entries);
|
|
@@ -4152,6 +4273,49 @@ const intersectionProcessor = (schema, ctx, json, params) => {
|
|
|
4152
4273
|
];
|
|
4153
4274
|
json.allOf = allOf;
|
|
4154
4275
|
};
|
|
4276
|
+
const recordProcessor = (schema, ctx, _json, params) => {
|
|
4277
|
+
const json = _json;
|
|
4278
|
+
const def = schema._zod.def;
|
|
4279
|
+
json.type = "object";
|
|
4280
|
+
// For looseRecord with regex patterns, use patternProperties
|
|
4281
|
+
// This correctly represents "only validate keys matching the pattern" semantics
|
|
4282
|
+
// and composes well with allOf (intersections)
|
|
4283
|
+
const keyType = def.keyType;
|
|
4284
|
+
const keyBag = keyType._zod.bag;
|
|
4285
|
+
const patterns = keyBag?.patterns;
|
|
4286
|
+
if (def.mode === "loose" && patterns && patterns.size > 0) {
|
|
4287
|
+
// Use patternProperties for looseRecord with regex patterns
|
|
4288
|
+
const valueSchema = process$1(def.valueType, ctx, {
|
|
4289
|
+
...params,
|
|
4290
|
+
path: [...params.path, "patternProperties", "*"],
|
|
4291
|
+
});
|
|
4292
|
+
json.patternProperties = {};
|
|
4293
|
+
for (const pattern of patterns) {
|
|
4294
|
+
json.patternProperties[pattern.source] = valueSchema;
|
|
4295
|
+
}
|
|
4296
|
+
}
|
|
4297
|
+
else {
|
|
4298
|
+
// Default behavior: use propertyNames + additionalProperties
|
|
4299
|
+
if (ctx.target === "draft-07" || ctx.target === "draft-2020-12") {
|
|
4300
|
+
json.propertyNames = process$1(def.keyType, ctx, {
|
|
4301
|
+
...params,
|
|
4302
|
+
path: [...params.path, "propertyNames"],
|
|
4303
|
+
});
|
|
4304
|
+
}
|
|
4305
|
+
json.additionalProperties = process$1(def.valueType, ctx, {
|
|
4306
|
+
...params,
|
|
4307
|
+
path: [...params.path, "additionalProperties"],
|
|
4308
|
+
});
|
|
4309
|
+
}
|
|
4310
|
+
// Add required for keys with discrete values (enum, literal, etc.)
|
|
4311
|
+
const keyValues = keyType._zod.values;
|
|
4312
|
+
if (keyValues) {
|
|
4313
|
+
const validKeyValues = [...keyValues].filter((v) => typeof v === "string" || typeof v === "number");
|
|
4314
|
+
if (validKeyValues.length > 0) {
|
|
4315
|
+
json.required = validKeyValues;
|
|
4316
|
+
}
|
|
4317
|
+
}
|
|
4318
|
+
};
|
|
4155
4319
|
const nullableProcessor = (schema, ctx, json, params) => {
|
|
4156
4320
|
const def = schema._zod.def;
|
|
4157
4321
|
const inner = process$1(def.innerType, ctx, params);
|
|
@@ -4706,6 +4870,21 @@ function intersection(left, right) {
|
|
|
4706
4870
|
right: right,
|
|
4707
4871
|
});
|
|
4708
4872
|
}
|
|
4873
|
+
const ZodRecord = /*@__PURE__*/ $constructor("ZodRecord", (inst, def) => {
|
|
4874
|
+
$ZodRecord.init(inst, def);
|
|
4875
|
+
ZodType.init(inst, def);
|
|
4876
|
+
inst._zod.processJSONSchema = (ctx, json, params) => recordProcessor(inst, ctx, json, params);
|
|
4877
|
+
inst.keyType = def.keyType;
|
|
4878
|
+
inst.valueType = def.valueType;
|
|
4879
|
+
});
|
|
4880
|
+
function record(keyType, valueType, params) {
|
|
4881
|
+
return new ZodRecord({
|
|
4882
|
+
type: "record",
|
|
4883
|
+
keyType,
|
|
4884
|
+
valueType: valueType,
|
|
4885
|
+
...normalizeParams(params),
|
|
4886
|
+
});
|
|
4887
|
+
}
|
|
4709
4888
|
const ZodEnum = /*@__PURE__*/ $constructor("ZodEnum", (inst, def) => {
|
|
4710
4889
|
$ZodEnum.init(inst, def);
|
|
4711
4890
|
ZodType.init(inst, def);
|
|
@@ -4943,7 +5122,7 @@ const ZodCustom = /*@__PURE__*/ $constructor("ZodCustom", (inst, def) => {
|
|
|
4943
5122
|
inst._zod.processJSONSchema = (ctx, json, params) => customProcessor(inst, ctx);
|
|
4944
5123
|
});
|
|
4945
5124
|
function custom(fn, _params) {
|
|
4946
|
-
return _custom(ZodCustom, (() => true), _params);
|
|
5125
|
+
return _custom(ZodCustom, fn ?? (() => true), _params);
|
|
4947
5126
|
}
|
|
4948
5127
|
function refine(fn, _params = {}) {
|
|
4949
5128
|
return _refine(ZodCustom, fn, _params);
|
|
@@ -4958,6 +5137,19 @@ const optionalPositiveInt = number().int().positive().optional();
|
|
|
4958
5137
|
const optionalString = string().optional();
|
|
4959
5138
|
const selectOptionsSchema = array(nonEmptyString).min(1);
|
|
4960
5139
|
const optionalNumber = number().optional();
|
|
5140
|
+
const textEvaluationSourceSchema = object({
|
|
5141
|
+
type: literal('text'),
|
|
5142
|
+
});
|
|
5143
|
+
const customEvaluationSourceSchema = object({
|
|
5144
|
+
type: literal('custom'),
|
|
5145
|
+
extractorId: nonEmptyString,
|
|
5146
|
+
});
|
|
5147
|
+
const evaluationSourceExtractorSchema = custom(value => typeof value === 'function', 'Extractor must be a function.');
|
|
5148
|
+
record(string().min(1), evaluationSourceExtractorSchema);
|
|
5149
|
+
const evaluationSourceSchema = discriminatedUnion('type', [
|
|
5150
|
+
textEvaluationSourceSchema,
|
|
5151
|
+
customEvaluationSourceSchema,
|
|
5152
|
+
]);
|
|
4961
5153
|
const expectedOutcomeModeSchema = _enum(['static', 'dynamic']);
|
|
4962
5154
|
const evaluationParametersSchema = object({
|
|
4963
5155
|
approach: _enum(EvaluationApproach),
|
|
@@ -4975,6 +5167,7 @@ const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine(
|
|
|
4975
5167
|
const defaultExpectedOutcomeBaseSchema = object({
|
|
4976
5168
|
label: nonEmptyString,
|
|
4977
5169
|
placeholder: optionalString,
|
|
5170
|
+
evaluationSource: evaluationSourceSchema.optional(),
|
|
4978
5171
|
});
|
|
4979
5172
|
const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
|
|
4980
5173
|
text: baseSchema.extend({
|
|
@@ -5067,6 +5260,37 @@ function validateExpectedOutcomeSchema(schema) {
|
|
|
5067
5260
|
throw new Error(`Invalid expectedOutcomeSchema: ${parsed.error.issues[0].message}`);
|
|
5068
5261
|
}
|
|
5069
5262
|
}
|
|
5263
|
+
function validateExpectedOutcomeArrayWithExtractors(expectedOutcome, allowedExtractorIds) {
|
|
5264
|
+
const allowed = new Set(allowedExtractorIds);
|
|
5265
|
+
const schema = expectedOutcomeArraySchema.superRefine((fields, ctx) => {
|
|
5266
|
+
fields.forEach((field, index) => {
|
|
5267
|
+
if (field.evaluationSource?.type !== 'custom') {
|
|
5268
|
+
return;
|
|
5269
|
+
}
|
|
5270
|
+
if (allowed.has(field.evaluationSource.extractorId)) {
|
|
5271
|
+
return;
|
|
5272
|
+
}
|
|
5273
|
+
ctx.addIssue({
|
|
5274
|
+
code: 'custom',
|
|
5275
|
+
path: [index, 'evaluationSource', 'extractorId'],
|
|
5276
|
+
message: `Invalid expectedOutcome: Extractor "${field.evaluationSource.extractorId}" is not registered.`,
|
|
5277
|
+
});
|
|
5278
|
+
});
|
|
5279
|
+
});
|
|
5280
|
+
const parsed = schema.safeParse(expectedOutcome);
|
|
5281
|
+
if (!parsed.success) {
|
|
5282
|
+
throw new Error(parsed.error.issues[0].message);
|
|
5283
|
+
}
|
|
5284
|
+
}
|
|
5285
|
+
function getExtractorIds(extractors) {
|
|
5286
|
+
return Object.keys(extractors || {});
|
|
5287
|
+
}
|
|
5288
|
+
|
|
5289
|
+
const modelResponseMetadataSchema = record(string(), unknown());
|
|
5290
|
+
const modelResponsePayloadSchema = object({
|
|
5291
|
+
text: string().optional(),
|
|
5292
|
+
metadata: modelResponseMetadataSchema.optional(),
|
|
5293
|
+
});
|
|
5070
5294
|
|
|
5071
5295
|
const testCaseChatHistorySchema = object({
|
|
5072
5296
|
enabled: boolean(),
|
|
@@ -5083,8 +5307,8 @@ object({
|
|
|
5083
5307
|
id: string(),
|
|
5084
5308
|
question: string(),
|
|
5085
5309
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5310
|
+
output: modelResponsePayloadSchema.optional(),
|
|
5086
5311
|
chatHistory: testCaseChatHistorySchema,
|
|
5087
|
-
output: string().optional(),
|
|
5088
5312
|
isRunning: boolean().optional(),
|
|
5089
5313
|
error: string().optional(),
|
|
5090
5314
|
evaluationResult: custom().optional(),
|
|
@@ -5106,10 +5330,15 @@ function validateTestCaseInputArray(data) {
|
|
|
5106
5330
|
* @param jsonContent - The JSON string to parse and validate
|
|
5107
5331
|
* @returns Validation result with test cases or error message
|
|
5108
5332
|
*/
|
|
5109
|
-
function importTestSuite(jsonContent) {
|
|
5333
|
+
function importTestSuite(jsonContent, allowedExtractorIds = []) {
|
|
5110
5334
|
try {
|
|
5111
5335
|
const parsed = JSON.parse(jsonContent);
|
|
5112
5336
|
validateTestCaseInputArray(parsed);
|
|
5337
|
+
if (allowedExtractorIds.length > 0) {
|
|
5338
|
+
parsed.forEach((testCase) => {
|
|
5339
|
+
validateExpectedOutcomeArrayWithExtractors(testCase.expectedOutcome, allowedExtractorIds);
|
|
5340
|
+
});
|
|
5341
|
+
}
|
|
5113
5342
|
const testCases = parsed.map((item, index) => {
|
|
5114
5343
|
try {
|
|
5115
5344
|
return createTestCaseFromInput(item);
|
|
@@ -5135,7 +5364,7 @@ function importTestSuite(jsonContent) {
|
|
|
5135
5364
|
}
|
|
5136
5365
|
|
|
5137
5366
|
const MISSING_RESOLVER_MESSAGE = 'resolveExpectedOutcome is required when a test case has dynamic expected outcomes.';
|
|
5138
|
-
function isDynamicTextareaField(field) {
|
|
5367
|
+
function isDynamicTextareaField$1(field) {
|
|
5139
5368
|
return field.type === 'textarea' && field.outcomeMode === 'dynamic';
|
|
5140
5369
|
}
|
|
5141
5370
|
function applyResolvedDynamicValues(testCase, resolvedValues) {
|
|
@@ -5145,7 +5374,7 @@ function applyResolvedDynamicValues(testCase, resolvedValues) {
|
|
|
5145
5374
|
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5146
5375
|
for (const resolved of resolvedValues) {
|
|
5147
5376
|
const field = expectedOutcome[resolved.index];
|
|
5148
|
-
if (!field || !isDynamicTextareaField(field)) {
|
|
5377
|
+
if (!field || !isDynamicTextareaField$1(field)) {
|
|
5149
5378
|
continue;
|
|
5150
5379
|
}
|
|
5151
5380
|
expectedOutcome[resolved.index] = {
|
|
@@ -5160,7 +5389,7 @@ function applyResolvedDynamicValues(testCase, resolvedValues) {
|
|
|
5160
5389
|
}
|
|
5161
5390
|
async function resolveDynamicExpectedOutcomes(testCase, resolver) {
|
|
5162
5391
|
const dynamicFields = (testCase.expectedOutcome || []).flatMap((field, index) => {
|
|
5163
|
-
if (!isDynamicTextareaField(field)) {
|
|
5392
|
+
if (!isDynamicTextareaField$1(field)) {
|
|
5164
5393
|
return [];
|
|
5165
5394
|
}
|
|
5166
5395
|
return [{ field, index }];
|
|
@@ -5178,6 +5407,15 @@ async function resolveDynamicExpectedOutcomes(testCase, resolver) {
|
|
|
5178
5407
|
return applyResolvedDynamicValues(testCase, resolvedValues);
|
|
5179
5408
|
}
|
|
5180
5409
|
|
|
5410
|
+
function isChipsInputField(field) {
|
|
5411
|
+
return field.type === 'chips-input';
|
|
5412
|
+
}
|
|
5413
|
+
function isTextareaField(field) {
|
|
5414
|
+
return field.type === 'textarea';
|
|
5415
|
+
}
|
|
5416
|
+
function isDynamicTextareaField(field) {
|
|
5417
|
+
return isTextareaField(field) && field.outcomeMode === 'dynamic';
|
|
5418
|
+
}
|
|
5181
5419
|
function applyExpectedOutcomeChange(testCase, change) {
|
|
5182
5420
|
const { index } = change;
|
|
5183
5421
|
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
@@ -5185,73 +5423,99 @@ function applyExpectedOutcomeChange(testCase, change) {
|
|
|
5185
5423
|
if (!target) {
|
|
5186
5424
|
return testCase;
|
|
5187
5425
|
}
|
|
5426
|
+
const commit = (updatedField) => {
|
|
5427
|
+
expectedOutcome[index] = updatedField;
|
|
5428
|
+
return { ...testCase, expectedOutcome };
|
|
5429
|
+
};
|
|
5188
5430
|
switch (change.operation) {
|
|
5189
5431
|
case 'set-value': {
|
|
5190
|
-
if (target
|
|
5432
|
+
if (isChipsInputField(target)) {
|
|
5191
5433
|
return testCase;
|
|
5192
5434
|
}
|
|
5193
|
-
if (target
|
|
5435
|
+
if (isDynamicTextareaField(target)) {
|
|
5194
5436
|
return testCase;
|
|
5195
5437
|
}
|
|
5196
|
-
|
|
5438
|
+
return commit({
|
|
5197
5439
|
...target,
|
|
5198
5440
|
value: change.value,
|
|
5199
|
-
};
|
|
5200
|
-
return { ...testCase, expectedOutcome };
|
|
5441
|
+
});
|
|
5201
5442
|
}
|
|
5202
5443
|
case 'add-chip': {
|
|
5203
|
-
if (target
|
|
5444
|
+
if (!isChipsInputField(target)) {
|
|
5204
5445
|
return testCase;
|
|
5205
5446
|
}
|
|
5206
|
-
|
|
5447
|
+
return commit({
|
|
5207
5448
|
...target,
|
|
5208
5449
|
value: [...target.value, change.value],
|
|
5209
|
-
};
|
|
5210
|
-
return { ...testCase, expectedOutcome };
|
|
5450
|
+
});
|
|
5211
5451
|
}
|
|
5212
5452
|
case 'remove-chip': {
|
|
5213
|
-
if (target
|
|
5453
|
+
if (!isChipsInputField(target)) {
|
|
5214
5454
|
return testCase;
|
|
5215
5455
|
}
|
|
5216
|
-
|
|
5456
|
+
return commit({
|
|
5217
5457
|
...target,
|
|
5218
5458
|
value: target.value.filter(chip => chip !== change.value),
|
|
5219
|
-
};
|
|
5220
|
-
return { ...testCase, expectedOutcome };
|
|
5459
|
+
});
|
|
5221
5460
|
}
|
|
5222
5461
|
case 'set-evaluation-approach':
|
|
5223
5462
|
return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
|
|
5224
5463
|
case 'set-outcome-mode': {
|
|
5225
|
-
if (target
|
|
5464
|
+
if (!isTextareaField(target)) {
|
|
5226
5465
|
return testCase;
|
|
5227
5466
|
}
|
|
5228
5467
|
const mode = change.value;
|
|
5229
5468
|
if (mode === 'static') {
|
|
5230
5469
|
const { resolutionQuery: _, ...rest } = target;
|
|
5231
|
-
|
|
5470
|
+
return commit({
|
|
5232
5471
|
...rest,
|
|
5233
5472
|
outcomeMode: 'static',
|
|
5234
5473
|
value: '',
|
|
5235
|
-
};
|
|
5474
|
+
});
|
|
5236
5475
|
}
|
|
5237
5476
|
else {
|
|
5238
|
-
|
|
5477
|
+
return commit({
|
|
5239
5478
|
...target,
|
|
5240
5479
|
outcomeMode: 'dynamic',
|
|
5241
5480
|
value: '',
|
|
5242
|
-
};
|
|
5481
|
+
});
|
|
5243
5482
|
}
|
|
5244
|
-
return { ...testCase, expectedOutcome };
|
|
5245
5483
|
}
|
|
5246
5484
|
case 'set-resolution-query': {
|
|
5247
|
-
if (target
|
|
5485
|
+
if (!isDynamicTextareaField(target)) {
|
|
5248
5486
|
return testCase;
|
|
5249
5487
|
}
|
|
5250
|
-
|
|
5488
|
+
return commit({
|
|
5251
5489
|
...target,
|
|
5252
5490
|
resolutionQuery: change.value,
|
|
5253
|
-
};
|
|
5254
|
-
|
|
5491
|
+
});
|
|
5492
|
+
}
|
|
5493
|
+
case 'set-evaluation-source-type': {
|
|
5494
|
+
if (change.value === 'text') {
|
|
5495
|
+
return commit({
|
|
5496
|
+
...target,
|
|
5497
|
+
evaluationSource: { type: 'text' },
|
|
5498
|
+
});
|
|
5499
|
+
}
|
|
5500
|
+
const extractorId = target.evaluationSource?.type === 'custom'
|
|
5501
|
+
? target.evaluationSource.extractorId
|
|
5502
|
+
: (change.fallbackExtractorId ?? '');
|
|
5503
|
+
return commit({
|
|
5504
|
+
...target,
|
|
5505
|
+
evaluationSource: {
|
|
5506
|
+
type: 'custom',
|
|
5507
|
+
extractorId,
|
|
5508
|
+
},
|
|
5509
|
+
});
|
|
5510
|
+
}
|
|
5511
|
+
case 'set-evaluation-source-extractor': {
|
|
5512
|
+
return commit({
|
|
5513
|
+
...target,
|
|
5514
|
+
evaluationSource: {
|
|
5515
|
+
type: 'custom',
|
|
5516
|
+
extractorId: change.value,
|
|
5517
|
+
},
|
|
5518
|
+
});
|
|
5255
5519
|
}
|
|
5256
5520
|
}
|
|
5257
5521
|
}
|
|
@@ -30032,7 +30296,7 @@ class LLMEvaluationEngine {
|
|
|
30032
30296
|
const fieldRequest = {
|
|
30033
30297
|
testCaseId: request.testCaseId,
|
|
30034
30298
|
question: request.question,
|
|
30035
|
-
actualResponse:
|
|
30299
|
+
actualResponse: field.actualResponse,
|
|
30036
30300
|
expectedOutcome: field.expectedValue,
|
|
30037
30301
|
evaluationParameters: field.evaluationParameters,
|
|
30038
30302
|
};
|
|
@@ -30102,6 +30366,58 @@ class LLMEvaluationEngine {
|
|
|
30102
30366
|
}
|
|
30103
30367
|
}
|
|
30104
30368
|
|
|
30369
|
+
function toTextSource() {
|
|
30370
|
+
return { type: 'text' };
|
|
30371
|
+
}
|
|
30372
|
+
async function resolveActualValue(field, output, extractors) {
|
|
30373
|
+
const source = field.evaluationSource || toTextSource();
|
|
30374
|
+
if (source.type === 'text') {
|
|
30375
|
+
const text = output?.text?.trim();
|
|
30376
|
+
if (!text) {
|
|
30377
|
+
return {
|
|
30378
|
+
success: false,
|
|
30379
|
+
error: 'Model response text is empty.',
|
|
30380
|
+
};
|
|
30381
|
+
}
|
|
30382
|
+
return { success: true, value: text };
|
|
30383
|
+
}
|
|
30384
|
+
const extractor = extractors?.[source.extractorId];
|
|
30385
|
+
if (!extractor) {
|
|
30386
|
+
return {
|
|
30387
|
+
success: false,
|
|
30388
|
+
error: `Extractor "${source.extractorId}" is not registered.`,
|
|
30389
|
+
};
|
|
30390
|
+
}
|
|
30391
|
+
try {
|
|
30392
|
+
const extractedRaw = await extractor(output || {});
|
|
30393
|
+
if (typeof extractedRaw !== 'string') {
|
|
30394
|
+
return {
|
|
30395
|
+
success: false,
|
|
30396
|
+
error: `Extractor "${source.extractorId}" must return a string.`,
|
|
30397
|
+
};
|
|
30398
|
+
}
|
|
30399
|
+
const extracted = extractedRaw.trim();
|
|
30400
|
+
if (!extracted) {
|
|
30401
|
+
return {
|
|
30402
|
+
success: false,
|
|
30403
|
+
error: `Extractor "${source.extractorId}" returned an empty value.`,
|
|
30404
|
+
};
|
|
30405
|
+
}
|
|
30406
|
+
return {
|
|
30407
|
+
success: true,
|
|
30408
|
+
value: extracted,
|
|
30409
|
+
};
|
|
30410
|
+
}
|
|
30411
|
+
catch (error) {
|
|
30412
|
+
return {
|
|
30413
|
+
success: false,
|
|
30414
|
+
error: error instanceof Error
|
|
30415
|
+
? error.message
|
|
30416
|
+
: `Extractor "${source.extractorId}" failed.`,
|
|
30417
|
+
};
|
|
30418
|
+
}
|
|
30419
|
+
}
|
|
30420
|
+
|
|
30105
30421
|
/**
|
|
30106
30422
|
* Service for evaluating test case responses
|
|
30107
30423
|
*/
|
|
@@ -30115,34 +30431,71 @@ class EvaluationService {
|
|
|
30115
30431
|
* @param testCase - The test case to evaluate
|
|
30116
30432
|
* @param onResult - Callback to handle the evaluation result
|
|
30117
30433
|
*/
|
|
30118
|
-
async evaluateTestCase(testCase, onResult) {
|
|
30119
|
-
|
|
30120
|
-
|
|
30121
|
-
|
|
30122
|
-
}
|
|
30123
|
-
const fields = (testCase.expectedOutcome || []).flatMap((field, index) => {
|
|
30434
|
+
async evaluateTestCase(testCase, onResult, extractors) {
|
|
30435
|
+
const fields = [];
|
|
30436
|
+
const failedFields = [];
|
|
30437
|
+
for (const [index, field] of (testCase.expectedOutcome || []).entries()) {
|
|
30124
30438
|
if (field.type === 'textarea' && field.outcomeMode === 'dynamic') {
|
|
30125
|
-
|
|
30439
|
+
continue;
|
|
30126
30440
|
}
|
|
30127
|
-
|
|
30128
|
-
|
|
30441
|
+
const evaluationParameters = normalizeEvaluationParametersForField(field.type, field.evaluationParameters);
|
|
30442
|
+
const expectedValue = getFieldExpectedValue(field);
|
|
30443
|
+
const resolvedActualValue = await resolveActualValue(field, testCase.output, extractors);
|
|
30444
|
+
if (resolvedActualValue.success) {
|
|
30445
|
+
fields.push({
|
|
30129
30446
|
index,
|
|
30130
30447
|
label: field.label,
|
|
30131
30448
|
type: field.type,
|
|
30132
|
-
expectedValue
|
|
30133
|
-
|
|
30134
|
-
|
|
30135
|
-
|
|
30136
|
-
|
|
30449
|
+
expectedValue,
|
|
30450
|
+
actualResponse: resolvedActualValue.value,
|
|
30451
|
+
evaluationParameters,
|
|
30452
|
+
});
|
|
30453
|
+
}
|
|
30454
|
+
else {
|
|
30455
|
+
failedFields.push({
|
|
30456
|
+
index,
|
|
30457
|
+
label: field.label,
|
|
30458
|
+
type: field.type,
|
|
30459
|
+
expectedValue,
|
|
30460
|
+
passed: false,
|
|
30461
|
+
keywordMatches: [],
|
|
30462
|
+
evaluationParameters,
|
|
30463
|
+
evaluationApproachResult: {
|
|
30464
|
+
score: 0,
|
|
30465
|
+
approachUsed: evaluationParameters.approach,
|
|
30466
|
+
},
|
|
30467
|
+
error: 'error' in resolvedActualValue
|
|
30468
|
+
? resolvedActualValue.error
|
|
30469
|
+
: 'Failed to resolve actual value.',
|
|
30470
|
+
});
|
|
30471
|
+
}
|
|
30472
|
+
}
|
|
30473
|
+
if (fields.length === 0) {
|
|
30474
|
+
if (failedFields.length === 0) {
|
|
30475
|
+
console.warn('⚠️ No evaluable fields for test case:', testCase.id);
|
|
30476
|
+
return;
|
|
30477
|
+
}
|
|
30478
|
+
onResult({
|
|
30479
|
+
testCaseId: testCase.id,
|
|
30480
|
+
passed: false,
|
|
30481
|
+
keywordMatches: [],
|
|
30482
|
+
fieldResults: failedFields,
|
|
30483
|
+
timestamp: new Date().toISOString(),
|
|
30484
|
+
});
|
|
30485
|
+
return;
|
|
30486
|
+
}
|
|
30137
30487
|
const evaluationRequest = {
|
|
30138
30488
|
testCaseId: testCase.id,
|
|
30139
30489
|
question: testCase.question,
|
|
30140
|
-
actualResponse: testCase.output,
|
|
30141
30490
|
fields,
|
|
30142
30491
|
};
|
|
30143
30492
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
30144
|
-
|
|
30145
|
-
onResult(
|
|
30493
|
+
const combinedResults = [...(result.fieldResults || []), ...failedFields].sort((a, b) => a.index - b.index);
|
|
30494
|
+
onResult({
|
|
30495
|
+
...result,
|
|
30496
|
+
passed: combinedResults.every(field => field.passed && !field.error),
|
|
30497
|
+
fieldResults: combinedResults,
|
|
30498
|
+
});
|
|
30146
30499
|
});
|
|
30147
30500
|
}
|
|
30148
30501
|
}
|
|
@@ -30190,7 +30543,7 @@ const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isR
|
|
|
30190
30543
|
};
|
|
30191
30544
|
|
|
30192
30545
|
const ResponseOutput = ({ output, isRunning, }) => {
|
|
30193
|
-
return (h("div", { class: "response-output" }, output ? (h("div", { class: "response-output__content" }, output)) : (h("div", { class: "response-output__placeholder" }, isRunning ? 'Running...' : ''))));
|
|
30546
|
+
return (h("div", { class: "response-output" }, output?.text ? (h("div", { class: "response-output__content" }, output.text)) : (h("div", { class: "response-output__placeholder" }, isRunning ? 'Running...' : ''))));
|
|
30194
30547
|
};
|
|
30195
30548
|
|
|
30196
30549
|
const EvaluationSummary = ({ result, isRunning, }) => {
|
|
@@ -30228,7 +30581,9 @@ var FormFieldType;
|
|
|
30228
30581
|
FormFieldType["SELECT"] = "select";
|
|
30229
30582
|
})(FormFieldType || (FormFieldType = {}));
|
|
30230
30583
|
|
|
30231
|
-
const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupported = false, onExpectedOutcomeChange, }) => {
|
|
30584
|
+
const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupported = false, extractorIds = [], onExpectedOutcomeChange, }) => {
|
|
30585
|
+
const hasExtractorOptions = extractorIds.length > 0;
|
|
30586
|
+
const firstExtractorId = extractorIds[0];
|
|
30232
30587
|
const emit = (detail) => onExpectedOutcomeChange({
|
|
30233
30588
|
detail,
|
|
30234
30589
|
});
|
|
@@ -30258,6 +30613,23 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30258
30613
|
required: false,
|
|
30259
30614
|
rows: 2,
|
|
30260
30615
|
});
|
|
30616
|
+
const buildEvaluationSourceConfig = (index) => ({
|
|
30617
|
+
name: `expectedOutcomeEvaluationSource-${index}`,
|
|
30618
|
+
fieldType: FormFieldType.SELECT,
|
|
30619
|
+
label: 'Evaluation Source',
|
|
30620
|
+
placeholder: 'Select evaluation source',
|
|
30621
|
+
required: true,
|
|
30622
|
+
optionList: ['text', 'custom'],
|
|
30623
|
+
defaultValue: 'text',
|
|
30624
|
+
});
|
|
30625
|
+
const buildExtractorConfig = (index) => ({
|
|
30626
|
+
name: `expectedOutcomeEvaluationSourceExtractor-${index}`,
|
|
30627
|
+
fieldType: FormFieldType.SELECT,
|
|
30628
|
+
label: 'Extractor',
|
|
30629
|
+
placeholder: 'Select extractor',
|
|
30630
|
+
required: true,
|
|
30631
|
+
optionList: extractorIds,
|
|
30632
|
+
});
|
|
30261
30633
|
const renderEvaluationSelector = (field, index) => {
|
|
30262
30634
|
const optionList = getAllowedApproachesForFieldType(field.type);
|
|
30263
30635
|
return (h("app-select", { config: buildEvaluationConfig(index, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
|
|
@@ -30267,6 +30639,27 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30267
30639
|
value: e.detail.value,
|
|
30268
30640
|
}) }));
|
|
30269
30641
|
};
|
|
30642
|
+
const renderEvaluationSourceSelector = (field, index) => {
|
|
30643
|
+
if (!hasExtractorOptions) {
|
|
30644
|
+
return null;
|
|
30645
|
+
}
|
|
30646
|
+
const sourceType = field.evaluationSource?.type || 'text';
|
|
30647
|
+
return (h("div", null, h("app-select", { config: buildEvaluationSourceConfig(index), value: sourceType, onValueChange: (e) => emit({
|
|
30648
|
+
testCaseId,
|
|
30649
|
+
index,
|
|
30650
|
+
operation: 'set-evaluation-source-type',
|
|
30651
|
+
value: e.detail.value,
|
|
30652
|
+
fallbackExtractorId: firstExtractorId,
|
|
30653
|
+
}) }), sourceType === 'custom' && (h("app-select", { config: buildExtractorConfig(index), value: field.evaluationSource?.type === 'custom'
|
|
30654
|
+
? field.evaluationSource.extractorId
|
|
30655
|
+
: '', onValueChange: (e) => emit({
|
|
30656
|
+
testCaseId,
|
|
30657
|
+
index,
|
|
30658
|
+
operation: 'set-evaluation-source-extractor',
|
|
30659
|
+
value: e.detail.value,
|
|
30660
|
+
}) }))));
|
|
30661
|
+
};
|
|
30662
|
+
const renderEvaluationOptions = (field, index) => (h("details", { class: "expected-outcome-renderer__options" }, h("summary", { class: "expected-outcome-renderer__options-summary" }, "More options"), h("div", { class: "expected-outcome-renderer__options-content" }, renderEvaluationSelector(field, index), renderEvaluationSourceSelector(field, index))));
|
|
30270
30663
|
return (h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index) => {
|
|
30271
30664
|
if (field.type === 'textarea') {
|
|
30272
30665
|
const isDynamic = dynamicResolutionSupported && field.outcomeMode === 'dynamic';
|
|
@@ -30298,7 +30691,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30298
30691
|
index,
|
|
30299
30692
|
operation: 'set-resolution-query',
|
|
30300
30693
|
value: e.detail.value,
|
|
30301
|
-
}) })), !isDynamic &&
|
|
30694
|
+
}) })), !isDynamic && renderEvaluationOptions(field, index)));
|
|
30302
30695
|
}
|
|
30303
30696
|
if (field.type === 'chips-input') {
|
|
30304
30697
|
const config = {
|
|
@@ -30318,7 +30711,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30318
30711
|
index,
|
|
30319
30712
|
operation: 'remove-chip',
|
|
30320
30713
|
value: e.detail.value,
|
|
30321
|
-
}) }),
|
|
30714
|
+
}) }), renderEvaluationOptions(field, index)));
|
|
30322
30715
|
}
|
|
30323
30716
|
if (field.type === 'select') {
|
|
30324
30717
|
const config = {
|
|
@@ -30334,18 +30727,18 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30334
30727
|
index,
|
|
30335
30728
|
operation: 'set-value',
|
|
30336
30729
|
value: e.detail.value,
|
|
30337
|
-
}) }),
|
|
30730
|
+
}) }), renderEvaluationOptions(field, index)));
|
|
30338
30731
|
}
|
|
30339
30732
|
return (h("div", { class: "expected-outcome-renderer__group" }, h("div", { class: "expected-outcome-renderer__text" }, h("label", null, field.label), h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30340
30733
|
testCaseId,
|
|
30341
30734
|
index,
|
|
30342
30735
|
operation: 'set-value',
|
|
30343
30736
|
value: e.target.value,
|
|
30344
|
-
}) })),
|
|
30737
|
+
}) })), renderEvaluationOptions(field, index)));
|
|
30345
30738
|
})));
|
|
30346
30739
|
};
|
|
30347
30740
|
|
|
30348
|
-
const LLMTestCaseRow = ({ testCase, dynamicResolutionSupported = false, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
30741
|
+
const LLMTestCaseRow = ({ testCase, dynamicResolutionSupported = false, extractorIds = [], onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
30349
30742
|
const questionConfig = {
|
|
30350
30743
|
name: 'question',
|
|
30351
30744
|
fieldType: FormFieldType.TEXT_AREA,
|
|
@@ -30371,11 +30764,11 @@ const LLMTestCaseRow = ({ testCase, dynamicResolutionSupported = false, onRun, o
|
|
|
30371
30764
|
value,
|
|
30372
30765
|
},
|
|
30373
30766
|
});
|
|
30374
|
-
} }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], dynamicResolutionSupported: dynamicResolutionSupported, onExpectedOutcomeChange: onExpectedOutcomeChange })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
30767
|
+
} }), h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], dynamicResolutionSupported: dynamicResolutionSupported, extractorIds: extractorIds, onExpectedOutcomeChange: onExpectedOutcomeChange })), h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
30375
30768
|
};
|
|
30376
30769
|
|
|
30377
|
-
const LLMTestCases = ({ testCases, dynamicResolutionSupported = false, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
30378
|
-
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, dynamicResolutionSupported: dynamicResolutionSupported, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange, onChatHistoryChange: onChatHistoryChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
30770
|
+
const LLMTestCases = ({ testCases, dynamicResolutionSupported = false, extractorIds = [], onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
30771
|
+
return (h("div", { class: "test-cases" }, h("div", { class: "test-cases__column-headers" }, h("div", { class: "test-cases__column-header" }, "Input"), h("div", { class: "test-cases__column-header" }, "Output"), h("div", { class: "test-cases__column-header" }, "Evaluation"), h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (h(LLMTestCaseRow, { testCase: testCase, dynamicResolutionSupported: dynamicResolutionSupported, extractorIds: extractorIds, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange, onChatHistoryChange: onChatHistoryChange }))), h("div", { class: "test-cases__add-section" }, h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
30379
30772
|
};
|
|
30380
30773
|
|
|
30381
30774
|
const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
|
|
@@ -30386,7 +30779,7 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
|
|
|
30386
30779
|
|
|
30387
30780
|
const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
|
|
30388
30781
|
|
|
30389
|
-
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30782
|
+
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}.expected-outcome-renderer__options{border:var(--border-width) solid var(--border);border-radius:var(--radius-sm);background:var(--muted)}.expected-outcome-renderer__options-summary{cursor:pointer;font-size:var(--font-size-sm);color:var(--foreground);padding:var(--spacing-2) var(--spacing-3);user-select:none}.expected-outcome-renderer__options-content{display:flex;flex-direction:column;gap:var(--spacing-2);padding:0 var(--spacing-3) var(--spacing-3)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30390
30783
|
|
|
30391
30784
|
const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
|
|
30392
30785
|
|
|
@@ -30412,6 +30805,7 @@ const LLMTestRunner = class {
|
|
|
30412
30805
|
useSave = false;
|
|
30413
30806
|
usePromptEditor = false;
|
|
30414
30807
|
resolveExpectedOutcome;
|
|
30808
|
+
evaluationSourceExtractors;
|
|
30415
30809
|
initialTestCases;
|
|
30416
30810
|
defaultExpectedOutcomeSchema;
|
|
30417
30811
|
testCases = [
|
|
@@ -30448,6 +30842,12 @@ const LLMTestRunner = class {
|
|
|
30448
30842
|
// Initialize testCases from prop if provided
|
|
30449
30843
|
if (this.initialTestCases !== undefined) {
|
|
30450
30844
|
validateTestCaseInputArray(this.initialTestCases);
|
|
30845
|
+
const extractorIds = getExtractorIds(this.evaluationSourceExtractors);
|
|
30846
|
+
if (extractorIds.length > 0) {
|
|
30847
|
+
this.initialTestCases.forEach(testCase => {
|
|
30848
|
+
validateExpectedOutcomeArrayWithExtractors(testCase.expectedOutcome, extractorIds);
|
|
30849
|
+
});
|
|
30850
|
+
}
|
|
30451
30851
|
this.testCases = this.initialTestCases.map((rawTestCase, index) => {
|
|
30452
30852
|
try {
|
|
30453
30853
|
return createTestCaseFromInput(rawTestCase);
|
|
@@ -30471,8 +30871,6 @@ const LLMTestRunner = class {
|
|
|
30471
30871
|
this.testCases = [];
|
|
30472
30872
|
}
|
|
30473
30873
|
}
|
|
30474
|
-
componentDidLoad() { }
|
|
30475
|
-
disconnectedCallback() { }
|
|
30476
30874
|
async resetSavingState() {
|
|
30477
30875
|
this.isSaving = false;
|
|
30478
30876
|
}
|
|
@@ -30505,7 +30903,7 @@ const LLMTestRunner = class {
|
|
|
30505
30903
|
updateTestCase(id, updates) {
|
|
30506
30904
|
this.testCases = this.testCases.map(tc => tc.id === id ? { ...tc, ...updates } : tc);
|
|
30507
30905
|
}
|
|
30508
|
-
|
|
30906
|
+
requestLlmResponse(testCase) {
|
|
30509
30907
|
return new Promise((resolve, reject) => {
|
|
30510
30908
|
const payload = {
|
|
30511
30909
|
prompt: testCase.question,
|
|
@@ -30528,14 +30926,14 @@ const LLMTestRunner = class {
|
|
|
30528
30926
|
const startTime = Date.now();
|
|
30529
30927
|
this.updateTestCase(testCase.id, { isRunning: true });
|
|
30530
30928
|
const [llmSettled, resolutionSettled] = await Promise.allSettled([
|
|
30531
|
-
this.
|
|
30929
|
+
this.requestLlmResponse(testCase),
|
|
30532
30930
|
resolveDynamicExpectedOutcomes(testCase, this.resolveExpectedOutcome),
|
|
30533
30931
|
]);
|
|
30534
30932
|
const responseTime = Date.now() - startTime;
|
|
30535
30933
|
if (llmSettled.status === 'rejected') {
|
|
30536
30934
|
this.updateTestCase(testCase.id, {
|
|
30537
30935
|
isRunning: false,
|
|
30538
|
-
output:
|
|
30936
|
+
output: undefined,
|
|
30539
30937
|
error: this.addErrorMessage(llmSettled.reason, 'Unknown error'),
|
|
30540
30938
|
responseTime,
|
|
30541
30939
|
});
|
|
@@ -30583,7 +30981,7 @@ const LLMTestRunner = class {
|
|
|
30583
30981
|
this.updateTestCase(testCase.id, {
|
|
30584
30982
|
evaluationResult: result,
|
|
30585
30983
|
});
|
|
30586
|
-
});
|
|
30984
|
+
}, this.evaluationSourceExtractors);
|
|
30587
30985
|
}
|
|
30588
30986
|
async runAllTests() {
|
|
30589
30987
|
this.isRunningAll = true;
|
|
@@ -30614,7 +31012,7 @@ const LLMTestRunner = class {
|
|
|
30614
31012
|
this.error = '';
|
|
30615
31013
|
try {
|
|
30616
31014
|
const content = await readFileAsync(file);
|
|
30617
|
-
const result = importTestSuite(content);
|
|
31015
|
+
const result = importTestSuite(content, getExtractorIds(this.evaluationSourceExtractors));
|
|
30618
31016
|
if (!result.success) {
|
|
30619
31017
|
this.error = result.error || 'Unknown error occurred during import.';
|
|
30620
31018
|
return;
|
|
@@ -30675,7 +31073,7 @@ const LLMTestRunner = class {
|
|
|
30675
31073
|
}
|
|
30676
31074
|
}
|
|
30677
31075
|
render() {
|
|
30678
|
-
return (h("div", { key: '
|
|
31076
|
+
return (h("div", { key: '7433beaa1d60d48f65600c43e11b302b892a7bca', class: "test-runner-container" }, h(LLMTestRunnerHeader, { key: '8083cc39376e7a710bd3f52efb184b959e885a87', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, usePromptEditor: this.usePromptEditor, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), h(ErrorMessage, { key: 'ddced98c13cd595c4cfb6eef11b27cb173769518', message: this.error, onClear: () => (this.error = '') }), h("div", { key: '8d6f65c4d68d34869b644709eacb97fec93683c6', class: "test-runner-container__content" }, h(LLMTestCases, { key: '5ccb186132b23af6209209b0a14086e03cf790af', testCases: this.testCases, dynamicResolutionSupported: !!this.resolveExpectedOutcome, extractorIds: getExtractorIds(this.evaluationSourceExtractors), onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange, onChatHistoryChange: this.handleChatHistoryChange }))));
|
|
30679
31077
|
}
|
|
30680
31078
|
};
|
|
30681
31079
|
LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));
|