llm-testrunner-components 1.2.4 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/dist/cjs/app-chips_5.cjs.entry.js +1 -1
- package/dist/cjs/app-chips_5.cjs.entry.js.map +1 -1
- package/dist/cjs/index.cjs.js +464 -66
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-testrunner.cjs.js +1 -1
- package/dist/cjs/loader.cjs.js +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +46 -13
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/chat-history.css +5 -5
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +45 -5
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +21 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.js +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.js.map +1 -1
- package/dist/collection/lib/evaluation/actual-value-resolver.js +52 -0
- package/dist/collection/lib/evaluation/actual-value-resolver.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluation-engine.js +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +55 -17
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-importer.js +7 -1
- package/dist/collection/lib/import-export/test-suite-importer.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +5 -0
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +58 -23
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +39 -0
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/model-response.js +7 -0
- package/dist/collection/schemas/model-response.js.map +1 -0
- package/dist/collection/schemas/test-case.js +2 -1
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/expected-outcome.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/components/chat-history.js +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-kmtfMXcQ.js +2 -0
- package/dist/components/p-kmtfMXcQ.js.map +1 -0
- package/dist/components/{p-B87Lt3z4.js → p-wzA48RFK.js} +3 -3
- package/dist/components/p-wzA48RFK.js.map +1 -0
- package/dist/esm/app-chips_5.entry.js +1 -1
- package/dist/esm/app-chips_5.entry.js.map +1 -1
- package/dist/esm/index.js +464 -66
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-testrunner.js +1 -1
- package/dist/esm/loader.js +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/llm-testrunner/{p-21202f12.entry.js → p-5bf1fc78.entry.js} +2 -2
- package/dist/llm-testrunner/{p-21202f12.entry.js.map → p-5bf1fc78.entry.js.map} +1 -1
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +3 -4
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/output/response-output.d.ts +2 -1
- package/dist/types/components.d.ts +4 -2
- package/dist/types/lib/evaluation/actual-value-resolver.d.ts +9 -0
- package/dist/types/lib/evaluation/evaluation-service.d.ts +2 -2
- package/dist/types/lib/evaluation/types.d.ts +1 -1
- package/dist/types/lib/import-export/test-suite-importer.d.ts +1 -1
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +10 -1
- package/dist/types/schemas/expected-outcome.d.ts +116 -0
- package/dist/types/schemas/model-response.d.ts +7 -0
- package/dist/types/schemas/test-case.d.ts +76 -1
- package/dist/types/types/expected-outcome.d.ts +1 -1
- package/dist/types/types/llm-test-runner.d.ts +4 -2
- package/package.json +1 -1
- package/dist/components/p-B87Lt3z4.js.map +0 -1
- package/dist/components/p-Bx2jqguC.js +0 -2
- package/dist/components/p-Bx2jqguC.js.map +0 -1
package/dist/cjs/index.cjs.js
CHANGED
|
@@ -295,6 +295,7 @@ const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
|
295
295
|
function normalizeExpectedOutcomeField(field) {
|
|
296
296
|
return {
|
|
297
297
|
...field,
|
|
298
|
+
evaluationSource: field.evaluationSource || { type: 'text' },
|
|
298
299
|
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
299
300
|
};
|
|
300
301
|
}
|
|
@@ -318,6 +319,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
318
319
|
type: 'text',
|
|
319
320
|
label: schemaField.label,
|
|
320
321
|
placeholder: schemaField.placeholder,
|
|
322
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
321
323
|
value: '',
|
|
322
324
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
323
325
|
};
|
|
@@ -326,6 +328,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
326
328
|
type: 'textarea',
|
|
327
329
|
label: schemaField.label,
|
|
328
330
|
placeholder: schemaField.placeholder,
|
|
331
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
329
332
|
rows: schemaField.rows,
|
|
330
333
|
value: '',
|
|
331
334
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
@@ -335,6 +338,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
335
338
|
type: 'chips-input',
|
|
336
339
|
label: schemaField.label,
|
|
337
340
|
placeholder: schemaField.placeholder,
|
|
341
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
338
342
|
value: [],
|
|
339
343
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
340
344
|
};
|
|
@@ -343,6 +347,7 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
343
347
|
type: 'select',
|
|
344
348
|
label: schemaField.label,
|
|
345
349
|
placeholder: schemaField.placeholder,
|
|
350
|
+
evaluationSource: schemaField.evaluationSource || { type: 'text' },
|
|
346
351
|
value: schemaField.options[0],
|
|
347
352
|
options: schemaField.options,
|
|
348
353
|
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
@@ -2572,6 +2577,122 @@ function handleIntersectionResults(result, left, right) {
|
|
|
2572
2577
|
result.value = merged.data;
|
|
2573
2578
|
return result;
|
|
2574
2579
|
}
|
|
2580
|
+
const $ZodRecord = /*@__PURE__*/ $constructor("$ZodRecord", (inst, def) => {
|
|
2581
|
+
$ZodType.init(inst, def);
|
|
2582
|
+
inst._zod.parse = (payload, ctx) => {
|
|
2583
|
+
const input = payload.value;
|
|
2584
|
+
if (!isPlainObject(input)) {
|
|
2585
|
+
payload.issues.push({
|
|
2586
|
+
expected: "record",
|
|
2587
|
+
code: "invalid_type",
|
|
2588
|
+
input,
|
|
2589
|
+
inst,
|
|
2590
|
+
});
|
|
2591
|
+
return payload;
|
|
2592
|
+
}
|
|
2593
|
+
const proms = [];
|
|
2594
|
+
const values = def.keyType._zod.values;
|
|
2595
|
+
if (values) {
|
|
2596
|
+
payload.value = {};
|
|
2597
|
+
const recordKeys = new Set();
|
|
2598
|
+
for (const key of values) {
|
|
2599
|
+
if (typeof key === "string" || typeof key === "number" || typeof key === "symbol") {
|
|
2600
|
+
recordKeys.add(typeof key === "number" ? key.toString() : key);
|
|
2601
|
+
const result = def.valueType._zod.run({ value: input[key], issues: [] }, ctx);
|
|
2602
|
+
if (result instanceof Promise) {
|
|
2603
|
+
proms.push(result.then((result) => {
|
|
2604
|
+
if (result.issues.length) {
|
|
2605
|
+
payload.issues.push(...prefixIssues(key, result.issues));
|
|
2606
|
+
}
|
|
2607
|
+
payload.value[key] = result.value;
|
|
2608
|
+
}));
|
|
2609
|
+
}
|
|
2610
|
+
else {
|
|
2611
|
+
if (result.issues.length) {
|
|
2612
|
+
payload.issues.push(...prefixIssues(key, result.issues));
|
|
2613
|
+
}
|
|
2614
|
+
payload.value[key] = result.value;
|
|
2615
|
+
}
|
|
2616
|
+
}
|
|
2617
|
+
}
|
|
2618
|
+
let unrecognized;
|
|
2619
|
+
for (const key in input) {
|
|
2620
|
+
if (!recordKeys.has(key)) {
|
|
2621
|
+
unrecognized = unrecognized ?? [];
|
|
2622
|
+
unrecognized.push(key);
|
|
2623
|
+
}
|
|
2624
|
+
}
|
|
2625
|
+
if (unrecognized && unrecognized.length > 0) {
|
|
2626
|
+
payload.issues.push({
|
|
2627
|
+
code: "unrecognized_keys",
|
|
2628
|
+
input,
|
|
2629
|
+
inst,
|
|
2630
|
+
keys: unrecognized,
|
|
2631
|
+
});
|
|
2632
|
+
}
|
|
2633
|
+
}
|
|
2634
|
+
else {
|
|
2635
|
+
payload.value = {};
|
|
2636
|
+
for (const key of Reflect.ownKeys(input)) {
|
|
2637
|
+
if (key === "__proto__")
|
|
2638
|
+
continue;
|
|
2639
|
+
let keyResult = def.keyType._zod.run({ value: key, issues: [] }, ctx);
|
|
2640
|
+
if (keyResult instanceof Promise) {
|
|
2641
|
+
throw new Error("Async schemas not supported in object keys currently");
|
|
2642
|
+
}
|
|
2643
|
+
// Numeric string fallback: if key is a numeric string and failed, retry with Number(key)
|
|
2644
|
+
// This handles z.number(), z.literal([1, 2, 3]), and unions containing numeric literals
|
|
2645
|
+
const checkNumericKey = typeof key === "string" && number$1.test(key) && keyResult.issues.length;
|
|
2646
|
+
if (checkNumericKey) {
|
|
2647
|
+
const retryResult = def.keyType._zod.run({ value: Number(key), issues: [] }, ctx);
|
|
2648
|
+
if (retryResult instanceof Promise) {
|
|
2649
|
+
throw new Error("Async schemas not supported in object keys currently");
|
|
2650
|
+
}
|
|
2651
|
+
if (retryResult.issues.length === 0) {
|
|
2652
|
+
keyResult = retryResult;
|
|
2653
|
+
}
|
|
2654
|
+
}
|
|
2655
|
+
if (keyResult.issues.length) {
|
|
2656
|
+
if (def.mode === "loose") {
|
|
2657
|
+
// Pass through unchanged
|
|
2658
|
+
payload.value[key] = input[key];
|
|
2659
|
+
}
|
|
2660
|
+
else {
|
|
2661
|
+
// Default "strict" behavior: error on invalid key
|
|
2662
|
+
payload.issues.push({
|
|
2663
|
+
code: "invalid_key",
|
|
2664
|
+
origin: "record",
|
|
2665
|
+
issues: keyResult.issues.map((iss) => finalizeIssue(iss, ctx, config())),
|
|
2666
|
+
input: key,
|
|
2667
|
+
path: [key],
|
|
2668
|
+
inst,
|
|
2669
|
+
});
|
|
2670
|
+
}
|
|
2671
|
+
continue;
|
|
2672
|
+
}
|
|
2673
|
+
const result = def.valueType._zod.run({ value: input[key], issues: [] }, ctx);
|
|
2674
|
+
if (result instanceof Promise) {
|
|
2675
|
+
proms.push(result.then((result) => {
|
|
2676
|
+
if (result.issues.length) {
|
|
2677
|
+
payload.issues.push(...prefixIssues(key, result.issues));
|
|
2678
|
+
}
|
|
2679
|
+
payload.value[keyResult.value] = result.value;
|
|
2680
|
+
}));
|
|
2681
|
+
}
|
|
2682
|
+
else {
|
|
2683
|
+
if (result.issues.length) {
|
|
2684
|
+
payload.issues.push(...prefixIssues(key, result.issues));
|
|
2685
|
+
}
|
|
2686
|
+
payload.value[keyResult.value] = result.value;
|
|
2687
|
+
}
|
|
2688
|
+
}
|
|
2689
|
+
}
|
|
2690
|
+
if (proms.length) {
|
|
2691
|
+
return Promise.all(proms).then(() => payload);
|
|
2692
|
+
}
|
|
2693
|
+
return payload;
|
|
2694
|
+
};
|
|
2695
|
+
});
|
|
2575
2696
|
const $ZodEnum = /*@__PURE__*/ $constructor("$ZodEnum", (inst, def) => {
|
|
2576
2697
|
$ZodType.init(inst, def);
|
|
2577
2698
|
const values = getEnumValues(def.entries);
|
|
@@ -4155,6 +4276,49 @@ const intersectionProcessor = (schema, ctx, json, params) => {
|
|
|
4155
4276
|
];
|
|
4156
4277
|
json.allOf = allOf;
|
|
4157
4278
|
};
|
|
4279
|
+
const recordProcessor = (schema, ctx, _json, params) => {
|
|
4280
|
+
const json = _json;
|
|
4281
|
+
const def = schema._zod.def;
|
|
4282
|
+
json.type = "object";
|
|
4283
|
+
// For looseRecord with regex patterns, use patternProperties
|
|
4284
|
+
// This correctly represents "only validate keys matching the pattern" semantics
|
|
4285
|
+
// and composes well with allOf (intersections)
|
|
4286
|
+
const keyType = def.keyType;
|
|
4287
|
+
const keyBag = keyType._zod.bag;
|
|
4288
|
+
const patterns = keyBag?.patterns;
|
|
4289
|
+
if (def.mode === "loose" && patterns && patterns.size > 0) {
|
|
4290
|
+
// Use patternProperties for looseRecord with regex patterns
|
|
4291
|
+
const valueSchema = process$1(def.valueType, ctx, {
|
|
4292
|
+
...params,
|
|
4293
|
+
path: [...params.path, "patternProperties", "*"],
|
|
4294
|
+
});
|
|
4295
|
+
json.patternProperties = {};
|
|
4296
|
+
for (const pattern of patterns) {
|
|
4297
|
+
json.patternProperties[pattern.source] = valueSchema;
|
|
4298
|
+
}
|
|
4299
|
+
}
|
|
4300
|
+
else {
|
|
4301
|
+
// Default behavior: use propertyNames + additionalProperties
|
|
4302
|
+
if (ctx.target === "draft-07" || ctx.target === "draft-2020-12") {
|
|
4303
|
+
json.propertyNames = process$1(def.keyType, ctx, {
|
|
4304
|
+
...params,
|
|
4305
|
+
path: [...params.path, "propertyNames"],
|
|
4306
|
+
});
|
|
4307
|
+
}
|
|
4308
|
+
json.additionalProperties = process$1(def.valueType, ctx, {
|
|
4309
|
+
...params,
|
|
4310
|
+
path: [...params.path, "additionalProperties"],
|
|
4311
|
+
});
|
|
4312
|
+
}
|
|
4313
|
+
// Add required for keys with discrete values (enum, literal, etc.)
|
|
4314
|
+
const keyValues = keyType._zod.values;
|
|
4315
|
+
if (keyValues) {
|
|
4316
|
+
const validKeyValues = [...keyValues].filter((v) => typeof v === "string" || typeof v === "number");
|
|
4317
|
+
if (validKeyValues.length > 0) {
|
|
4318
|
+
json.required = validKeyValues;
|
|
4319
|
+
}
|
|
4320
|
+
}
|
|
4321
|
+
};
|
|
4158
4322
|
const nullableProcessor = (schema, ctx, json, params) => {
|
|
4159
4323
|
const def = schema._zod.def;
|
|
4160
4324
|
const inner = process$1(def.innerType, ctx, params);
|
|
@@ -4709,6 +4873,21 @@ function intersection(left, right) {
|
|
|
4709
4873
|
right: right,
|
|
4710
4874
|
});
|
|
4711
4875
|
}
|
|
4876
|
+
const ZodRecord = /*@__PURE__*/ $constructor("ZodRecord", (inst, def) => {
|
|
4877
|
+
$ZodRecord.init(inst, def);
|
|
4878
|
+
ZodType.init(inst, def);
|
|
4879
|
+
inst._zod.processJSONSchema = (ctx, json, params) => recordProcessor(inst, ctx, json, params);
|
|
4880
|
+
inst.keyType = def.keyType;
|
|
4881
|
+
inst.valueType = def.valueType;
|
|
4882
|
+
});
|
|
4883
|
+
function record(keyType, valueType, params) {
|
|
4884
|
+
return new ZodRecord({
|
|
4885
|
+
type: "record",
|
|
4886
|
+
keyType,
|
|
4887
|
+
valueType: valueType,
|
|
4888
|
+
...normalizeParams(params),
|
|
4889
|
+
});
|
|
4890
|
+
}
|
|
4712
4891
|
const ZodEnum = /*@__PURE__*/ $constructor("ZodEnum", (inst, def) => {
|
|
4713
4892
|
$ZodEnum.init(inst, def);
|
|
4714
4893
|
ZodType.init(inst, def);
|
|
@@ -4946,7 +5125,7 @@ const ZodCustom = /*@__PURE__*/ $constructor("ZodCustom", (inst, def) => {
|
|
|
4946
5125
|
inst._zod.processJSONSchema = (ctx, json, params) => customProcessor(inst, ctx);
|
|
4947
5126
|
});
|
|
4948
5127
|
function custom(fn, _params) {
|
|
4949
|
-
return _custom(ZodCustom, (() => true), _params);
|
|
5128
|
+
return _custom(ZodCustom, fn ?? (() => true), _params);
|
|
4950
5129
|
}
|
|
4951
5130
|
function refine(fn, _params = {}) {
|
|
4952
5131
|
return _refine(ZodCustom, fn, _params);
|
|
@@ -4961,6 +5140,19 @@ const optionalPositiveInt = number().int().positive().optional();
|
|
|
4961
5140
|
const optionalString = string().optional();
|
|
4962
5141
|
const selectOptionsSchema = array(nonEmptyString).min(1);
|
|
4963
5142
|
const optionalNumber = number().optional();
|
|
5143
|
+
const textEvaluationSourceSchema = object({
|
|
5144
|
+
type: literal('text'),
|
|
5145
|
+
});
|
|
5146
|
+
const customEvaluationSourceSchema = object({
|
|
5147
|
+
type: literal('custom'),
|
|
5148
|
+
extractorId: nonEmptyString,
|
|
5149
|
+
});
|
|
5150
|
+
const evaluationSourceExtractorSchema = custom(value => typeof value === 'function', 'Extractor must be a function.');
|
|
5151
|
+
record(string().min(1), evaluationSourceExtractorSchema);
|
|
5152
|
+
const evaluationSourceSchema = discriminatedUnion('type', [
|
|
5153
|
+
textEvaluationSourceSchema,
|
|
5154
|
+
customEvaluationSourceSchema,
|
|
5155
|
+
]);
|
|
4964
5156
|
const expectedOutcomeModeSchema = _enum(['static', 'dynamic']);
|
|
4965
5157
|
const evaluationParametersSchema = object({
|
|
4966
5158
|
approach: _enum(EvaluationApproach),
|
|
@@ -4978,6 +5170,7 @@ const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine(
|
|
|
4978
5170
|
const defaultExpectedOutcomeBaseSchema = object({
|
|
4979
5171
|
label: nonEmptyString,
|
|
4980
5172
|
placeholder: optionalString,
|
|
5173
|
+
evaluationSource: evaluationSourceSchema.optional(),
|
|
4981
5174
|
});
|
|
4982
5175
|
const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
|
|
4983
5176
|
text: baseSchema.extend({
|
|
@@ -5070,6 +5263,37 @@ function validateExpectedOutcomeSchema(schema) {
|
|
|
5070
5263
|
throw new Error(`Invalid expectedOutcomeSchema: ${parsed.error.issues[0].message}`);
|
|
5071
5264
|
}
|
|
5072
5265
|
}
|
|
5266
|
+
function validateExpectedOutcomeArrayWithExtractors(expectedOutcome, allowedExtractorIds) {
|
|
5267
|
+
const allowed = new Set(allowedExtractorIds);
|
|
5268
|
+
const schema = expectedOutcomeArraySchema.superRefine((fields, ctx) => {
|
|
5269
|
+
fields.forEach((field, index) => {
|
|
5270
|
+
if (field.evaluationSource?.type !== 'custom') {
|
|
5271
|
+
return;
|
|
5272
|
+
}
|
|
5273
|
+
if (allowed.has(field.evaluationSource.extractorId)) {
|
|
5274
|
+
return;
|
|
5275
|
+
}
|
|
5276
|
+
ctx.addIssue({
|
|
5277
|
+
code: 'custom',
|
|
5278
|
+
path: [index, 'evaluationSource', 'extractorId'],
|
|
5279
|
+
message: `Invalid expectedOutcome: Extractor "${field.evaluationSource.extractorId}" is not registered.`,
|
|
5280
|
+
});
|
|
5281
|
+
});
|
|
5282
|
+
});
|
|
5283
|
+
const parsed = schema.safeParse(expectedOutcome);
|
|
5284
|
+
if (!parsed.success) {
|
|
5285
|
+
throw new Error(parsed.error.issues[0].message);
|
|
5286
|
+
}
|
|
5287
|
+
}
|
|
5288
|
+
function getExtractorIds(extractors) {
|
|
5289
|
+
return Object.keys(extractors || {});
|
|
5290
|
+
}
|
|
5291
|
+
|
|
5292
|
+
const modelResponseMetadataSchema = record(string(), unknown());
|
|
5293
|
+
const modelResponsePayloadSchema = object({
|
|
5294
|
+
text: string().optional(),
|
|
5295
|
+
metadata: modelResponseMetadataSchema.optional(),
|
|
5296
|
+
});
|
|
5073
5297
|
|
|
5074
5298
|
const testCaseChatHistorySchema = object({
|
|
5075
5299
|
enabled: boolean(),
|
|
@@ -5086,8 +5310,8 @@ object({
|
|
|
5086
5310
|
id: string(),
|
|
5087
5311
|
question: string(),
|
|
5088
5312
|
expectedOutcome: expectedOutcomeArraySchema,
|
|
5313
|
+
output: modelResponsePayloadSchema.optional(),
|
|
5089
5314
|
chatHistory: testCaseChatHistorySchema,
|
|
5090
|
-
output: string().optional(),
|
|
5091
5315
|
isRunning: boolean().optional(),
|
|
5092
5316
|
error: string().optional(),
|
|
5093
5317
|
evaluationResult: custom().optional(),
|
|
@@ -5109,10 +5333,15 @@ function validateTestCaseInputArray(data) {
|
|
|
5109
5333
|
* @param jsonContent - The JSON string to parse and validate
|
|
5110
5334
|
* @returns Validation result with test cases or error message
|
|
5111
5335
|
*/
|
|
5112
|
-
function importTestSuite(jsonContent) {
|
|
5336
|
+
function importTestSuite(jsonContent, allowedExtractorIds = []) {
|
|
5113
5337
|
try {
|
|
5114
5338
|
const parsed = JSON.parse(jsonContent);
|
|
5115
5339
|
validateTestCaseInputArray(parsed);
|
|
5340
|
+
if (allowedExtractorIds.length > 0) {
|
|
5341
|
+
parsed.forEach((testCase) => {
|
|
5342
|
+
validateExpectedOutcomeArrayWithExtractors(testCase.expectedOutcome, allowedExtractorIds);
|
|
5343
|
+
});
|
|
5344
|
+
}
|
|
5116
5345
|
const testCases = parsed.map((item, index) => {
|
|
5117
5346
|
try {
|
|
5118
5347
|
return createTestCaseFromInput(item);
|
|
@@ -5138,7 +5367,7 @@ function importTestSuite(jsonContent) {
|
|
|
5138
5367
|
}
|
|
5139
5368
|
|
|
5140
5369
|
const MISSING_RESOLVER_MESSAGE = 'resolveExpectedOutcome is required when a test case has dynamic expected outcomes.';
|
|
5141
|
-
function isDynamicTextareaField(field) {
|
|
5370
|
+
function isDynamicTextareaField$1(field) {
|
|
5142
5371
|
return field.type === 'textarea' && field.outcomeMode === 'dynamic';
|
|
5143
5372
|
}
|
|
5144
5373
|
function applyResolvedDynamicValues(testCase, resolvedValues) {
|
|
@@ -5148,7 +5377,7 @@ function applyResolvedDynamicValues(testCase, resolvedValues) {
|
|
|
5148
5377
|
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5149
5378
|
for (const resolved of resolvedValues) {
|
|
5150
5379
|
const field = expectedOutcome[resolved.index];
|
|
5151
|
-
if (!field || !isDynamicTextareaField(field)) {
|
|
5380
|
+
if (!field || !isDynamicTextareaField$1(field)) {
|
|
5152
5381
|
continue;
|
|
5153
5382
|
}
|
|
5154
5383
|
expectedOutcome[resolved.index] = {
|
|
@@ -5163,7 +5392,7 @@ function applyResolvedDynamicValues(testCase, resolvedValues) {
|
|
|
5163
5392
|
}
|
|
5164
5393
|
async function resolveDynamicExpectedOutcomes(testCase, resolver) {
|
|
5165
5394
|
const dynamicFields = (testCase.expectedOutcome || []).flatMap((field, index) => {
|
|
5166
|
-
if (!isDynamicTextareaField(field)) {
|
|
5395
|
+
if (!isDynamicTextareaField$1(field)) {
|
|
5167
5396
|
return [];
|
|
5168
5397
|
}
|
|
5169
5398
|
return [{ field, index }];
|
|
@@ -5181,6 +5410,15 @@ async function resolveDynamicExpectedOutcomes(testCase, resolver) {
|
|
|
5181
5410
|
return applyResolvedDynamicValues(testCase, resolvedValues);
|
|
5182
5411
|
}
|
|
5183
5412
|
|
|
5413
|
+
function isChipsInputField(field) {
|
|
5414
|
+
return field.type === 'chips-input';
|
|
5415
|
+
}
|
|
5416
|
+
function isTextareaField(field) {
|
|
5417
|
+
return field.type === 'textarea';
|
|
5418
|
+
}
|
|
5419
|
+
function isDynamicTextareaField(field) {
|
|
5420
|
+
return isTextareaField(field) && field.outcomeMode === 'dynamic';
|
|
5421
|
+
}
|
|
5184
5422
|
function applyExpectedOutcomeChange(testCase, change) {
|
|
5185
5423
|
const { index } = change;
|
|
5186
5424
|
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
@@ -5188,73 +5426,99 @@ function applyExpectedOutcomeChange(testCase, change) {
|
|
|
5188
5426
|
if (!target) {
|
|
5189
5427
|
return testCase;
|
|
5190
5428
|
}
|
|
5429
|
+
const commit = (updatedField) => {
|
|
5430
|
+
expectedOutcome[index] = updatedField;
|
|
5431
|
+
return { ...testCase, expectedOutcome };
|
|
5432
|
+
};
|
|
5191
5433
|
switch (change.operation) {
|
|
5192
5434
|
case 'set-value': {
|
|
5193
|
-
if (target
|
|
5435
|
+
if (isChipsInputField(target)) {
|
|
5194
5436
|
return testCase;
|
|
5195
5437
|
}
|
|
5196
|
-
if (target
|
|
5438
|
+
if (isDynamicTextareaField(target)) {
|
|
5197
5439
|
return testCase;
|
|
5198
5440
|
}
|
|
5199
|
-
|
|
5441
|
+
return commit({
|
|
5200
5442
|
...target,
|
|
5201
5443
|
value: change.value,
|
|
5202
|
-
};
|
|
5203
|
-
return { ...testCase, expectedOutcome };
|
|
5444
|
+
});
|
|
5204
5445
|
}
|
|
5205
5446
|
case 'add-chip': {
|
|
5206
|
-
if (target
|
|
5447
|
+
if (!isChipsInputField(target)) {
|
|
5207
5448
|
return testCase;
|
|
5208
5449
|
}
|
|
5209
|
-
|
|
5450
|
+
return commit({
|
|
5210
5451
|
...target,
|
|
5211
5452
|
value: [...target.value, change.value],
|
|
5212
|
-
};
|
|
5213
|
-
return { ...testCase, expectedOutcome };
|
|
5453
|
+
});
|
|
5214
5454
|
}
|
|
5215
5455
|
case 'remove-chip': {
|
|
5216
|
-
if (target
|
|
5456
|
+
if (!isChipsInputField(target)) {
|
|
5217
5457
|
return testCase;
|
|
5218
5458
|
}
|
|
5219
|
-
|
|
5459
|
+
return commit({
|
|
5220
5460
|
...target,
|
|
5221
5461
|
value: target.value.filter(chip => chip !== change.value),
|
|
5222
|
-
};
|
|
5223
|
-
return { ...testCase, expectedOutcome };
|
|
5462
|
+
});
|
|
5224
5463
|
}
|
|
5225
5464
|
case 'set-evaluation-approach':
|
|
5226
5465
|
return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
|
|
5227
5466
|
case 'set-outcome-mode': {
|
|
5228
|
-
if (target
|
|
5467
|
+
if (!isTextareaField(target)) {
|
|
5229
5468
|
return testCase;
|
|
5230
5469
|
}
|
|
5231
5470
|
const mode = change.value;
|
|
5232
5471
|
if (mode === 'static') {
|
|
5233
5472
|
const { resolutionQuery: _, ...rest } = target;
|
|
5234
|
-
|
|
5473
|
+
return commit({
|
|
5235
5474
|
...rest,
|
|
5236
5475
|
outcomeMode: 'static',
|
|
5237
5476
|
value: '',
|
|
5238
|
-
};
|
|
5477
|
+
});
|
|
5239
5478
|
}
|
|
5240
5479
|
else {
|
|
5241
|
-
|
|
5480
|
+
return commit({
|
|
5242
5481
|
...target,
|
|
5243
5482
|
outcomeMode: 'dynamic',
|
|
5244
5483
|
value: '',
|
|
5245
|
-
};
|
|
5484
|
+
});
|
|
5246
5485
|
}
|
|
5247
|
-
return { ...testCase, expectedOutcome };
|
|
5248
5486
|
}
|
|
5249
5487
|
case 'set-resolution-query': {
|
|
5250
|
-
if (target
|
|
5488
|
+
if (!isDynamicTextareaField(target)) {
|
|
5251
5489
|
return testCase;
|
|
5252
5490
|
}
|
|
5253
|
-
|
|
5491
|
+
return commit({
|
|
5254
5492
|
...target,
|
|
5255
5493
|
resolutionQuery: change.value,
|
|
5256
|
-
};
|
|
5257
|
-
|
|
5494
|
+
});
|
|
5495
|
+
}
|
|
5496
|
+
case 'set-evaluation-source-type': {
|
|
5497
|
+
if (change.value === 'text') {
|
|
5498
|
+
return commit({
|
|
5499
|
+
...target,
|
|
5500
|
+
evaluationSource: { type: 'text' },
|
|
5501
|
+
});
|
|
5502
|
+
}
|
|
5503
|
+
const extractorId = target.evaluationSource?.type === 'custom'
|
|
5504
|
+
? target.evaluationSource.extractorId
|
|
5505
|
+
: (change.fallbackExtractorId ?? '');
|
|
5506
|
+
return commit({
|
|
5507
|
+
...target,
|
|
5508
|
+
evaluationSource: {
|
|
5509
|
+
type: 'custom',
|
|
5510
|
+
extractorId,
|
|
5511
|
+
},
|
|
5512
|
+
});
|
|
5513
|
+
}
|
|
5514
|
+
case 'set-evaluation-source-extractor': {
|
|
5515
|
+
return commit({
|
|
5516
|
+
...target,
|
|
5517
|
+
evaluationSource: {
|
|
5518
|
+
type: 'custom',
|
|
5519
|
+
extractorId: change.value,
|
|
5520
|
+
},
|
|
5521
|
+
});
|
|
5258
5522
|
}
|
|
5259
5523
|
}
|
|
5260
5524
|
}
|
|
@@ -30035,7 +30299,7 @@ class LLMEvaluationEngine {
|
|
|
30035
30299
|
const fieldRequest = {
|
|
30036
30300
|
testCaseId: request.testCaseId,
|
|
30037
30301
|
question: request.question,
|
|
30038
|
-
actualResponse:
|
|
30302
|
+
actualResponse: field.actualResponse,
|
|
30039
30303
|
expectedOutcome: field.expectedValue,
|
|
30040
30304
|
evaluationParameters: field.evaluationParameters,
|
|
30041
30305
|
};
|
|
@@ -30105,6 +30369,58 @@ class LLMEvaluationEngine {
|
|
|
30105
30369
|
}
|
|
30106
30370
|
}
|
|
30107
30371
|
|
|
30372
|
+
function toTextSource() {
|
|
30373
|
+
return { type: 'text' };
|
|
30374
|
+
}
|
|
30375
|
+
async function resolveActualValue(field, output, extractors) {
|
|
30376
|
+
const source = field.evaluationSource || toTextSource();
|
|
30377
|
+
if (source.type === 'text') {
|
|
30378
|
+
const text = output?.text?.trim();
|
|
30379
|
+
if (!text) {
|
|
30380
|
+
return {
|
|
30381
|
+
success: false,
|
|
30382
|
+
error: 'Model response text is empty.',
|
|
30383
|
+
};
|
|
30384
|
+
}
|
|
30385
|
+
return { success: true, value: text };
|
|
30386
|
+
}
|
|
30387
|
+
const extractor = extractors?.[source.extractorId];
|
|
30388
|
+
if (!extractor) {
|
|
30389
|
+
return {
|
|
30390
|
+
success: false,
|
|
30391
|
+
error: `Extractor "${source.extractorId}" is not registered.`,
|
|
30392
|
+
};
|
|
30393
|
+
}
|
|
30394
|
+
try {
|
|
30395
|
+
const extractedRaw = await extractor(output || {});
|
|
30396
|
+
if (typeof extractedRaw !== 'string') {
|
|
30397
|
+
return {
|
|
30398
|
+
success: false,
|
|
30399
|
+
error: `Extractor "${source.extractorId}" must return a string.`,
|
|
30400
|
+
};
|
|
30401
|
+
}
|
|
30402
|
+
const extracted = extractedRaw.trim();
|
|
30403
|
+
if (!extracted) {
|
|
30404
|
+
return {
|
|
30405
|
+
success: false,
|
|
30406
|
+
error: `Extractor "${source.extractorId}" returned an empty value.`,
|
|
30407
|
+
};
|
|
30408
|
+
}
|
|
30409
|
+
return {
|
|
30410
|
+
success: true,
|
|
30411
|
+
value: extracted,
|
|
30412
|
+
};
|
|
30413
|
+
}
|
|
30414
|
+
catch (error) {
|
|
30415
|
+
return {
|
|
30416
|
+
success: false,
|
|
30417
|
+
error: error instanceof Error
|
|
30418
|
+
? error.message
|
|
30419
|
+
: `Extractor "${source.extractorId}" failed.`,
|
|
30420
|
+
};
|
|
30421
|
+
}
|
|
30422
|
+
}
|
|
30423
|
+
|
|
30108
30424
|
/**
|
|
30109
30425
|
* Service for evaluating test case responses
|
|
30110
30426
|
*/
|
|
@@ -30118,34 +30434,71 @@ class EvaluationService {
|
|
|
30118
30434
|
* @param testCase - The test case to evaluate
|
|
30119
30435
|
* @param onResult - Callback to handle the evaluation result
|
|
30120
30436
|
*/
|
|
30121
|
-
async evaluateTestCase(testCase, onResult) {
|
|
30122
|
-
|
|
30123
|
-
|
|
30124
|
-
|
|
30125
|
-
}
|
|
30126
|
-
const fields = (testCase.expectedOutcome || []).flatMap((field, index) => {
|
|
30437
|
+
async evaluateTestCase(testCase, onResult, extractors) {
|
|
30438
|
+
const fields = [];
|
|
30439
|
+
const failedFields = [];
|
|
30440
|
+
for (const [index, field] of (testCase.expectedOutcome || []).entries()) {
|
|
30127
30441
|
if (field.type === 'textarea' && field.outcomeMode === 'dynamic') {
|
|
30128
|
-
|
|
30442
|
+
continue;
|
|
30129
30443
|
}
|
|
30130
|
-
|
|
30131
|
-
|
|
30444
|
+
const evaluationParameters = normalizeEvaluationParametersForField(field.type, field.evaluationParameters);
|
|
30445
|
+
const expectedValue = getFieldExpectedValue(field);
|
|
30446
|
+
const resolvedActualValue = await resolveActualValue(field, testCase.output, extractors);
|
|
30447
|
+
if (resolvedActualValue.success) {
|
|
30448
|
+
fields.push({
|
|
30132
30449
|
index,
|
|
30133
30450
|
label: field.label,
|
|
30134
30451
|
type: field.type,
|
|
30135
|
-
expectedValue
|
|
30136
|
-
|
|
30137
|
-
|
|
30138
|
-
|
|
30139
|
-
|
|
30452
|
+
expectedValue,
|
|
30453
|
+
actualResponse: resolvedActualValue.value,
|
|
30454
|
+
evaluationParameters,
|
|
30455
|
+
});
|
|
30456
|
+
}
|
|
30457
|
+
else {
|
|
30458
|
+
failedFields.push({
|
|
30459
|
+
index,
|
|
30460
|
+
label: field.label,
|
|
30461
|
+
type: field.type,
|
|
30462
|
+
expectedValue,
|
|
30463
|
+
passed: false,
|
|
30464
|
+
keywordMatches: [],
|
|
30465
|
+
evaluationParameters,
|
|
30466
|
+
evaluationApproachResult: {
|
|
30467
|
+
score: 0,
|
|
30468
|
+
approachUsed: evaluationParameters.approach,
|
|
30469
|
+
},
|
|
30470
|
+
error: 'error' in resolvedActualValue
|
|
30471
|
+
? resolvedActualValue.error
|
|
30472
|
+
: 'Failed to resolve actual value.',
|
|
30473
|
+
});
|
|
30474
|
+
}
|
|
30475
|
+
}
|
|
30476
|
+
if (fields.length === 0) {
|
|
30477
|
+
if (failedFields.length === 0) {
|
|
30478
|
+
console.warn('⚠️ No evaluable fields for test case:', testCase.id);
|
|
30479
|
+
return;
|
|
30480
|
+
}
|
|
30481
|
+
onResult({
|
|
30482
|
+
testCaseId: testCase.id,
|
|
30483
|
+
passed: false,
|
|
30484
|
+
keywordMatches: [],
|
|
30485
|
+
fieldResults: failedFields,
|
|
30486
|
+
timestamp: new Date().toISOString(),
|
|
30487
|
+
});
|
|
30488
|
+
return;
|
|
30489
|
+
}
|
|
30140
30490
|
const evaluationRequest = {
|
|
30141
30491
|
testCaseId: testCase.id,
|
|
30142
30492
|
question: testCase.question,
|
|
30143
|
-
actualResponse: testCase.output,
|
|
30144
30493
|
fields,
|
|
30145
30494
|
};
|
|
30146
30495
|
await this.engine.evaluateResponse(evaluationRequest, (result) => {
|
|
30147
|
-
|
|
30148
|
-
onResult(
|
|
30496
|
+
const combinedResults = [...(result.fieldResults || []), ...failedFields].sort((a, b) => a.index - b.index);
|
|
30497
|
+
onResult({
|
|
30498
|
+
...result,
|
|
30499
|
+
passed: combinedResults.every(field => field.passed && !field.error),
|
|
30500
|
+
fieldResults: combinedResults,
|
|
30501
|
+
});
|
|
30149
30502
|
});
|
|
30150
30503
|
}
|
|
30151
30504
|
}
|
|
@@ -30193,7 +30546,7 @@ const LLMTestRunnerHeader = ({ isExportingTestSuite, isExportingTestResults, isR
|
|
|
30193
30546
|
};
|
|
30194
30547
|
|
|
30195
30548
|
const ResponseOutput = ({ output, isRunning, }) => {
|
|
30196
|
-
return (index.h("div", { class: "response-output" }, output ? (index.h("div", { class: "response-output__content" }, output)) : (index.h("div", { class: "response-output__placeholder" }, isRunning ? 'Running...' : ''))));
|
|
30549
|
+
return (index.h("div", { class: "response-output" }, output?.text ? (index.h("div", { class: "response-output__content" }, output.text)) : (index.h("div", { class: "response-output__placeholder" }, isRunning ? 'Running...' : ''))));
|
|
30197
30550
|
};
|
|
30198
30551
|
|
|
30199
30552
|
const EvaluationSummary = ({ result, isRunning, }) => {
|
|
@@ -30231,7 +30584,9 @@ var FormFieldType;
|
|
|
30231
30584
|
FormFieldType["SELECT"] = "select";
|
|
30232
30585
|
})(FormFieldType || (FormFieldType = {}));
|
|
30233
30586
|
|
|
30234
|
-
const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupported = false, onExpectedOutcomeChange, }) => {
|
|
30587
|
+
const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupported = false, extractorIds = [], onExpectedOutcomeChange, }) => {
|
|
30588
|
+
const hasExtractorOptions = extractorIds.length > 0;
|
|
30589
|
+
const firstExtractorId = extractorIds[0];
|
|
30235
30590
|
const emit = (detail) => onExpectedOutcomeChange({
|
|
30236
30591
|
detail,
|
|
30237
30592
|
});
|
|
@@ -30261,6 +30616,23 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30261
30616
|
required: false,
|
|
30262
30617
|
rows: 2,
|
|
30263
30618
|
});
|
|
30619
|
+
const buildEvaluationSourceConfig = (index) => ({
|
|
30620
|
+
name: `expectedOutcomeEvaluationSource-${index}`,
|
|
30621
|
+
fieldType: FormFieldType.SELECT,
|
|
30622
|
+
label: 'Evaluation Source',
|
|
30623
|
+
placeholder: 'Select evaluation source',
|
|
30624
|
+
required: true,
|
|
30625
|
+
optionList: ['text', 'custom'],
|
|
30626
|
+
defaultValue: 'text',
|
|
30627
|
+
});
|
|
30628
|
+
const buildExtractorConfig = (index) => ({
|
|
30629
|
+
name: `expectedOutcomeEvaluationSourceExtractor-${index}`,
|
|
30630
|
+
fieldType: FormFieldType.SELECT,
|
|
30631
|
+
label: 'Extractor',
|
|
30632
|
+
placeholder: 'Select extractor',
|
|
30633
|
+
required: true,
|
|
30634
|
+
optionList: extractorIds,
|
|
30635
|
+
});
|
|
30264
30636
|
const renderEvaluationSelector = (field, index$1) => {
|
|
30265
30637
|
const optionList = getAllowedApproachesForFieldType(field.type);
|
|
30266
30638
|
return (index.h("app-select", { config: buildEvaluationConfig(index$1, optionList), value: field.evaluationParameters?.approach, onValueChange: (e) => emit({
|
|
@@ -30270,6 +30642,27 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30270
30642
|
value: e.detail.value,
|
|
30271
30643
|
}) }));
|
|
30272
30644
|
};
|
|
30645
|
+
const renderEvaluationSourceSelector = (field, index$1) => {
|
|
30646
|
+
if (!hasExtractorOptions) {
|
|
30647
|
+
return null;
|
|
30648
|
+
}
|
|
30649
|
+
const sourceType = field.evaluationSource?.type || 'text';
|
|
30650
|
+
return (index.h("div", null, index.h("app-select", { config: buildEvaluationSourceConfig(index$1), value: sourceType, onValueChange: (e) => emit({
|
|
30651
|
+
testCaseId,
|
|
30652
|
+
index: index$1,
|
|
30653
|
+
operation: 'set-evaluation-source-type',
|
|
30654
|
+
value: e.detail.value,
|
|
30655
|
+
fallbackExtractorId: firstExtractorId,
|
|
30656
|
+
}) }), sourceType === 'custom' && (index.h("app-select", { config: buildExtractorConfig(index$1), value: field.evaluationSource?.type === 'custom'
|
|
30657
|
+
? field.evaluationSource.extractorId
|
|
30658
|
+
: '', onValueChange: (e) => emit({
|
|
30659
|
+
testCaseId,
|
|
30660
|
+
index: index$1,
|
|
30661
|
+
operation: 'set-evaluation-source-extractor',
|
|
30662
|
+
value: e.detail.value,
|
|
30663
|
+
}) }))));
|
|
30664
|
+
};
|
|
30665
|
+
const renderEvaluationOptions = (field, index$1) => (index.h("details", { class: "expected-outcome-renderer__options" }, index.h("summary", { class: "expected-outcome-renderer__options-summary" }, "More options"), index.h("div", { class: "expected-outcome-renderer__options-content" }, renderEvaluationSelector(field, index$1), renderEvaluationSourceSelector(field, index$1))));
|
|
30273
30666
|
return (index.h("div", { class: "expected-outcome-renderer" }, (fields || []).map((field, index$1) => {
|
|
30274
30667
|
if (field.type === 'textarea') {
|
|
30275
30668
|
const isDynamic = dynamicResolutionSupported && field.outcomeMode === 'dynamic';
|
|
@@ -30301,7 +30694,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30301
30694
|
index: index$1,
|
|
30302
30695
|
operation: 'set-resolution-query',
|
|
30303
30696
|
value: e.detail.value,
|
|
30304
|
-
}) })), !isDynamic &&
|
|
30697
|
+
}) })), !isDynamic && renderEvaluationOptions(field, index$1)));
|
|
30305
30698
|
}
|
|
30306
30699
|
if (field.type === 'chips-input') {
|
|
30307
30700
|
const config = {
|
|
@@ -30321,7 +30714,7 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30321
30714
|
index: index$1,
|
|
30322
30715
|
operation: 'remove-chip',
|
|
30323
30716
|
value: e.detail.value,
|
|
30324
|
-
}) }),
|
|
30717
|
+
}) }), renderEvaluationOptions(field, index$1)));
|
|
30325
30718
|
}
|
|
30326
30719
|
if (field.type === 'select') {
|
|
30327
30720
|
const config = {
|
|
@@ -30337,18 +30730,18 @@ const ExpectedOutcomeRenderer = ({ testCaseId, fields, dynamicResolutionSupporte
|
|
|
30337
30730
|
index: index$1,
|
|
30338
30731
|
operation: 'set-value',
|
|
30339
30732
|
value: e.detail.value,
|
|
30340
|
-
}) }),
|
|
30733
|
+
}) }), renderEvaluationOptions(field, index$1)));
|
|
30341
30734
|
}
|
|
30342
30735
|
return (index.h("div", { class: "expected-outcome-renderer__group" }, index.h("div", { class: "expected-outcome-renderer__text" }, index.h("label", null, field.label), index.h("input", { type: "text", value: field.value, placeholder: field.placeholder, onInput: (e) => emit({
|
|
30343
30736
|
testCaseId,
|
|
30344
30737
|
index: index$1,
|
|
30345
30738
|
operation: 'set-value',
|
|
30346
30739
|
value: e.target.value,
|
|
30347
|
-
}) })),
|
|
30740
|
+
}) })), renderEvaluationOptions(field, index$1)));
|
|
30348
30741
|
})));
|
|
30349
30742
|
};
|
|
30350
30743
|
|
|
30351
|
-
const LLMTestCaseRow = ({ testCase, dynamicResolutionSupported = false, onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
30744
|
+
const LLMTestCaseRow = ({ testCase, dynamicResolutionSupported = false, extractorIds = [], onRun, onDelete, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
30352
30745
|
const questionConfig = {
|
|
30353
30746
|
name: 'question',
|
|
30354
30747
|
fieldType: FormFieldType.TEXT_AREA,
|
|
@@ -30374,11 +30767,11 @@ const LLMTestCaseRow = ({ testCase, dynamicResolutionSupported = false, onRun, o
|
|
|
30374
30767
|
value,
|
|
30375
30768
|
},
|
|
30376
30769
|
});
|
|
30377
|
-
} }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], dynamicResolutionSupported: dynamicResolutionSupported, onExpectedOutcomeChange: onExpectedOutcomeChange })), index.h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), index.h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), index.h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
30770
|
+
} }), index.h(ExpectedOutcomeRenderer, { testCaseId: testCase.id, fields: testCase.expectedOutcome || [], dynamicResolutionSupported: dynamicResolutionSupported, extractorIds: extractorIds, onExpectedOutcomeChange: onExpectedOutcomeChange })), index.h(ResponseOutput, { output: testCase.output, isRunning: testCase.isRunning }), index.h(EvaluationSummary, { result: testCase.evaluationResult, isRunning: testCase.isRunning }), index.h(RowActions, { isRunning: testCase.isRunning, canRun: !!testCase.question.trim(), onRun: () => onRun(testCase), onDelete: () => onDelete(testCase.id) })));
|
|
30378
30771
|
};
|
|
30379
30772
|
|
|
30380
|
-
const LLMTestCases = ({ testCases, dynamicResolutionSupported = false, onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
30381
|
-
return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, dynamicResolutionSupported: dynamicResolutionSupported, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange, onChatHistoryChange: onChatHistoryChange }))), index.h("div", { class: "test-cases__add-section" }, index.h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
30773
|
+
const LLMTestCases = ({ testCases, dynamicResolutionSupported = false, extractorIds = [], onRun, onDelete, onAddTestCase, handleTestCaseChange, onExpectedOutcomeChange, onChatHistoryChange, }) => {
|
|
30774
|
+
return (index.h("div", { class: "test-cases" }, index.h("div", { class: "test-cases__column-headers" }, index.h("div", { class: "test-cases__column-header" }, "Input"), index.h("div", { class: "test-cases__column-header" }, "Output"), index.h("div", { class: "test-cases__column-header" }, "Evaluation"), index.h("div", { class: "test-cases__column-header" }, "Actions")), testCases.map(testCase => (index.h(LLMTestCaseRow, { testCase: testCase, dynamicResolutionSupported: dynamicResolutionSupported, extractorIds: extractorIds, onRun: onRun, onDelete: onDelete, handleTestCaseChange: handleTestCaseChange, onExpectedOutcomeChange: onExpectedOutcomeChange, onChatHistoryChange: onChatHistoryChange }))), index.h("div", { class: "test-cases__add-section" }, index.h(Button, { variant: "outline", size: "md", onClick: onAddTestCase }, "+ Add Question"))));
|
|
30382
30775
|
};
|
|
30383
30776
|
|
|
30384
30777
|
const tokensCss = () => `:host{--spacing:0.25rem;--spacing-1:calc(var(--spacing) * 1);--spacing-2:calc(var(--spacing) * 2);--spacing-3:calc(var(--spacing) * 3);--spacing-4:calc(var(--spacing) * 4);--spacing-5:calc(var(--spacing) * 5);--spacing-6:calc(var(--spacing) * 6);--spacing-8:calc(var(--spacing) * 8);--spacing-10:calc(var(--spacing) * 10);--spacing-12:calc(var(--spacing) * 12);--spacing-16:calc(var(--spacing) * 16);--spacing-20:calc(var(--spacing) * 20);--spacing-24:calc(var(--spacing) * 24);--radius-none:0;--radius-sm:0.125rem;--radius-md:0.375rem;--radius-lg:0.5rem;--radius-xl:0.75rem;--radius-2xl:1rem;--radius-3xl:1.5rem;--radius-full:9999px;--radius:var(--radius-lg);--font-size-xs:0.75rem;--font-size-sm:0.875rem;--font-size-base:1rem;--font-size-lg:1.125rem;--font-size-xl:1.25rem;--font-size-2xl:1.5rem;--font-size-3xl:1.875rem;--font-size-4xl:2.25rem;--font-weight-normal:400;--font-weight-medium:500;--font-weight-semibold:600;--font-weight-bold:700;--line-height-none:1;--line-height-tight:1.25;--line-height-snug:1.375;--line-height-normal:1.5;--line-height-relaxed:1.625;--line-height-loose:2;--letter-spacing-tight:-0.025em;--letter-spacing-normal:0;--letter-spacing-wide:0.05em;--shadow-sm:0 1px 2px 0 rgba(0, 0, 0, 0.05);--shadow-md:0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);--shadow-lg:0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);--shadow-xl:0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);--shadow-2xl:0 25px 50px -12px rgba(0, 0, 0, 0.25);--border-width:1px;--z-base:0;--z-dropdown:1000;--z-sticky:1100;--z-modal:1200;--z-popover:1300;--z-tooltip:1400;--opacity-disabled:0.5;--opacity-hover:0.8;--opacity-muted:0.6;--max-w-sm:24rem;--max-w-md:28rem;--max-w-lg:32rem;--max-w-xl:42rem;--max-w-2xl:48rem;--max-w-full:100%;--breakpoint-sm:640px;--breakpoint-md:768px;--breakpoint-lg:1024px;--breakpoint-xl:1280px;--breakpoint-2xl:1536px;--background:#ffffff;--foreground:#0a0a0a;--card:#ffffff;--card-foreground:#0a0a0a;--popover:#ffffff;--popover-foreground:#0a0a0a;--primary:#0a0a0a;--primary-foreground:#fafafa;--secondary:#f4f4f5;--secondary-foreground:#0a0a0a;--muted:#f4f4f5;--muted-foreground:#71717a;--accent:#f4f4f5;--accent-foreground:#0a0a0a;--destructive:#ef4444;--destructive-foreground:#fafafa;--border:#e4e4e7;--input:#e4e4e7;--ring:#3b82f6;--success:#10b981;--success-foreground:#fafafa;--warning:#f59e0b;--warning-foreground:#fafafa;--info:#3b82f6;--info-foreground:#fafafa}:host([data-theme='dark']){--background:#0a0a0a;--foreground:#fafafa;--card:#171717;--card-foreground:#fafafa;--popover:#171717;--popover-foreground:#fafafa;--primary:#fafafa;--primary-foreground:#0a0a0a;--secondary:#27272a;--secondary-foreground:#fafafa;--muted:#27272a;--muted-foreground:#a1a1aa;--accent:#27272a;--accent-foreground:#fafafa;--destructive:#dc2626;--destructive-foreground:#fafafa;--border:#27272a;--input:#27272a;--ring:#3b82f6;--success:#059669;--success-foreground:#fafafa;--warning:#d97706;--warning-foreground:#fafafa;--info:#2563eb;--info-foreground:#fafafa}`;
|
|
@@ -30389,7 +30782,7 @@ const llmTestRunnerHeaderCss = () => `.test-runner-header{display:flex;justify-c
|
|
|
30389
30782
|
|
|
30390
30783
|
const llmTestCasesCss = () => `.test-cases{background:var(--background)}.test-cases__column-headers{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);background:var(--border);border-bottom:2px solid var(--border)}.test-cases__column-header{background:var(--muted);padding:var(--spacing-4) var(--spacing-5);font-weight:var(--font-weight-semibold);color:var(--foreground);font-size:var(--font-size-sm);text-transform:uppercase;letter-spacing:var(--letter-spacing-wide)}.test-cases__add-section{padding:var(--spacing-6);text-align:center;background:var(--muted);border-top:var(--border-width) solid var(--border)}@media (max-width: 1200px){.test-cases__column-headers{display:none}}`;
|
|
30391
30784
|
|
|
30392
|
-
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30785
|
+
const llmTestCaseRowCss = () => `.test-case-row{display:grid;grid-template-columns:1fr 1.5fr 0.5fr 120px;gap:var(--border-width);border-bottom:var(--border-width) solid var(--border);min-height:200px}.test-case-row:hover{background:var(--muted)}.test-case-row__input-column{padding:var(--spacing-5);background:var(--background);border-right:var(--border-width) solid var(--border)}.expected-outcome-renderer{display:flex;flex-direction:column;gap:var(--spacing-4);margin-top:var(--spacing-4)}.expected-outcome-renderer__group{display:flex;flex-direction:column;gap:var(--spacing-2);padding:var(--spacing-3);border:var(--border-width) solid var(--border);border-radius:var(--radius-md);background:var(--background)}.expected-outcome-renderer__options{border:var(--border-width) solid var(--border);border-radius:var(--radius-sm);background:var(--muted)}.expected-outcome-renderer__options-summary{cursor:pointer;font-size:var(--font-size-sm);color:var(--foreground);padding:var(--spacing-2) var(--spacing-3);user-select:none}.expected-outcome-renderer__options-content{display:flex;flex-direction:column;gap:var(--spacing-2);padding:0 var(--spacing-3) var(--spacing-3)}@media (max-width: 1200px){.test-case-row{grid-template-columns:1fr;gap:0}.test-case-row__input-column{border-right:none;border-bottom:var(--border-width) solid var(--border)}}@media (max-width: 768px){.test-case-row__input-column{padding:var(--spacing-4)}.test-case-row{min-height:auto}}`;
|
|
30393
30786
|
|
|
30394
30787
|
const rowActionsCss = () => `.row-actions{height:100%;padding:var(--spacing-5);background:var(--background);display:flex;flex-direction:column;gap:var(--spacing-3);align-items:center;justify-content:flex-start;align-self:flex-start}@media (max-width: 1200px){.row-actions{border-right:none;border-bottom:var(--border-width) solid var(--border);flex-direction:row;justify-content:center}}@media (max-width: 768px){.row-actions{padding:var(--spacing-4)}}`;
|
|
30395
30788
|
|
|
@@ -30415,6 +30808,7 @@ const LLMTestRunner = class {
|
|
|
30415
30808
|
useSave = false;
|
|
30416
30809
|
usePromptEditor = false;
|
|
30417
30810
|
resolveExpectedOutcome;
|
|
30811
|
+
evaluationSourceExtractors;
|
|
30418
30812
|
initialTestCases;
|
|
30419
30813
|
defaultExpectedOutcomeSchema;
|
|
30420
30814
|
testCases = [
|
|
@@ -30451,6 +30845,12 @@ const LLMTestRunner = class {
|
|
|
30451
30845
|
// Initialize testCases from prop if provided
|
|
30452
30846
|
if (this.initialTestCases !== undefined) {
|
|
30453
30847
|
validateTestCaseInputArray(this.initialTestCases);
|
|
30848
|
+
const extractorIds = getExtractorIds(this.evaluationSourceExtractors);
|
|
30849
|
+
if (extractorIds.length > 0) {
|
|
30850
|
+
this.initialTestCases.forEach(testCase => {
|
|
30851
|
+
validateExpectedOutcomeArrayWithExtractors(testCase.expectedOutcome, extractorIds);
|
|
30852
|
+
});
|
|
30853
|
+
}
|
|
30454
30854
|
this.testCases = this.initialTestCases.map((rawTestCase, index) => {
|
|
30455
30855
|
try {
|
|
30456
30856
|
return createTestCaseFromInput(rawTestCase);
|
|
@@ -30474,8 +30874,6 @@ const LLMTestRunner = class {
|
|
|
30474
30874
|
this.testCases = [];
|
|
30475
30875
|
}
|
|
30476
30876
|
}
|
|
30477
|
-
componentDidLoad() { }
|
|
30478
|
-
disconnectedCallback() { }
|
|
30479
30877
|
async resetSavingState() {
|
|
30480
30878
|
this.isSaving = false;
|
|
30481
30879
|
}
|
|
@@ -30508,7 +30906,7 @@ const LLMTestRunner = class {
|
|
|
30508
30906
|
updateTestCase(id, updates) {
|
|
30509
30907
|
this.testCases = this.testCases.map(tc => tc.id === id ? { ...tc, ...updates } : tc);
|
|
30510
30908
|
}
|
|
30511
|
-
|
|
30909
|
+
requestLlmResponse(testCase) {
|
|
30512
30910
|
return new Promise((resolve, reject) => {
|
|
30513
30911
|
const payload = {
|
|
30514
30912
|
prompt: testCase.question,
|
|
@@ -30531,14 +30929,14 @@ const LLMTestRunner = class {
|
|
|
30531
30929
|
const startTime = Date.now();
|
|
30532
30930
|
this.updateTestCase(testCase.id, { isRunning: true });
|
|
30533
30931
|
const [llmSettled, resolutionSettled] = await Promise.allSettled([
|
|
30534
|
-
this.
|
|
30932
|
+
this.requestLlmResponse(testCase),
|
|
30535
30933
|
resolveDynamicExpectedOutcomes(testCase, this.resolveExpectedOutcome),
|
|
30536
30934
|
]);
|
|
30537
30935
|
const responseTime = Date.now() - startTime;
|
|
30538
30936
|
if (llmSettled.status === 'rejected') {
|
|
30539
30937
|
this.updateTestCase(testCase.id, {
|
|
30540
30938
|
isRunning: false,
|
|
30541
|
-
output:
|
|
30939
|
+
output: undefined,
|
|
30542
30940
|
error: this.addErrorMessage(llmSettled.reason, 'Unknown error'),
|
|
30543
30941
|
responseTime,
|
|
30544
30942
|
});
|
|
@@ -30586,7 +30984,7 @@ const LLMTestRunner = class {
|
|
|
30586
30984
|
this.updateTestCase(testCase.id, {
|
|
30587
30985
|
evaluationResult: result,
|
|
30588
30986
|
});
|
|
30589
|
-
});
|
|
30987
|
+
}, this.evaluationSourceExtractors);
|
|
30590
30988
|
}
|
|
30591
30989
|
async runAllTests() {
|
|
30592
30990
|
this.isRunningAll = true;
|
|
@@ -30617,7 +31015,7 @@ const LLMTestRunner = class {
|
|
|
30617
31015
|
this.error = '';
|
|
30618
31016
|
try {
|
|
30619
31017
|
const content = await readFileAsync(file);
|
|
30620
|
-
const result = importTestSuite(content);
|
|
31018
|
+
const result = importTestSuite(content, getExtractorIds(this.evaluationSourceExtractors));
|
|
30621
31019
|
if (!result.success) {
|
|
30622
31020
|
this.error = result.error || 'Unknown error occurred during import.';
|
|
30623
31021
|
return;
|
|
@@ -30678,7 +31076,7 @@ const LLMTestRunner = class {
|
|
|
30678
31076
|
}
|
|
30679
31077
|
}
|
|
30680
31078
|
render() {
|
|
30681
|
-
return (index.h("div", { key: '
|
|
31079
|
+
return (index.h("div", { key: '7433beaa1d60d48f65600c43e11b302b892a7bca', class: "test-runner-container" }, index.h(LLMTestRunnerHeader, { key: '8083cc39376e7a710bd3f52efb184b959e885a87', isExportingTestSuite: this.isExportingTestSuite, isExportingTestResults: this.isExportingTestResults, isRunningAll: this.isRunningAll, useSave: this.useSave, isSaving: this.isSaving, usePromptEditor: this.usePromptEditor, onImport: file => this.handleImport(file), onExportSuite: () => this.handleExportTestSuite(), onExportResults: () => this.handleExportTestResults(), onRunAll: () => this.runAllTests(), onSave: () => this.handleSave() }), index.h(ErrorMessage, { key: 'ddced98c13cd595c4cfb6eef11b27cb173769518', message: this.error, onClear: () => (this.error = '') }), index.h("div", { key: '8d6f65c4d68d34869b644709eacb97fec93683c6', class: "test-runner-container__content" }, index.h(LLMTestCases, { key: '5ccb186132b23af6209209b0a14086e03cf790af', testCases: this.testCases, dynamicResolutionSupported: !!this.resolveExpectedOutcome, extractorIds: getExtractorIds(this.evaluationSourceExtractors), onRun: testCase => this.runSingleTest(testCase).catch(() => { }), onDelete: id => this.deleteTestCase(id), onAddTestCase: () => this.addNewTestCase(), handleTestCaseChange: this.handleTestCaseChange, onExpectedOutcomeChange: this.handleExpectedOutcomeChange, onChatHistoryChange: this.handleChatHistoryChange }))));
|
|
30682
31080
|
}
|
|
30683
31081
|
};
|
|
30684
31082
|
LLMTestRunner.style = tokensCss() + (llmTestRunnerCss() + (llmTestRunnerHeaderCss() + (llmTestCasesCss() + (llmTestCaseRowCss() + (rowActionsCss() + (evaluationSummaryCss() + (responseOutputCss() + (errorMessageCss() + (buttonCss() + iconButtonCss())))))))));
|