@ls-stack/agent-eval 0.28.0 → 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-mBbAN-Gt.mjs → app-CbOZBHju.mjs} +33 -6
- package/dist/apps/web/dist/assets/index-DEikHy2a.js +118 -0
- package/dist/apps/web/dist/assets/index-DjUTm3M-.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-BQwRbqsL.mjs → cli-CiFOqMwS.mjs} +893 -166
- package/dist/index.d.mts +5758 -3526
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +4 -2
- package/dist/{runOrchestration-ClWYWPen.mjs → runOrchestration-CO3Vf0cQ.mjs} +654 -34
- package/dist/{runner-BQn_xf36.mjs → runner-4pF_Qrc9.mjs} +1 -1
- package/dist/{runner-DbVB66h9.mjs → runner-CXHkf7ih.mjs} +2 -2
- package/dist/src-BiPLv9ya.mjs +3 -0
- package/package.json +4 -33
- package/skills/agent-eval/SKILL.md +63 -8
- package/dist/apps/web/dist/assets/index-8VE7b6RK.css +0 -1
- package/dist/apps/web/dist/assets/index-Czer_MdN.js +0 -118
- package/dist/src-CuirVcPY.mjs +0 -3
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
import { createRequire, registerHooks } from "node:module";
|
|
2
|
-
import { createHash } from "node:crypto";
|
|
2
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
4
4
|
import { extname, isAbsolute, join, relative, resolve } from "node:path";
|
|
5
5
|
import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
|
|
6
6
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
7
7
|
import { z, z as z$1 } from "zod/v4";
|
|
8
|
-
import { Buffer as Buffer$1 } from "node:buffer";
|
|
8
|
+
import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
|
|
9
9
|
import { gunzipSync, gzipSync } from "node:zlib";
|
|
10
10
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
11
11
|
import { existsSync } from "node:fs";
|
|
12
|
-
import { resultify } from "t-result";
|
|
12
|
+
import { Result, resultify } from "t-result";
|
|
13
13
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
14
14
|
//#region ../sdk/src/defineEval.ts
|
|
15
15
|
const evalRegistry = /* @__PURE__ */ new Map();
|
|
@@ -787,6 +787,67 @@ function evalExpect(value) {
|
|
|
787
787
|
return new EvalExpectationImpl(value, false);
|
|
788
788
|
}
|
|
789
789
|
//#endregion
|
|
790
|
+
//#region ../sdk/src/manualInputFile.ts
|
|
791
|
+
/**
|
|
792
|
+
* Zod schema describing one file uploaded through the manual-input modal.
|
|
793
|
+
*
|
|
794
|
+
* Use this as the field type on your `manualInput.schema` whenever you mark
|
|
795
|
+
* a field with `{ asFile: true }` in `manualInput.fields`. The UI / CLI stages
|
|
796
|
+
* the selected file on disk, the runner materializes it into the run artifacts
|
|
797
|
+
* directory, and the server validates this JSON metadata against the schema
|
|
798
|
+
* before flowing it into the case input.
|
|
799
|
+
*
|
|
800
|
+
* @example
|
|
801
|
+
* ```ts
|
|
802
|
+
* const schema = z.object({
|
|
803
|
+
* image: manualInputFileValueSchema,
|
|
804
|
+
* note: z.string().optional(),
|
|
805
|
+
* });
|
|
806
|
+
*
|
|
807
|
+
* defineEval({
|
|
808
|
+
* id: 'image-analyzer',
|
|
809
|
+
* manualInput: {
|
|
810
|
+
* schema,
|
|
811
|
+
* fields: { image: { asFile: true, accept: 'image/*' } },
|
|
812
|
+
* },
|
|
813
|
+
* // ...
|
|
814
|
+
* });
|
|
815
|
+
* ```
|
|
816
|
+
*/
|
|
817
|
+
const manualInputFileValueSchema = z.object({
|
|
818
|
+
name: z.string(),
|
|
819
|
+
mimeType: z.string(),
|
|
820
|
+
sizeBytes: z.number().int().nonnegative(),
|
|
821
|
+
sha256: z.string().regex(/^[a-f0-9]{64}$/),
|
|
822
|
+
path: z.string().min(1)
|
|
823
|
+
});
|
|
824
|
+
/**
|
|
825
|
+
* Read a manual-input file artifact from disk and expose common byte, Blob,
|
|
826
|
+
* File, text, and JSON views for eval code.
|
|
827
|
+
*
|
|
828
|
+
* @param value Manual-input file metadata received by an eval.
|
|
829
|
+
* @param options.cwd Directory used to resolve relative paths. Defaults to `process.cwd()`.
|
|
830
|
+
* @returns File bytes plus convenience views for common file-processing flows.
|
|
831
|
+
*/
|
|
832
|
+
async function readManualInputFile(value, options = {}) {
|
|
833
|
+
const absolutePath = resolve(options.cwd ?? process.cwd(), value.path);
|
|
834
|
+
const bytes = new Uint8Array(await readFile(absolutePath));
|
|
835
|
+
const arrayBuffer = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
|
|
836
|
+
const blob = new Blob$1([bytes], { type: value.mimeType });
|
|
837
|
+
return {
|
|
838
|
+
value,
|
|
839
|
+
absolutePath,
|
|
840
|
+
bytes,
|
|
841
|
+
arrayBuffer,
|
|
842
|
+
blob,
|
|
843
|
+
file: new File$1([bytes], value.name, { type: value.mimeType }),
|
|
844
|
+
text: async () => await blob.text(),
|
|
845
|
+
json: async () => {
|
|
846
|
+
return JSON.parse(await blob.text());
|
|
847
|
+
}
|
|
848
|
+
};
|
|
849
|
+
}
|
|
850
|
+
//#endregion
|
|
790
851
|
//#region ../sdk/src/repoFile.ts
|
|
791
852
|
/**
|
|
792
853
|
* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
|
|
@@ -2688,6 +2749,11 @@ const evalChartConfigSchema = z.object({
|
|
|
2688
2749
|
* the rendered history window.
|
|
2689
2750
|
*/
|
|
2690
2751
|
hideIfNoValue: z.boolean().optional(),
|
|
2752
|
+
/**
|
|
2753
|
+
* Drop consecutive history points whose plotted metrics and tooltip extras
|
|
2754
|
+
* have the same values as the previous kept point.
|
|
2755
|
+
*/
|
|
2756
|
+
dedupeConsecutiveValues: z.boolean().optional(),
|
|
2691
2757
|
type: evalChartTypeSchema,
|
|
2692
2758
|
/** At least one series must be declared. */
|
|
2693
2759
|
metrics: z.array(evalChartMetricSchema).min(1),
|
|
@@ -2713,6 +2779,122 @@ const evalChartConfigSchema = z.object({
|
|
|
2713
2779
|
*/
|
|
2714
2780
|
const evalChartsConfigSchema = z.array(evalChartConfigSchema);
|
|
2715
2781
|
//#endregion
|
|
2782
|
+
//#region ../shared/src/schemas/manualInput.ts
|
|
2783
|
+
/**
|
|
2784
|
+
* Common metadata shared by every manual-input field descriptor exposed to
|
|
2785
|
+
* the web UI. The runner builds these from the eval's authored Zod schema and
|
|
2786
|
+
* any per-field overrides, so the client never needs the schema itself.
|
|
2787
|
+
*/
|
|
2788
|
+
const manualInputFieldBaseSchema = z.object({
|
|
2789
|
+
/** Top-level key on the eval input object that this field writes to. */
|
|
2790
|
+
key: z.string(),
|
|
2791
|
+
/** Human-readable label rendered next to the field in the modal. */
|
|
2792
|
+
label: z.string(),
|
|
2793
|
+
/** Optional helper text rendered under the label. */
|
|
2794
|
+
description: z.string().optional(),
|
|
2795
|
+
/** Optional placeholder rendered inside the input element. */
|
|
2796
|
+
placeholder: z.string().optional(),
|
|
2797
|
+
/** Whether the field must be filled before the run can be submitted. */
|
|
2798
|
+
required: z.boolean(),
|
|
2799
|
+
/**
|
|
2800
|
+
* Default value used to prefill the field. Type matches the underlying
|
|
2801
|
+
* widget kind (`string` for text/multiline/select, `number` for number,
|
|
2802
|
+
* `boolean` for boolean, JSON-serialisable for `json`).
|
|
2803
|
+
*/
|
|
2804
|
+
defaultValue: z.unknown().optional()
|
|
2805
|
+
});
|
|
2806
|
+
/** One option rendered by the `select` widget. */
|
|
2807
|
+
const manualInputSelectOptionSchema = z.object({
|
|
2808
|
+
value: z.string(),
|
|
2809
|
+
label: z.string()
|
|
2810
|
+
});
|
|
2811
|
+
/** Single line text widget descriptor. */
|
|
2812
|
+
const manualInputTextFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2813
|
+
kind: z.literal("text"),
|
|
2814
|
+
/** Optional minimum character length enforced client-side. */
|
|
2815
|
+
minLength: z.number().int().min(0).optional(),
|
|
2816
|
+
/** Optional maximum character length enforced client-side. */
|
|
2817
|
+
maxLength: z.number().int().min(0).optional()
|
|
2818
|
+
});
|
|
2819
|
+
/** Multi-line textarea widget descriptor. */
|
|
2820
|
+
const manualInputMultilineFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2821
|
+
kind: z.literal("multiline"),
|
|
2822
|
+
/** Optional minimum character length enforced client-side. */
|
|
2823
|
+
minLength: z.number().int().min(0).optional(),
|
|
2824
|
+
/** Optional maximum character length enforced client-side. */
|
|
2825
|
+
maxLength: z.number().int().min(0).optional(),
|
|
2826
|
+
/** Suggested number of visible textarea rows; UI may clamp this. */
|
|
2827
|
+
rows: z.number().int().min(1).optional()
|
|
2828
|
+
});
|
|
2829
|
+
/** Numeric input widget descriptor. */
|
|
2830
|
+
const manualInputNumberFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2831
|
+
kind: z.literal("number"),
|
|
2832
|
+
/** Optional inclusive lower bound. */
|
|
2833
|
+
min: z.number().optional(),
|
|
2834
|
+
/** Optional inclusive upper bound. */
|
|
2835
|
+
max: z.number().optional(),
|
|
2836
|
+
/** Optional UI step increment. */
|
|
2837
|
+
step: z.number().positive().optional(),
|
|
2838
|
+
/** Whether the value must be an integer. */
|
|
2839
|
+
integer: z.boolean().optional()
|
|
2840
|
+
});
|
|
2841
|
+
/** Boolean checkbox/toggle widget descriptor. */
|
|
2842
|
+
const manualInputBooleanFieldSchema = manualInputFieldBaseSchema.extend({ kind: z.literal("boolean") });
|
|
2843
|
+
/** Single-select dropdown widget descriptor. */
|
|
2844
|
+
const manualInputSelectFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2845
|
+
kind: z.literal("select"),
|
|
2846
|
+
options: z.array(manualInputSelectOptionSchema)
|
|
2847
|
+
});
|
|
2848
|
+
/** JSON textarea widget descriptor used for nested objects, arrays, and unions. */
|
|
2849
|
+
const manualInputJsonFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2850
|
+
kind: z.literal("json"),
|
|
2851
|
+
/** Suggested number of visible textarea rows; UI may clamp this. */
|
|
2852
|
+
rows: z.number().int().min(1).optional()
|
|
2853
|
+
});
|
|
2854
|
+
/**
|
|
2855
|
+
* File / image upload widget descriptor. The widget supports clicking to
|
|
2856
|
+
* pick a file, drag-and-drop onto the dropzone, and pasting an image from
|
|
2857
|
+
* the system clipboard. The submitted value references a staged file artifact.
|
|
2858
|
+
*/
|
|
2859
|
+
const manualInputFileFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2860
|
+
kind: z.literal("file"),
|
|
2861
|
+
/**
|
|
2862
|
+
* Browser `accept` attribute (e.g. `image/*`, `image/png,image/jpeg`,
|
|
2863
|
+
* `.pdf`). When omitted the picker accepts any file type.
|
|
2864
|
+
*/
|
|
2865
|
+
accept: z.string().optional(),
|
|
2866
|
+
/** Optional client-side maximum file size in bytes. */
|
|
2867
|
+
maxSizeBytes: z.number().int().positive().optional()
|
|
2868
|
+
});
|
|
2869
|
+
/**
|
|
2870
|
+
* Discriminated union of all supported manual-input widget kinds. The web UI
|
|
2871
|
+
* dispatches to the matching field component based on `kind`.
|
|
2872
|
+
*/
|
|
2873
|
+
const manualInputFieldDescriptorSchema = z.discriminatedUnion("kind", [
|
|
2874
|
+
manualInputTextFieldSchema,
|
|
2875
|
+
manualInputMultilineFieldSchema,
|
|
2876
|
+
manualInputNumberFieldSchema,
|
|
2877
|
+
manualInputBooleanFieldSchema,
|
|
2878
|
+
manualInputSelectFieldSchema,
|
|
2879
|
+
manualInputJsonFieldSchema,
|
|
2880
|
+
manualInputFileFieldSchema
|
|
2881
|
+
]);
|
|
2882
|
+
/**
|
|
2883
|
+
* Wire-format descriptor attached to an `EvalSummary` when the eval declares
|
|
2884
|
+
* `manualInput`. Carries the ordered list of fields the modal renders and
|
|
2885
|
+
* basic context shown in the modal header.
|
|
2886
|
+
*/
|
|
2887
|
+
const manualInputDescriptorSchema = z.object({
|
|
2888
|
+
/** Optional title shown in the modal header. Defaults to the eval title. */
|
|
2889
|
+
title: z.string().optional(),
|
|
2890
|
+
/** Optional helper text shown above the form. */
|
|
2891
|
+
description: z.string().optional(),
|
|
2892
|
+
/** Optional submit button label. Defaults to `Run`. */
|
|
2893
|
+
submitLabel: z.string().optional(),
|
|
2894
|
+
/** Ordered list of fields rendered in the modal. */
|
|
2895
|
+
fields: z.array(manualInputFieldDescriptorSchema)
|
|
2896
|
+
});
|
|
2897
|
+
//#endregion
|
|
2716
2898
|
//#region ../shared/src/schemas/eval.ts
|
|
2717
2899
|
/** Freshness signal derived from the latest relevant run plus git state. */
|
|
2718
2900
|
const evalFreshnessStatusSchema = z.enum([
|
|
@@ -2810,7 +2992,13 @@ const evalSummarySchema = z.object({
|
|
|
2810
2992
|
* Ordered per-eval history chart configuration for the EvalCard. Opt-in:
|
|
2811
2993
|
* when omitted or empty, the UI renders no history chart at all.
|
|
2812
2994
|
*/
|
|
2813
|
-
charts: evalChartsConfigSchema.optional()
|
|
2995
|
+
charts: evalChartsConfigSchema.optional(),
|
|
2996
|
+
/**
|
|
2997
|
+
* Manual-input form descriptor when the eval declares `manualInput`. The
|
|
2998
|
+
* web UI renders these fields in a modal before kicking off a run; the
|
|
2999
|
+
* runner consumes the validated values as the case input.
|
|
3000
|
+
*/
|
|
3001
|
+
manualInput: manualInputDescriptorSchema.optional()
|
|
2814
3002
|
});
|
|
2815
3003
|
/** Schema for one case row in an eval run result table. */
|
|
2816
3004
|
const caseRowSchema = z.object({
|
|
@@ -2950,7 +3138,7 @@ const caseDetailSchema = z.object({
|
|
|
2950
3138
|
});
|
|
2951
3139
|
/** Schema for discovery problems that should be shown before running evals. */
|
|
2952
3140
|
const discoveryIssueSchema = z.object({
|
|
2953
|
-
type: z.enum(["duplicate-eval-id"]),
|
|
3141
|
+
type: z.enum(["duplicate-eval-id", "manual-input-with-cases"]),
|
|
2954
3142
|
severity: z.enum(["error"]),
|
|
2955
3143
|
filePath: z.string(),
|
|
2956
3144
|
evalId: z.string(),
|
|
@@ -3031,6 +3219,8 @@ const llmCallMetricPlacementSchema = z.enum(["header", "body"]);
|
|
|
3031
3219
|
/** Where an API-call metric is rendered inside the API calls tab. */
|
|
3032
3220
|
const apiCallMetricPlacementSchema = llmCallMetricPlacementSchema;
|
|
3033
3221
|
const callDerivedAttributeSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attribute function" });
|
|
3222
|
+
const callDerivedAttributesFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attributes function" });
|
|
3223
|
+
const callDerivedAttributesConfigSchema = z.union([z.record(z.string().min(1), callDerivedAttributeSchema), callDerivedAttributesFnSchema]);
|
|
3034
3224
|
/**
|
|
3035
3225
|
* Schema for a single user-defined metric attached to LLM call rows.
|
|
3036
3226
|
*
|
|
@@ -3157,10 +3347,11 @@ const llmCallsConfigSchema = z.object({
|
|
|
3157
3347
|
/**
|
|
3158
3348
|
* Derived attributes persisted onto every matching LLM span before
|
|
3159
3349
|
* `deriveFromTracing`, default outputs, trace display, and call metrics read
|
|
3160
|
-
* the trace.
|
|
3161
|
-
*
|
|
3350
|
+
* the trace. Use a keyed map for one-off fields, or one callback returning a
|
|
3351
|
+
* path/value object for multiple fields. Keys are dot-paths under
|
|
3352
|
+
* `span.attributes`; return `undefined` to skip one span or one returned key.
|
|
3162
3353
|
*/
|
|
3163
|
-
derivedAttributes:
|
|
3354
|
+
derivedAttributes: callDerivedAttributesConfigSchema.optional(),
|
|
3164
3355
|
/**
|
|
3165
3356
|
* Model-keyed pricing registry used to calculate LLM-call costs from token
|
|
3166
3357
|
* counts. Built-in LLM cost fields are only derived from this registry.
|
|
@@ -3192,11 +3383,12 @@ const apiCallsConfigSchema = z.object({
|
|
|
3192
3383
|
}).optional(),
|
|
3193
3384
|
/**
|
|
3194
3385
|
* Derived attributes persisted onto every matching API span before trace
|
|
3195
|
-
* display and call metrics read the trace.
|
|
3196
|
-
*
|
|
3197
|
-
*
|
|
3386
|
+
* display and call metrics read the trace. Use a keyed map for one-off
|
|
3387
|
+
* fields, or one callback returning a path/value object for multiple fields.
|
|
3388
|
+
* Keys are dot-paths under `span.attributes`; return `undefined` to skip one
|
|
3389
|
+
* span or one returned key.
|
|
3198
3390
|
*/
|
|
3199
|
-
derivedAttributes:
|
|
3391
|
+
derivedAttributes: callDerivedAttributesConfigSchema.optional(),
|
|
3200
3392
|
/** Custom user-defined metrics surfaced on each API call. */
|
|
3201
3393
|
metrics: z.array(apiCallMetricSchema).optional()
|
|
3202
3394
|
});
|
|
@@ -3256,7 +3448,9 @@ const DEFAULT_API_CALLS_CONFIG = {
|
|
|
3256
3448
|
metrics: []
|
|
3257
3449
|
};
|
|
3258
3450
|
function resolveDerivedAttributes(input) {
|
|
3259
|
-
|
|
3451
|
+
if (input === void 0) return [];
|
|
3452
|
+
if (typeof input === "function") return [{ computeMany: input }];
|
|
3453
|
+
return Object.entries(input).map(([path, compute]) => ({
|
|
3260
3454
|
path,
|
|
3261
3455
|
compute
|
|
3262
3456
|
}));
|
|
@@ -3621,11 +3815,31 @@ function mergeNestedAttribute$1(value, path, attributeValue) {
|
|
|
3621
3815
|
function applyDerivedAttributesForKind(params) {
|
|
3622
3816
|
let attributes = params.span.attributes;
|
|
3623
3817
|
for (const derivedAttribute of params.derivedAttributes) {
|
|
3624
|
-
if (derivedAttribute.compute === void 0) continue;
|
|
3625
3818
|
const span = {
|
|
3626
3819
|
...params.span,
|
|
3627
3820
|
attributes
|
|
3628
3821
|
};
|
|
3822
|
+
if (derivedAttribute.computeMany !== void 0) {
|
|
3823
|
+
const values = (() => {
|
|
3824
|
+
try {
|
|
3825
|
+
return derivedAttribute.computeMany({
|
|
3826
|
+
attributes,
|
|
3827
|
+
span,
|
|
3828
|
+
get: (path) => getNestedAttribute(attributes, path)
|
|
3829
|
+
});
|
|
3830
|
+
} catch {
|
|
3831
|
+
return;
|
|
3832
|
+
}
|
|
3833
|
+
})();
|
|
3834
|
+
if (!isRecord$3(values)) continue;
|
|
3835
|
+
for (const [path, value] of Object.entries(values)) {
|
|
3836
|
+
if (value === void 0) continue;
|
|
3837
|
+
attributes = mergeNestedAttribute$1(attributes, path, value);
|
|
3838
|
+
}
|
|
3839
|
+
continue;
|
|
3840
|
+
}
|
|
3841
|
+
if (derivedAttribute.path === void 0) continue;
|
|
3842
|
+
if (derivedAttribute.compute === void 0) continue;
|
|
3629
3843
|
const value = (() => {
|
|
3630
3844
|
try {
|
|
3631
3845
|
return derivedAttribute.compute({
|
|
@@ -4128,6 +4342,7 @@ function isCacheHitEntry(entry) {
|
|
|
4128
4342
|
}
|
|
4129
4343
|
z.enum([
|
|
4130
4344
|
"discovery.updated",
|
|
4345
|
+
"config.reload",
|
|
4131
4346
|
"run.started",
|
|
4132
4347
|
"run.summary",
|
|
4133
4348
|
"case.started",
|
|
@@ -4147,6 +4362,19 @@ const sseEnvelopeSchema = z.object({
|
|
|
4147
4362
|
});
|
|
4148
4363
|
//#endregion
|
|
4149
4364
|
//#region ../shared/src/schemas/api.ts
|
|
4365
|
+
/** Lifecycle state for an app config reload triggered by `agent-evals.config.ts`. */
|
|
4366
|
+
const configReloadStatusSchema = z.enum([
|
|
4367
|
+
"idle",
|
|
4368
|
+
"pending",
|
|
4369
|
+
"reloading"
|
|
4370
|
+
]);
|
|
4371
|
+
/** UI/API-visible state for config reloads in `agent-evals app`. */
|
|
4372
|
+
const configReloadStateSchema = z.object({
|
|
4373
|
+
status: configReloadStatusSchema,
|
|
4374
|
+
activeRunCount: z.number().int().min(0),
|
|
4375
|
+
lastChangedAt: z.string().nullable(),
|
|
4376
|
+
lastReloadedAt: z.string().nullable()
|
|
4377
|
+
});
|
|
4150
4378
|
/** Schema for the API request that starts a new eval run. */
|
|
4151
4379
|
const createRunRequestSchema = z.object({
|
|
4152
4380
|
target: z.object({
|
|
@@ -4167,7 +4395,14 @@ const createRunRequestSchema = z.object({
|
|
|
4167
4395
|
* Optional cache controls for the run. When omitted, the cache is used in
|
|
4168
4396
|
* its default read-through / write-on-miss mode.
|
|
4169
4397
|
*/
|
|
4170
|
-
cache: z.object({ mode: cacheModeSchema.default("use") }).optional()
|
|
4398
|
+
cache: z.object({ mode: cacheModeSchema.default("use") }).optional(),
|
|
4399
|
+
/**
|
|
4400
|
+
* Manual-input values keyed by eval `key` (workspace-relative file path
|
|
4401
|
+
* plus authored eval id). Required for any targeted eval that declares
|
|
4402
|
+
* `manualInput` in its definition; the server validates each entry against
|
|
4403
|
+
* the eval's authored Zod schema before starting the run.
|
|
4404
|
+
*/
|
|
4405
|
+
manualInputs: z.record(z.string(), z.unknown()).optional()
|
|
4171
4406
|
});
|
|
4172
4407
|
/** Schema for updating a UI-authored manual score on one persisted case. */
|
|
4173
4408
|
const updateManualScoreRequestSchema = z.object({ value: z.number().min(0).max(1).nullable() });
|
|
@@ -4819,7 +5054,9 @@ async function loadConfig() {
|
|
|
4819
5054
|
const configPath = resolve(process.cwd(), "agent-evals.config.ts");
|
|
4820
5055
|
if (!existsSync(configPath)) return defaultConfig;
|
|
4821
5056
|
try {
|
|
4822
|
-
const
|
|
5057
|
+
const configUrl = pathToFileURL(configPath);
|
|
5058
|
+
configUrl.searchParams.set("v", randomUUID());
|
|
5059
|
+
const imported = await import(configUrl.href);
|
|
4823
5060
|
const configModule = configModuleSchema.parse(imported);
|
|
4824
5061
|
const userConfig = configModule.default ?? configModule.config;
|
|
4825
5062
|
if (!userConfig) return defaultConfig;
|
|
@@ -4997,6 +5234,7 @@ function appendDefaultCharts(params) {
|
|
|
4997
5234
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4998
5235
|
heading: "LLM Cost",
|
|
4999
5236
|
hideIfNoValue: true,
|
|
5237
|
+
dedupeConsecutiveValues: true,
|
|
5000
5238
|
type: "area",
|
|
5001
5239
|
metrics: [{
|
|
5002
5240
|
source: "column",
|
|
@@ -5006,7 +5244,7 @@ function appendDefaultCharts(params) {
|
|
|
5006
5244
|
color: "warning"
|
|
5007
5245
|
}]
|
|
5008
5246
|
});
|
|
5009
|
-
const
|
|
5247
|
+
const inputTokenMetrics = [
|
|
5010
5248
|
activeKeys.has("inputTokens") ? {
|
|
5011
5249
|
source: "column",
|
|
5012
5250
|
key: "inputTokens",
|
|
@@ -5014,13 +5252,6 @@ function appendDefaultCharts(params) {
|
|
|
5014
5252
|
label: "Input",
|
|
5015
5253
|
color: "accent"
|
|
5016
5254
|
} : null,
|
|
5017
|
-
activeKeys.has("outputTokens") ? {
|
|
5018
|
-
source: "column",
|
|
5019
|
-
key: "outputTokens",
|
|
5020
|
-
aggregate: "avg",
|
|
5021
|
-
label: "Output",
|
|
5022
|
-
color: "success"
|
|
5023
|
-
} : null,
|
|
5024
5255
|
activeKeys.has("cachedInputTokens") ? {
|
|
5025
5256
|
source: "column",
|
|
5026
5257
|
key: "cachedInputTokens",
|
|
@@ -5036,17 +5267,25 @@ function appendDefaultCharts(params) {
|
|
|
5036
5267
|
color: "warning"
|
|
5037
5268
|
} : null
|
|
5038
5269
|
].filter((metric) => metric !== null);
|
|
5039
|
-
if (
|
|
5040
|
-
heading: "LLM Tokens",
|
|
5270
|
+
if (inputTokenMetrics.length > 0) defaults.push({
|
|
5271
|
+
heading: "LLM Input Tokens",
|
|
5041
5272
|
hideIfNoValue: true,
|
|
5273
|
+
dedupeConsecutiveValues: true,
|
|
5042
5274
|
type: "bar",
|
|
5043
|
-
metrics:
|
|
5044
|
-
|
|
5275
|
+
metrics: inputTokenMetrics
|
|
5276
|
+
});
|
|
5277
|
+
if (activeKeys.has("outputTokens")) defaults.push({
|
|
5278
|
+
heading: "LLM Output Tokens",
|
|
5279
|
+
hideIfNoValue: true,
|
|
5280
|
+
dedupeConsecutiveValues: true,
|
|
5281
|
+
type: "bar",
|
|
5282
|
+
metrics: [{
|
|
5045
5283
|
source: "column",
|
|
5046
|
-
key: "
|
|
5284
|
+
key: "outputTokens",
|
|
5047
5285
|
aggregate: "avg",
|
|
5048
|
-
label: "
|
|
5049
|
-
|
|
5286
|
+
label: "Output",
|
|
5287
|
+
color: "success"
|
|
5288
|
+
}]
|
|
5050
5289
|
});
|
|
5051
5290
|
const merged = [...params.charts ?? [], ...defaults];
|
|
5052
5291
|
return merged.length > 0 ? merged : void 0;
|
|
@@ -5316,6 +5555,371 @@ function getRunFreshnessTimestamp(manifest) {
|
|
|
5316
5555
|
return manifest.endedAt ?? manifest.startedAt;
|
|
5317
5556
|
}
|
|
5318
5557
|
//#endregion
|
|
5558
|
+
//#region ../runner/src/manualInput/walker.ts
|
|
5559
|
+
function isObject(value) {
|
|
5560
|
+
return typeof value === "object" && value !== null;
|
|
5561
|
+
}
|
|
5562
|
+
function getZodDef(schema) {
|
|
5563
|
+
if (!isObject(schema)) return null;
|
|
5564
|
+
const zodHolder = schema._zod;
|
|
5565
|
+
if (!isObject(zodHolder)) return null;
|
|
5566
|
+
const def = zodHolder.def;
|
|
5567
|
+
if (!isObject(def)) return null;
|
|
5568
|
+
if (typeof def.type !== "string") return null;
|
|
5569
|
+
return {
|
|
5570
|
+
...def,
|
|
5571
|
+
type: def.type
|
|
5572
|
+
};
|
|
5573
|
+
}
|
|
5574
|
+
function getDescription(schema) {
|
|
5575
|
+
if (!isObject(schema)) return void 0;
|
|
5576
|
+
const description = schema.description;
|
|
5577
|
+
return typeof description === "string" ? description : void 0;
|
|
5578
|
+
}
|
|
5579
|
+
function getInnerSchema(def) {
|
|
5580
|
+
return def.innerType;
|
|
5581
|
+
}
|
|
5582
|
+
function getChecks(def) {
|
|
5583
|
+
const checks = def.checks;
|
|
5584
|
+
if (!Array.isArray(checks)) return [];
|
|
5585
|
+
const out = [];
|
|
5586
|
+
for (const check of checks) {
|
|
5587
|
+
if (!isObject(check)) continue;
|
|
5588
|
+
const zodHolder = check._zod;
|
|
5589
|
+
if (!isObject(zodHolder)) continue;
|
|
5590
|
+
const checkDef = zodHolder.def;
|
|
5591
|
+
if (!isObject(checkDef)) continue;
|
|
5592
|
+
if (typeof checkDef.check !== "string") continue;
|
|
5593
|
+
out.push({
|
|
5594
|
+
...checkDef,
|
|
5595
|
+
check: checkDef.check
|
|
5596
|
+
});
|
|
5597
|
+
}
|
|
5598
|
+
return out;
|
|
5599
|
+
}
|
|
5600
|
+
function findCheck(checks, name) {
|
|
5601
|
+
return checks.find((check) => check.check === name);
|
|
5602
|
+
}
|
|
5603
|
+
function unwrap(schema) {
|
|
5604
|
+
let current = schema;
|
|
5605
|
+
let required = true;
|
|
5606
|
+
let defaultValue = void 0;
|
|
5607
|
+
for (let depth = 0; depth < 8; depth += 1) {
|
|
5608
|
+
const def = getZodDef(current);
|
|
5609
|
+
if (!def) return null;
|
|
5610
|
+
if (def.type === "optional" || def.type === "nullable") {
|
|
5611
|
+
required = false;
|
|
5612
|
+
current = getInnerSchema(def);
|
|
5613
|
+
continue;
|
|
5614
|
+
}
|
|
5615
|
+
if (def.type === "nullish") {
|
|
5616
|
+
required = false;
|
|
5617
|
+
current = getInnerSchema(def);
|
|
5618
|
+
continue;
|
|
5619
|
+
}
|
|
5620
|
+
if (def.type === "default" || def.type === "prefault") {
|
|
5621
|
+
const raw = def.defaultValue;
|
|
5622
|
+
if (typeof raw === "function") defaultValue = Reflect.apply(raw, void 0, []);
|
|
5623
|
+
else defaultValue = raw;
|
|
5624
|
+
current = getInnerSchema(def);
|
|
5625
|
+
continue;
|
|
5626
|
+
}
|
|
5627
|
+
if (def.type === "readonly" || def.type === "pipe") {
|
|
5628
|
+
current = getInnerSchema(def) ?? def.in;
|
|
5629
|
+
continue;
|
|
5630
|
+
}
|
|
5631
|
+
return {
|
|
5632
|
+
schema: current,
|
|
5633
|
+
def,
|
|
5634
|
+
required,
|
|
5635
|
+
defaultValue
|
|
5636
|
+
};
|
|
5637
|
+
}
|
|
5638
|
+
return null;
|
|
5639
|
+
}
|
|
5640
|
+
function humaniseKey(key) {
|
|
5641
|
+
const spaced = key.replace(/([a-z0-9])([A-Z])/g, "$1 $2").replace(/[_-]+/g, " ").trim();
|
|
5642
|
+
if (!spaced) return key;
|
|
5643
|
+
const lowered = spaced.toLowerCase();
|
|
5644
|
+
return lowered.charAt(0).toUpperCase() + lowered.slice(1);
|
|
5645
|
+
}
|
|
5646
|
+
function normaliseSelectOptions(raw) {
|
|
5647
|
+
if (!raw) return void 0;
|
|
5648
|
+
return raw.map((entry) => {
|
|
5649
|
+
if (typeof entry === "string") return {
|
|
5650
|
+
value: entry,
|
|
5651
|
+
label: entry
|
|
5652
|
+
};
|
|
5653
|
+
return {
|
|
5654
|
+
value: entry.value,
|
|
5655
|
+
label: entry.label ?? entry.value
|
|
5656
|
+
};
|
|
5657
|
+
});
|
|
5658
|
+
}
|
|
5659
|
+
function enumOptionsFromEntries(def) {
|
|
5660
|
+
const entries = def.entries;
|
|
5661
|
+
if (!isObject(entries)) return null;
|
|
5662
|
+
const out = [];
|
|
5663
|
+
for (const [label, value] of Object.entries(entries)) if (typeof value === "string") out.push({
|
|
5664
|
+
value,
|
|
5665
|
+
label
|
|
5666
|
+
});
|
|
5667
|
+
else if (typeof value === "number") out.push({
|
|
5668
|
+
value: String(value),
|
|
5669
|
+
label
|
|
5670
|
+
});
|
|
5671
|
+
else return null;
|
|
5672
|
+
return out;
|
|
5673
|
+
}
|
|
5674
|
+
function literalUnionOptions(def) {
|
|
5675
|
+
const options = def.options;
|
|
5676
|
+
if (!Array.isArray(options)) return null;
|
|
5677
|
+
const out = [];
|
|
5678
|
+
for (const option of options) {
|
|
5679
|
+
const optDef = getZodDef(option);
|
|
5680
|
+
if (optDef?.type !== "literal") return null;
|
|
5681
|
+
const values = optDef.values;
|
|
5682
|
+
if (!Array.isArray(values) || values.length !== 1) return null;
|
|
5683
|
+
const value = values[0];
|
|
5684
|
+
if (typeof value === "string") out.push({
|
|
5685
|
+
value,
|
|
5686
|
+
label: value
|
|
5687
|
+
});
|
|
5688
|
+
else if (typeof value === "number") {
|
|
5689
|
+
const stringValue = String(value);
|
|
5690
|
+
out.push({
|
|
5691
|
+
value: stringValue,
|
|
5692
|
+
label: stringValue
|
|
5693
|
+
});
|
|
5694
|
+
} else return null;
|
|
5695
|
+
}
|
|
5696
|
+
return out.length > 0 ? out : null;
|
|
5697
|
+
}
|
|
5698
|
+
function literalSelectOptions(def) {
|
|
5699
|
+
const values = def.values;
|
|
5700
|
+
if (!Array.isArray(values)) return null;
|
|
5701
|
+
const out = [];
|
|
5702
|
+
for (const value of values) if (typeof value === "string") out.push({
|
|
5703
|
+
value,
|
|
5704
|
+
label: value
|
|
5705
|
+
});
|
|
5706
|
+
else if (typeof value === "number") {
|
|
5707
|
+
const stringValue = String(value);
|
|
5708
|
+
out.push({
|
|
5709
|
+
value: stringValue,
|
|
5710
|
+
label: stringValue
|
|
5711
|
+
});
|
|
5712
|
+
} else return null;
|
|
5713
|
+
return out;
|
|
5714
|
+
}
|
|
5715
|
+
function readStringChecks(def) {
|
|
5716
|
+
const checks = getChecks(def);
|
|
5717
|
+
const out = {};
|
|
5718
|
+
const min = findCheck(checks, "min_length");
|
|
5719
|
+
if (min && typeof min.minimum === "number") out.minLength = min.minimum;
|
|
5720
|
+
const max = findCheck(checks, "max_length");
|
|
5721
|
+
if (max && typeof max.maximum === "number") out.maxLength = max.maximum;
|
|
5722
|
+
return out;
|
|
5723
|
+
}
|
|
5724
|
+
const integerNumberFormats = new Set([
|
|
5725
|
+
"int",
|
|
5726
|
+
"safeint",
|
|
5727
|
+
"int32",
|
|
5728
|
+
"uint32",
|
|
5729
|
+
"int64",
|
|
5730
|
+
"uint64"
|
|
5731
|
+
]);
|
|
5732
|
+
function readNumberChecks(def) {
|
|
5733
|
+
const checks = getChecks(def);
|
|
5734
|
+
const out = {};
|
|
5735
|
+
const gt = findCheck(checks, "greater_than");
|
|
5736
|
+
if (gt && typeof gt.value === "number" && gt.inclusive === true) out.min = gt.value;
|
|
5737
|
+
const lt = findCheck(checks, "less_than");
|
|
5738
|
+
if (lt && typeof lt.value === "number" && lt.inclusive === true) out.max = lt.value;
|
|
5739
|
+
const format = findCheck(checks, "number_format");
|
|
5740
|
+
if (format && typeof format.format === "string" && integerNumberFormats.has(format.format)) out.integer = true;
|
|
5741
|
+
return out;
|
|
5742
|
+
}
|
|
5743
|
+
/**
 * Translate one schema field (plus its optional UI override) into a field
 * descriptor wrapped in a `Result`.
 *
 * Widget selection, in priority order:
 *   1. `override.asJson === true`  -> "json"
 *   2. `override.asFile === true`  -> "file"
 *   3. usable `override.options`   -> "select"
 *   4. the unwrapped definition's `type`:
 *      string -> "text" (or "multiline" when `override.multiline === true`),
 *      number / int / bigint -> "number", boolean -> "boolean",
 *      enum / literal / union -> "select" when options can be derived,
 *      anything else (or no derivable options) -> "json".
 *
 * @param {string} key Field name inside the object shape.
 * @param {unknown} fieldSchema Schema for the field; passed to `unwrap`.
 * @param {object} [override] Per-field presentation override.
 * @returns A `Result` holding the descriptor, or an `Error` result when
 *   `unwrap` cannot introspect the schema.
 */
function buildField(key, fieldSchema, override) {
	const introspected = unwrap(fieldSchema);
	if (!introspected) {
		const reason = `manualInput: field "${key}" uses an unsupported Zod schema (could not introspect)`;
		return Result.err(/* @__PURE__ */ new Error(reason));
	}
	const inner = introspected.def;

	// Explicit override values win; helpers are only consulted as fallbacks.
	const description = override?.description ?? getDescription(introspected.schema);
	const label = override?.label ?? humaniseKey(key);
	const overriddenDefault = override?.defaultValue;
	const common = {
		key,
		label,
		description,
		placeholder: override?.placeholder,
		required: introspected.required,
		defaultValue: overriddenDefault !== void 0 ? overriddenDefault : introspected.defaultValue
	};

	// Small builders so every branch emits `{...common, kind, ...extras}`.
	const asField = (kind, extras) => Result.ok({
		...common,
		kind,
		...extras
	});
	const asJsonField = () => asField("json", { rows: override?.rows });
	const asSelectField = (options) => asField("select", { options });

	if (override?.asJson === true) return asJsonField();
	if (override?.asFile === true) {
		return asField("file", {
			accept: override.accept,
			maxSizeBytes: override.maxSizeBytes
		});
	}
	const explicitOptions = normaliseSelectOptions(override?.options);
	if (explicitOptions) return asSelectField(explicitOptions);

	const type = inner.type;
	if (type === "string") {
		const { minLength, maxLength } = readStringChecks(inner);
		if (override?.multiline === true) {
			return asField("multiline", {
				rows: override.rows,
				minLength,
				maxLength
			});
		}
		return asField("text", {
			minLength,
			maxLength
		});
	}
	if (type === "number" || type === "int" || type === "bigint") {
		const { min, max, integer } = readNumberChecks(inner);
		return asField("number", {
			min,
			max,
			integer
		});
	}
	if (type === "boolean") return asField("boolean");
	if (type === "enum") {
		const enumOptions = enumOptionsFromEntries(inner);
		return enumOptions ? asSelectField(enumOptions) : asJsonField();
	}
	if (type === "literal") {
		const literalOptions = literalSelectOptions(inner);
		// An empty option list is useless as a select; fall back to JSON.
		return literalOptions && literalOptions.length > 0 ? asSelectField(literalOptions) : asJsonField();
	}
	if (type === "union") {
		const unionOptions = literalUnionOptions(inner);
		return unionOptions ? asSelectField(unionOptions) : asJsonField();
	}
	return asJsonField();
}
|
|
5855
|
+
/**
 * Return the `shape` of a schema whose definition has `type === "object"`.
 *
 * @param {unknown} schema Schema to inspect via `getZodDef`.
 * @returns {object | null} The shape, or `null` when no definition is found,
 *   the definition is not an object type, or `shape` fails `isObject`.
 */
function getObjectShape(schema) {
	const schemaDef = getZodDef(schema);
	if (!schemaDef || schemaDef.type !== "object") return null;
	const candidate = schemaDef.shape;
	return isObject(candidate) ? candidate : null;
}
|
|
5863
|
+
/**
 * Turn an eval's `manualInput` configuration into the descriptor consumed by
 * the web UI: `{ title, description, submitLabel, fields }`.
 *
 * `config.schema` has to resolve to a top-level `z.object(...)`. Each entry of
 * its shape is converted with `buildField`, paired with the matching truthy
 * entry of `config.fields` (falsy override entries are ignored). Field shapes
 * that have no dedicated widget are rendered by `buildField` as the JSON
 * textarea.
 *
 * The outcome is a `Result` rather than a thrown error, so eval discovery can
 * report an incompatible schema as a discovery issue. Conversion stops at the
 * first field that fails.
 *
 * @param {object} config The eval's `manualInput` configuration.
 * @returns A `Result` with the descriptor, or with the first error hit.
 */
function buildManualInputDescriptor(config) {
	const objectShape = getObjectShape(config.schema);
	if (!objectShape) {
		return Result.err(/* @__PURE__ */ new Error("manualInput.schema must be a top-level z.object(...). Wrap nested types in an object schema."));
	}

	// Keep only the overrides that are actually set.
	const activeOverrides = {};
	for (const [fieldKey, fieldOverride] of Object.entries(config.fields || {})) {
		if (fieldOverride) activeOverrides[fieldKey] = fieldOverride;
	}

	const descriptors = [];
	for (const [fieldKey, fieldSchema] of Object.entries(objectShape)) {
		const built = buildField(fieldKey, fieldSchema, activeOverrides[fieldKey]);
		if (built.error) return built.errorResult();
		descriptors.push(built.value);
	}

	const { title, description, submitLabel } = config;
	return Result.ok({
		title,
		description,
		submitLabel,
		fields: descriptors
	});
}
|
|
5893
|
+
/**
 * Validate a raw manual-input submission against the eval's
 * `manualInput.schema`.
 *
 * @param {object} config The eval's `manualInput` configuration; its
 *   `schema.safeParse` performs the validation.
 * @param {unknown} raw Submitted values, unvalidated.
 * @returns A `Result` holding the parsed data on success, or a
 *   `ManualInputValidationError` built from the schema issues (each passed
 *   through `formatIssue`) on failure.
 */
function parseManualInputValues(config, raw) {
	const outcome = config.schema.safeParse(raw);
	if (outcome.success) return Result.ok(outcome.data);
	const formattedIssues = outcome.error.issues.map(formatIssue);
	return Result.err(new ManualInputValidationError(formattedIssues));
}
|
|
5903
|
+
/**
 * Failure produced when manual-input values do not satisfy the eval's
 * `manualInput.schema`. The formatted issues are kept on `issues` so the CLI
 * and HTTP layers can report them per field; the message joins them as
 * `path: message` pairs (or just `message` when the path is empty).
 */
var ManualInputValidationError = class extends Error {
	/** Formatted `{ path, message }` entries this error was built from. */
	issues;
	/**
	 * @param {Array<{path: string, message: string}>} issues Formatted issues;
	 *   an empty list yields the bare "manualInput validation failed" message.
	 */
	constructor(issues) {
		let summary = "manualInput validation failed";
		if (issues.length !== 0) {
			const describe = (issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message;
			summary = `manualInput validation failed: ${issues.map(describe).join("; ")}`;
		}
		super(summary);
		this.name = "ManualInputValidationError";
		this.issues = issues;
	}
};
|
|
5916
|
+
/**
 * Reduce a schema issue to `{ path, message }`, with the path flattened to a
 * dot-separated string.
 *
 * Only string and number path segments are kept; other segment types and
 * empty strings are dropped, so an issue at the root yields `path: ""`.
 *
 * @param {{path: Array<unknown>, message: string}} issue Issue to format.
 * @returns {{path: string, message: string}} Flattened issue.
 */
function formatIssue(issue) {
	const isPrintable = (segment) => typeof segment === "number" || typeof segment === "string" && segment !== "";
	const dottedPath = issue.path.filter(isPrintable).map((segment) => String(segment)).join(".");
	return {
		path: dottedPath,
		message: issue.message
	};
}
|
|
5922
|
+
//#endregion
|
|
5319
5923
|
//#region ../runner/src/outputArtifacts.ts
|
|
5320
5924
|
const mimeTypeExtensionMap = {
|
|
5321
5925
|
"application/json": ".json",
|
|
@@ -6347,8 +6951,24 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6347
6951
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
6348
6952
|
await runInEvalRuntimeScope("cases", async () => {
|
|
6349
6953
|
await entry.use(async (evalDef) => {
|
|
6350
|
-
|
|
6351
|
-
|
|
6954
|
+
if (evalDef.manualInput && evalDef.cases !== void 0) throw new Error(`Eval "${evalMeta.id}" cannot declare both "cases" and "manualInput". Remove one of them.`);
|
|
6955
|
+
let manualInputCase = null;
|
|
6956
|
+
if (evalDef.manualInput) {
|
|
6957
|
+
const rawValue = request.manualInputs?.[evalMeta.key];
|
|
6958
|
+
if (rawValue === void 0) throw new Error(`Eval "${evalMeta.id}" requires manual input. Provide it via the run modal in the web UI or "--input" / "--input-file" on the CLI.`);
|
|
6959
|
+
const parsed = parseManualInputValues(evalDef.manualInput, rawValue);
|
|
6960
|
+
if (parsed.error) {
|
|
6961
|
+
const formatted = parsed.error.issues.map((issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message).join("; ");
|
|
6962
|
+
throw new Error(`Invalid manual input for eval "${evalMeta.id}": ${formatted}`);
|
|
6963
|
+
}
|
|
6964
|
+
manualInputCase = {
|
|
6965
|
+
id: `${evalMeta.id}-manual`,
|
|
6966
|
+
input: parsed.value
|
|
6967
|
+
};
|
|
6968
|
+
}
|
|
6969
|
+
const evalCases = manualInputCase ? [manualInputCase] : await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime });
|
|
6970
|
+
const runnableCases = manualInputCase ? evalCases : resolveRunnableEvalCases({
|
|
6971
|
+
cases: evalCases,
|
|
6352
6972
|
evalId: evalMeta.id
|
|
6353
6973
|
});
|
|
6354
6974
|
const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
|
|
@@ -6567,4 +7187,4 @@ function toLastRunStatus(status) {
|
|
|
6567
7187
|
return status === "pending" ? null : status;
|
|
6568
7188
|
}
|
|
6569
7189
|
//#endregion
|
|
6570
|
-
export {
|
|
7190
|
+
export { defaultConfigKeySchema as $, incrementEvalOutput as $n, cacheEntryWithDebugKeySchema as $t, createRunRequestSchema as A, buildTraceTree as An, runLogPhaseSchema as At, getEvalDisplayStatus as B, repoFile as Bn, manualInputTextFieldSchema as Bt, loadConfig as C, columnKindSchema as Cn, evalStatAggregateSchema as Ct, createFsCacheStore as D, repoFileRefSchema as Dn, runLogEntrySchema as Dt, validateCharts as E, numberDisplayOptionsSchema as En, evalSummarySchema as Et, extractApiCalls as F, hashCacheKeySync as Fn, manualInputJsonFieldSchema as Ft, runSummarySchema as G, advanceEvalTime as Gn, evalChartConfigSchema as Gt, deriveStatusFromCaseRows as H, readManualInputFile as Hn, evalChartAxisSchema as Ht, extractLlmCalls as I, deserializeCacheRecording as In, manualInputMultilineFieldSchema as It, agentEvalsConfigSchema as J, evalAssert as Jn, evalChartTypeSchema as Jt, DEFAULT_API_CALLS_CONFIG as K, appendToEvalOutput as Kn, evalChartMetricSchema as Kt, applyDerivedCallAttributes as L, deserializeCacheValue as Ln, manualInputNumberFieldSchema as Lt, sseEnvelopeSchema as M, evalSpan as Mn, manualInputBooleanFieldSchema as Mt, extractCacheEntries as N, evalTracer as Nn, manualInputDescriptorSchema as Nt, configReloadStateSchema as O, runArtifactRefSchema as On, runLogLevelSchema as Ot, extractCacheHits as P, hashCacheKey as Pn, manualInputFieldDescriptorSchema as Pt, apiCallsConfigSchema as Q, getEvalStartTime as Qn, cacheEntrySchema as Qt, getNestedAttribute as R, serializeCacheRecording as Rn, manualInputSelectFieldSchema as Rt, resolveEvalDefaultConfig as S, columnFormatSchema as Sn, evalFreshnessStatusSchema as St, normalizeScoreDef as T, jsonCellSchema as Tn, evalStatsConfigSchema as Tt, deriveStatusFromChildStatuses as U, evalExpect as Un, evalChartBuiltinMetricSchema as Ut, deriveScopedSummaryFromCases as V, manualInputFileValueSchema as Vn, evalChartAggregateSchema as Vt, runManifestSchema as W, EvalAssertionError as Wn, evalChartColorSchema as Wt, 
apiCallMetricPlacementSchema as X, getCurrentScope as Xn, cacheDebugKeyEntrySchema as Xt, apiCallMetricFormatSchema as Y, evalLog as Yn, evalChartsConfigSchema as Yt, apiCallMetricSchema as Z, getEvalCaseInput as Zn, cacheDebugKeyFileSchema as Zt, buildManualInputDescriptor as _, traceSpanKindSchema as _n, getCaseRowEvalKey as _t, getLastRunStatuses as a, cacheRecordingSchema as an, runInExistingEvalScope as ar, llmCallMetricSchema as at, loadEvalModule as b, cellValueSchema as bn, caseRowSchema as bt, loadPersistedRunSnapshots as c, spanCacheOptionsSchema as cn, startEvalBackgroundJob as cr, llmCallsConfigSchema as ct, persistRunState as d, traceAttributeDisplayInputSchema as dn, resolveLlmCallsConfig as dt, cacheFileSchema as en, isInEvalScope as er, evalColumnOverrideSchema as et, recomputeEvalStatusesInRuns as f, traceAttributeDisplayPlacementSchema as fn, runLogsConfigSchema as ft, resolveArtifactPath as g, traceSpanErrorSchema as gn, getCaseRowCaseKey as gt, resolveTracePresentation as h, traceDisplayInputConfigSchema as hn, buildEvalKey as ht, generateRunId as i, cacheRecordingOpSchema as in, runInEvalScope as ir, llmCallMetricPlacementSchema as it, updateManualScoreRequestSchema as j, captureEvalSpanError as jn, scoreTraceSchema as jt, configReloadStatusSchema as k, z$1 as kn, runLogLocationSchema as kt, nextShortIdFromSnapshots as l, traceCacheRefSchema as ln, defineEval as lr, removeDefaultConfigSchema as lt, runTouchesEval as m, traceDisplayConfigSchema as mn, buildCaseKey as mt, getTargetEvalKeys as n, cacheModeSchema as nn, nextEvalId as nr, evalDeriveConfigSchema as nt, getLatestRunInfos as o, cacheStatusSchema as on, setEvalOutput as or, llmCallPricingRateSchema as ot, recomputePersistedCaseStatus as p, traceAttributeDisplaySchema as pn, trialSelectionModeSchema as pt, DEFAULT_LLM_CALLS_CONFIG as q, configureEvalRunLogs as qn, evalChartTooltipExtraSchema as qt, getTargetEvals as r, cacheOperationTypeSchema as rn, runInEvalRuntimeScope as rr, 
llmCallMetricFormatSchema as rt, loadPersistedRunSnapshot as s, serializedCacheSpanSchema as sn, setScopeCacheContext as sr, llmCallPricingSchema as st, executeRun as t, cacheListItemSchema as tn, mergeEvalOutput as tr, evalColumnsSchema as tt, persistCaseDetail as u, traceAttributeDisplayFormatSchema as un, getEvalRegistry as ur, resolveApiCallsConfig as ut, parseManualInputValues as v, traceSpanSchema as vn, assertionFailureSchema as vt, buildDeclaredColumnDefs as w, fileRefSchema as wn, evalStatItemSchema as wt, parseEvalDiscovery as x, columnDefSchema as xn, discoveryIssueSchema as xt, deriveEvalFreshness as y, traceSpanWarningSchema as yn, caseDetailSchema as yt, getEvalTitle as z, serializeCacheValue as zn, manualInputSelectOptionSchema as zt };
|