@ls-stack/agent-eval 0.27.1 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CJj1yPPD.mjs → app-D6-msfKP.mjs} +45 -6
- package/dist/apps/web/dist/assets/index-BCr6J8Uj.js +118 -0
- package/dist/apps/web/dist/assets/index-DjUTm3M-.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Clf8xUFa.mjs → cli-CIc_gBNM.mjs} +965 -167
- package/dist/index.d.mts +5828 -3368
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +4 -2
- package/dist/{runOrchestration-FEvBwwJI.mjs → runOrchestration-CIARrLs6.mjs} +1046 -228
- package/dist/{runner-zqKwTlNj.mjs → runner-1F8MeY5V.mjs} +2 -2
- package/dist/{runner-KbDKLSU4.mjs → runner-Bq1f9B9d.mjs} +1 -1
- package/dist/src-CkWT1iSu.mjs +3 -0
- package/package.json +2 -29
- package/skills/agent-eval/SKILL.md +104 -20
- package/dist/apps/web/dist/assets/index-6YqV9t4k.js +0 -118
- package/dist/apps/web/dist/assets/index-C-OiMSQD.css +0 -1
- package/dist/bin.d.mts +0 -1
- package/dist/runChild.d.mts +0 -1
- package/dist/src-BBwT7_cy.mjs +0 -3
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
import { createRequire, registerHooks } from "node:module";
|
|
2
|
-
import { createHash } from "node:crypto";
|
|
2
|
+
import { createHash, randomUUID } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
4
4
|
import { extname, isAbsolute, join, relative, resolve } from "node:path";
|
|
5
5
|
import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
|
|
6
6
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
7
7
|
import { z, z as z$1 } from "zod/v4";
|
|
8
|
-
import { Buffer as Buffer$1 } from "node:buffer";
|
|
8
|
+
import { Blob as Blob$1, Buffer as Buffer$1, File as File$1 } from "node:buffer";
|
|
9
9
|
import { gunzipSync, gzipSync } from "node:zlib";
|
|
10
10
|
import { getCompositeKey } from "@ls-stack/utils/getCompositeKey";
|
|
11
11
|
import { existsSync } from "node:fs";
|
|
12
|
-
import { resultify } from "t-result";
|
|
12
|
+
import { Result, resultify } from "t-result";
|
|
13
13
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
14
14
|
//#region ../sdk/src/defineEval.ts
|
|
15
15
|
const evalRegistry = /* @__PURE__ */ new Map();
|
|
@@ -531,10 +531,13 @@ function recordOpIfActive(scope, op) {
|
|
|
531
531
|
if (top) top.ops.push(op);
|
|
532
532
|
}
|
|
533
533
|
function toAssertionFailure$1(message, error = void 0) {
|
|
534
|
-
|
|
534
|
+
const name = error?.name;
|
|
535
|
+
const stack = error?.stack ? stripTerminalControlCodes$1(error.stack) : void 0;
|
|
536
|
+
return {
|
|
537
|
+
...name !== void 0 ? { name } : {},
|
|
535
538
|
message,
|
|
536
|
-
stack:
|
|
537
|
-
}
|
|
539
|
+
...stack !== void 0 ? { stack } : {}
|
|
540
|
+
};
|
|
538
541
|
}
|
|
539
542
|
/**
|
|
540
543
|
* Record or replace an output value for the current case scope.
|
|
@@ -784,6 +787,67 @@ function evalExpect(value) {
|
|
|
784
787
|
return new EvalExpectationImpl(value, false);
|
|
785
788
|
}
|
|
786
789
|
//#endregion
|
|
790
|
+
//#region ../sdk/src/manualInputFile.ts
|
|
791
|
+
/**
|
|
792
|
+
* Zod schema describing one file uploaded through the manual-input modal.
|
|
793
|
+
*
|
|
794
|
+
* Use this as the field type on your `manualInput.schema` whenever you mark
|
|
795
|
+
* a field with `{ asFile: true }` in `manualInput.fields`. The UI / CLI stages
|
|
796
|
+
* the selected file on disk, the runner materializes it into the run artifacts
|
|
797
|
+
* directory, and the server validates this JSON metadata against the schema
|
|
798
|
+
* before flowing it into the case input.
|
|
799
|
+
*
|
|
800
|
+
* @example
|
|
801
|
+
* ```ts
|
|
802
|
+
* const schema = z.object({
|
|
803
|
+
* image: manualInputFileValueSchema,
|
|
804
|
+
* note: z.string().optional(),
|
|
805
|
+
* });
|
|
806
|
+
*
|
|
807
|
+
* defineEval({
|
|
808
|
+
* id: 'image-analyzer',
|
|
809
|
+
* manualInput: {
|
|
810
|
+
* schema,
|
|
811
|
+
* fields: { image: { asFile: true, accept: 'image/*' } },
|
|
812
|
+
* },
|
|
813
|
+
* // ...
|
|
814
|
+
* });
|
|
815
|
+
* ```
|
|
816
|
+
*/
|
|
817
|
+
const manualInputFileValueSchema = z.object({
|
|
818
|
+
name: z.string(),
|
|
819
|
+
mimeType: z.string(),
|
|
820
|
+
sizeBytes: z.number().int().nonnegative(),
|
|
821
|
+
sha256: z.string().regex(/^[a-f0-9]{64}$/),
|
|
822
|
+
path: z.string().min(1)
|
|
823
|
+
});
|
|
824
|
+
/**
|
|
825
|
+
* Read a manual-input file artifact from disk and expose common byte, Blob,
|
|
826
|
+
* File, text, and JSON views for eval code.
|
|
827
|
+
*
|
|
828
|
+
* @param value Manual-input file metadata received by an eval.
|
|
829
|
+
* @param options.cwd Directory used to resolve relative paths. Defaults to `process.cwd()`.
|
|
830
|
+
* @returns File bytes plus convenience views for common file-processing flows.
|
|
831
|
+
*/
|
|
832
|
+
async function readManualInputFile(value, options = {}) {
|
|
833
|
+
const absolutePath = resolve(options.cwd ?? process.cwd(), value.path);
|
|
834
|
+
const bytes = new Uint8Array(await readFile(absolutePath));
|
|
835
|
+
const arrayBuffer = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
|
|
836
|
+
const blob = new Blob$1([bytes], { type: value.mimeType });
|
|
837
|
+
return {
|
|
838
|
+
value,
|
|
839
|
+
absolutePath,
|
|
840
|
+
bytes,
|
|
841
|
+
arrayBuffer,
|
|
842
|
+
blob,
|
|
843
|
+
file: new File$1([bytes], value.name, { type: value.mimeType }),
|
|
844
|
+
text: async () => await blob.text(),
|
|
845
|
+
json: async () => {
|
|
846
|
+
return JSON.parse(await blob.text());
|
|
847
|
+
}
|
|
848
|
+
};
|
|
849
|
+
}
|
|
850
|
+
//#endregion
|
|
787
851
|
//#region ../sdk/src/repoFile.ts
|
|
788
852
|
/**
|
|
789
853
|
* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
|
|
@@ -802,7 +866,8 @@ function repoFile(path, mimeType) {
|
|
|
802
866
|
}
|
|
803
867
|
//#endregion
|
|
804
868
|
//#region ../sdk/src/cacheSerialization.ts
|
|
805
|
-
const serializedCacheValueMarker = "
|
|
869
|
+
const serializedCacheValueMarker = "__aecs";
|
|
870
|
+
const legacySerializedCacheValueMarker = "__agentEvalsCacheSerialization";
|
|
806
871
|
const jsonSafeCacheValueVersion = "json-safe-v1";
|
|
807
872
|
const packedNumberArrayMinLength = 128;
|
|
808
873
|
const compressedStringMinBytes = 16 * 1024;
|
|
@@ -812,7 +877,7 @@ function isRecordLike$3(value) {
|
|
|
812
877
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
813
878
|
}
|
|
814
879
|
function isJsonSafeSerializedCacheValue(value) {
|
|
815
|
-
return isRecordLike$3(value) && value
|
|
880
|
+
return isRecordLike$3(value) && serializationMarkerValue(value) === jsonSafeCacheValueVersion && typeof value.type === "string";
|
|
816
881
|
}
|
|
817
882
|
function jsonSafeValue(type, value) {
|
|
818
883
|
return value === void 0 ? {
|
|
@@ -825,32 +890,39 @@ function jsonSafeValue(type, value) {
|
|
|
825
890
|
};
|
|
826
891
|
}
|
|
827
892
|
function hasSerializationMarkerKey(value) {
|
|
828
|
-
return Object.hasOwn(value, serializedCacheValueMarker);
|
|
893
|
+
return Object.hasOwn(value, serializedCacheValueMarker) || Object.hasOwn(value, legacySerializedCacheValueMarker);
|
|
894
|
+
}
|
|
895
|
+
function serializationMarkerValue(value) {
|
|
896
|
+
return value[serializedCacheValueMarker] ?? value[legacySerializedCacheValueMarker];
|
|
829
897
|
}
|
|
830
898
|
/**
|
|
831
899
|
* Serialize one cached value while keeping plain JSON as plain JSON.
|
|
832
900
|
*
|
|
833
|
-
* Rich runtime values use small tagged wrappers.
|
|
901
|
+
* Rich runtime values use small tagged wrappers. Undefined values are omitted
|
|
902
|
+
* by default; pass `preserveUndefined: true` to round-trip them explicitly.
|
|
834
903
|
*/
|
|
835
|
-
async function serializeCacheValue(value) {
|
|
836
|
-
return serializeJsonSafeValue(value, /* @__PURE__ */ new WeakSet(), 0);
|
|
904
|
+
async function serializeCacheValue(value, options = void 0) {
|
|
905
|
+
return serializeJsonSafeValue(value, /* @__PURE__ */ new WeakSet(), 0, normalizeCacheSerializationOptions(options));
|
|
837
906
|
}
|
|
838
907
|
/** Revive one cached value, while preserving legacy JSON-round-tripped data. */
|
|
839
908
|
function deserializeCacheValue(value) {
|
|
840
909
|
return deserializeJsonSafeValue(value);
|
|
841
910
|
}
|
|
842
911
|
/** Clone one value through the same serialization path used for cache data. */
|
|
843
|
-
async function cloneCacheValue(value) {
|
|
844
|
-
return deserializeCacheValue(await serializeCacheValue(value));
|
|
912
|
+
async function cloneCacheValue(value, options = void 0) {
|
|
913
|
+
return deserializeCacheValue(await serializeCacheValue(value, options));
|
|
914
|
+
}
|
|
915
|
+
function normalizeCacheSerializationOptions(options) {
|
|
916
|
+
return { preserveUndefined: options?.preserveUndefined === true };
|
|
845
917
|
}
|
|
846
|
-
async function serializeJsonSafeValue(value, refs, depth) {
|
|
847
|
-
if (value === void 0) return jsonSafeValue("Undefined");
|
|
918
|
+
async function serializeJsonSafeValue(value, refs, depth, config) {
|
|
919
|
+
if (value === void 0) return config.preserveUndefined ? jsonSafeValue("Undefined") : void 0;
|
|
848
920
|
if (typeof value === "bigint") return jsonSafeValue("BigInt", value.toString());
|
|
849
921
|
if (typeof value === "number") return serializeNumber(value);
|
|
850
922
|
if (typeof value === "string") return serializeString(value, depth);
|
|
851
923
|
if (value instanceof Date) return jsonSafeValue("Date", value.toISOString());
|
|
852
|
-
if (value instanceof Map) return serializeMap(value, refs, depth);
|
|
853
|
-
if (value instanceof Set) return serializeSet(value, refs, depth);
|
|
924
|
+
if (value instanceof Map) return serializeMap(value, refs, depth, config);
|
|
925
|
+
if (value instanceof Set) return serializeSet(value, refs, depth, config);
|
|
854
926
|
if (value instanceof RegExp) return jsonSafeValue("RegExp", {
|
|
855
927
|
flags: value.flags,
|
|
856
928
|
source: value.source
|
|
@@ -869,7 +941,7 @@ async function serializeJsonSafeValue(value, refs, depth) {
|
|
|
869
941
|
type: value.type
|
|
870
942
|
});
|
|
871
943
|
if (value instanceof ArrayBuffer) return jsonSafeValue("ArrayBuffer", bytesToBase64(new Uint8Array(value)));
|
|
872
|
-
if (value instanceof Error) return serializeError(value, refs, depth);
|
|
944
|
+
if (value instanceof Error) return serializeError(value, refs, depth, config);
|
|
873
945
|
if (!value || typeof value !== "object") return value;
|
|
874
946
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
875
947
|
refs.add(value);
|
|
@@ -882,12 +954,18 @@ async function serializeJsonSafeValue(value, refs, depth) {
|
|
|
882
954
|
}
|
|
883
955
|
}
|
|
884
956
|
const items = [];
|
|
885
|
-
for (const item of value)
|
|
957
|
+
for (const item of value) {
|
|
958
|
+
const serializedItem = await serializeJsonSafeValue(item, refs, depth + 1, config);
|
|
959
|
+
if (serializedItem !== void 0) items.push(serializedItem);
|
|
960
|
+
}
|
|
886
961
|
refs.delete(value);
|
|
887
962
|
return compressNestedJsonValue(items, depth) ?? items;
|
|
888
963
|
}
|
|
889
964
|
const entries = [];
|
|
890
|
-
for (const [key, entryValue] of Object.entries(value))
|
|
965
|
+
for (const [key, entryValue] of Object.entries(value)) {
|
|
966
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
967
|
+
if (serializedEntryValue !== void 0) entries.push([key, serializedEntryValue]);
|
|
968
|
+
}
|
|
891
969
|
refs.delete(value);
|
|
892
970
|
const serialized = hasSerializationMarkerKey(value) ? jsonSafeValue("Object", entries) : Object.fromEntries(entries);
|
|
893
971
|
return compressNestedJsonValue(serialized, depth) ?? serialized;
|
|
@@ -957,32 +1035,40 @@ function compressNestedJsonValue(value, depth) {
|
|
|
957
1035
|
function compressionIsWorthIt(value, rawSize) {
|
|
958
1036
|
return Buffer$1.byteLength(JSON.stringify(value)) < rawSize * maxCompressedSizeRatio;
|
|
959
1037
|
}
|
|
960
|
-
async function serializeMap(value, refs, depth) {
|
|
1038
|
+
async function serializeMap(value, refs, depth, config) {
|
|
961
1039
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
962
1040
|
refs.add(value);
|
|
963
1041
|
const entries = [];
|
|
964
|
-
for (const [key, entryValue] of value.entries())
|
|
1042
|
+
for (const [key, entryValue] of value.entries()) {
|
|
1043
|
+
const serializedKey = await serializeJsonSafeValue(key, refs, depth + 1, config);
|
|
1044
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
1045
|
+
if (serializedKey !== void 0 && serializedEntryValue !== void 0) entries.push([serializedKey, serializedEntryValue]);
|
|
1046
|
+
}
|
|
965
1047
|
refs.delete(value);
|
|
966
1048
|
return jsonSafeValue("Map", entries);
|
|
967
1049
|
}
|
|
968
|
-
async function serializeSet(value, refs, depth) {
|
|
1050
|
+
async function serializeSet(value, refs, depth, config) {
|
|
969
1051
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
970
1052
|
refs.add(value);
|
|
971
1053
|
const items = [];
|
|
972
|
-
for (const item of value.values())
|
|
1054
|
+
for (const item of value.values()) {
|
|
1055
|
+
const serializedItem = await serializeJsonSafeValue(item, refs, depth + 1, config);
|
|
1056
|
+
if (serializedItem !== void 0) items.push(serializedItem);
|
|
1057
|
+
}
|
|
973
1058
|
refs.delete(value);
|
|
974
1059
|
return jsonSafeValue("Set", items);
|
|
975
1060
|
}
|
|
976
|
-
async function serializeError(value, refs, depth) {
|
|
1061
|
+
async function serializeError(value, refs, depth, config) {
|
|
977
1062
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
978
1063
|
refs.add(value);
|
|
979
1064
|
const props = [];
|
|
980
1065
|
for (const [key, entryValue] of Object.entries(value)) {
|
|
981
1066
|
if (key === "cause") continue;
|
|
982
|
-
|
|
1067
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
1068
|
+
if (serializedEntryValue !== void 0) props.push([key, serializedEntryValue]);
|
|
983
1069
|
}
|
|
984
1070
|
const serialized = jsonSafeValue("Error", {
|
|
985
|
-
cause: "cause" in value ? await serializeJsonSafeValue(value.cause, refs, depth + 1) : void 0,
|
|
1071
|
+
cause: "cause" in value ? await serializeJsonSafeValue(value.cause, refs, depth + 1, config) : void 0,
|
|
986
1072
|
message: value.message,
|
|
987
1073
|
name: value.name,
|
|
988
1074
|
props,
|
|
@@ -1123,33 +1209,36 @@ function deserializeError(value) {
|
|
|
1123
1209
|
});
|
|
1124
1210
|
return error;
|
|
1125
1211
|
}
|
|
1126
|
-
async function serializeRecordValues(record) {
|
|
1212
|
+
async function serializeRecordValues(record, config) {
|
|
1127
1213
|
const entries = [];
|
|
1128
|
-
for (const [key, value] of Object.entries(record))
|
|
1214
|
+
for (const [key, value] of Object.entries(record)) {
|
|
1215
|
+
const serializedValue = await serializeCacheValue(value, config);
|
|
1216
|
+
if (serializedValue !== void 0) entries.push([key, serializedValue]);
|
|
1217
|
+
}
|
|
1129
1218
|
return Object.fromEntries(entries);
|
|
1130
1219
|
}
|
|
1131
1220
|
function deserializeRecordValues(record) {
|
|
1132
1221
|
return Object.fromEntries(Object.entries(record).map(([key, value]) => [key, deserializeCacheValue(value)]));
|
|
1133
1222
|
}
|
|
1134
|
-
async function serializeCacheRecordingOp(op) {
|
|
1223
|
+
async function serializeCacheRecordingOp(op, config) {
|
|
1135
1224
|
switch (op.kind) {
|
|
1136
1225
|
case "setOutput":
|
|
1137
1226
|
case "appendOutput": return {
|
|
1138
1227
|
...op,
|
|
1139
|
-
value: await serializeCacheValue(op.value)
|
|
1228
|
+
value: await serializeCacheValue(op.value, config)
|
|
1140
1229
|
};
|
|
1141
1230
|
case "mergeOutput": return {
|
|
1142
1231
|
...op,
|
|
1143
|
-
patch: await serializeRecordValues(op.patch)
|
|
1232
|
+
patch: await serializeRecordValues(op.patch, config)
|
|
1144
1233
|
};
|
|
1145
1234
|
case "incrementOutput": return op;
|
|
1146
1235
|
case "checkpoint": return {
|
|
1147
1236
|
...op,
|
|
1148
|
-
data: await serializeCacheValue(op.data)
|
|
1237
|
+
data: await serializeCacheValue(op.data, config)
|
|
1149
1238
|
};
|
|
1150
1239
|
case "subSpan": return {
|
|
1151
1240
|
...op,
|
|
1152
|
-
span: await serializeCacheSpan(op.span)
|
|
1241
|
+
span: await serializeCacheSpan(op.span, config)
|
|
1153
1242
|
};
|
|
1154
1243
|
}
|
|
1155
1244
|
}
|
|
@@ -1175,11 +1264,11 @@ function deserializeCacheRecordingOp(op) {
|
|
|
1175
1264
|
};
|
|
1176
1265
|
}
|
|
1177
1266
|
}
|
|
1178
|
-
async function serializeCacheSpan(span) {
|
|
1267
|
+
async function serializeCacheSpan(span, config) {
|
|
1179
1268
|
return {
|
|
1180
1269
|
...span,
|
|
1181
|
-
attributes: span.attributes === void 0 ? void 0 : await serializeRecordValues(span.attributes),
|
|
1182
|
-
children: await Promise.all(span.children.map(serializeCacheSpan))
|
|
1270
|
+
attributes: span.attributes === void 0 ? void 0 : await serializeRecordValues(span.attributes, config),
|
|
1271
|
+
children: await Promise.all(span.children.map((child) => serializeCacheSpan(child, config)))
|
|
1183
1272
|
};
|
|
1184
1273
|
}
|
|
1185
1274
|
function deserializeCacheSpan(span) {
|
|
@@ -1189,13 +1278,19 @@ function deserializeCacheSpan(span) {
|
|
|
1189
1278
|
children: span.children.map(deserializeCacheSpan)
|
|
1190
1279
|
};
|
|
1191
1280
|
}
|
|
1192
|
-
/**
|
|
1193
|
-
|
|
1281
|
+
/**
|
|
1282
|
+
* Serialize all rich values captured in a cache recording before persistence.
|
|
1283
|
+
*
|
|
1284
|
+
* Undefined values are omitted by default; pass `preserveUndefined: true` to
|
|
1285
|
+
* retain the legacy explicit undefined wrappers in the recording payload.
|
|
1286
|
+
*/
|
|
1287
|
+
async function serializeCacheRecording(recording, options = void 0) {
|
|
1288
|
+
const config = normalizeCacheSerializationOptions(options);
|
|
1194
1289
|
return {
|
|
1195
1290
|
...recording,
|
|
1196
|
-
returnValue: await serializeCacheValue(recording.returnValue),
|
|
1197
|
-
finalAttributes: await serializeRecordValues(recording.finalAttributes),
|
|
1198
|
-
ops: await Promise.all(recording.ops.map(serializeCacheRecordingOp))
|
|
1291
|
+
returnValue: await serializeCacheValue(recording.returnValue, config),
|
|
1292
|
+
finalAttributes: await serializeRecordValues(recording.finalAttributes, config),
|
|
1293
|
+
ops: await Promise.all(recording.ops.map((op) => serializeCacheRecordingOp(op, config)))
|
|
1199
1294
|
};
|
|
1200
1295
|
}
|
|
1201
1296
|
/** Revive all rich values captured in a cache recording after lookup. */
|
|
@@ -1587,7 +1682,9 @@ function createTraceCache(generateSpanId) {
|
|
|
1587
1682
|
key: info.key
|
|
1588
1683
|
}, { serializeFileBytes: info.serializeFileBytes === true });
|
|
1589
1684
|
const activeSpan = scope.activeSpanStack.at(-1);
|
|
1590
|
-
|
|
1685
|
+
const canRead = cacheCtx.mode === "use" && cacheCtx.read !== false;
|
|
1686
|
+
const canStore = cacheCtx.mode !== "bypass" && cacheCtx.store !== false;
|
|
1687
|
+
if (canRead) {
|
|
1591
1688
|
const hit = await cacheCtx.adapter.lookup(namespace, keyHash);
|
|
1592
1689
|
if (hit) {
|
|
1593
1690
|
const storedAt = hit.storedAt;
|
|
@@ -1610,14 +1707,24 @@ function createTraceCache(generateSpanId) {
|
|
|
1610
1707
|
name: info.name,
|
|
1611
1708
|
namespace,
|
|
1612
1709
|
key: keyHash,
|
|
1613
|
-
status: "miss"
|
|
1710
|
+
status: "miss",
|
|
1711
|
+
...canStore ? {} : { stored: false }
|
|
1614
1712
|
});
|
|
1615
|
-
} else if (cacheCtx.mode === "
|
|
1713
|
+
} else if (cacheCtx.mode === "use" && canStore) recordCacheRef(scope, activeSpan, {
|
|
1714
|
+
type: "value",
|
|
1715
|
+
name: info.name,
|
|
1716
|
+
namespace,
|
|
1717
|
+
key: keyHash,
|
|
1718
|
+
status: "miss",
|
|
1719
|
+
read: false
|
|
1720
|
+
});
|
|
1721
|
+
else if (cacheCtx.mode === "refresh") recordCacheRef(scope, activeSpan, {
|
|
1616
1722
|
type: "value",
|
|
1617
1723
|
name: info.name,
|
|
1618
1724
|
namespace,
|
|
1619
1725
|
key: keyHash,
|
|
1620
|
-
status: "refresh"
|
|
1726
|
+
status: "refresh",
|
|
1727
|
+
...canStore ? {} : { stored: false }
|
|
1621
1728
|
});
|
|
1622
1729
|
else recordCacheRef(scope, activeSpan, {
|
|
1623
1730
|
type: "value",
|
|
@@ -1640,7 +1747,7 @@ function createTraceCache(generateSpanId) {
|
|
|
1640
1747
|
scope.recordingStack.pop();
|
|
1641
1748
|
}
|
|
1642
1749
|
appendSubSpanOps(scope, frame);
|
|
1643
|
-
if (
|
|
1750
|
+
if (canStore) {
|
|
1644
1751
|
const finalAttributes = diffNonCacheAttributes(beforeAttributes, await snapshotNonCacheAttributes(activeSpan));
|
|
1645
1752
|
const recording = {
|
|
1646
1753
|
returnValue: bodyResult,
|
|
@@ -1654,13 +1761,11 @@ function createTraceCache(generateSpanId) {
|
|
|
1654
1761
|
operationType: "value",
|
|
1655
1762
|
operationName: info.name,
|
|
1656
1763
|
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
1657
|
-
codeFingerprint: cacheCtx.codeFingerprint,
|
|
1658
1764
|
recording: await serializeCacheRecording(recording)
|
|
1659
1765
|
}, {
|
|
1660
1766
|
rawKey: info.key,
|
|
1661
1767
|
operationType: "value",
|
|
1662
|
-
operationName: info.name
|
|
1663
|
-
codeFingerprint: cacheCtx.codeFingerprint
|
|
1768
|
+
operationName: info.name
|
|
1664
1769
|
});
|
|
1665
1770
|
}
|
|
1666
1771
|
return bodyResult;
|
|
@@ -2031,11 +2136,13 @@ async function traceSpanInternal(info, fn) {
|
|
|
2031
2136
|
namespace,
|
|
2032
2137
|
key: cacheOpts.key
|
|
2033
2138
|
}, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
|
|
2139
|
+
const canRead = ctx.mode === "use" && ctx.read !== false;
|
|
2140
|
+
const canStore = ctx.mode !== "bypass" && ctx.store !== false;
|
|
2034
2141
|
mergeSpanAttributes(spanRecord, {
|
|
2035
2142
|
"cache.key": keyHash,
|
|
2036
2143
|
"cache.namespace": namespace
|
|
2037
2144
|
});
|
|
2038
|
-
if (
|
|
2145
|
+
if (canRead) {
|
|
2039
2146
|
const hit = await ctx.adapter.lookup(namespace, keyHash);
|
|
2040
2147
|
if (hit) {
|
|
2041
2148
|
const storedAt = hit.storedAt;
|
|
@@ -2050,8 +2157,18 @@ async function traceSpanInternal(info, fn) {
|
|
|
2050
2157
|
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
2051
2158
|
return recording.returnValue;
|
|
2052
2159
|
}
|
|
2053
|
-
mergeSpanAttributes(spanRecord, {
|
|
2054
|
-
|
|
2160
|
+
mergeSpanAttributes(spanRecord, {
|
|
2161
|
+
"cache.status": "miss",
|
|
2162
|
+
...canStore ? {} : { "cache.stored": false }
|
|
2163
|
+
});
|
|
2164
|
+
} else if (ctx.mode === "use" && canStore) mergeSpanAttributes(spanRecord, {
|
|
2165
|
+
"cache.status": "miss",
|
|
2166
|
+
"cache.read": false
|
|
2167
|
+
});
|
|
2168
|
+
else if (ctx.mode === "refresh") mergeSpanAttributes(spanRecord, {
|
|
2169
|
+
"cache.status": "refresh",
|
|
2170
|
+
...canStore ? {} : { "cache.stored": false }
|
|
2171
|
+
});
|
|
2055
2172
|
else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
|
|
2056
2173
|
const frame = {
|
|
2057
2174
|
baseSpanIndex: scope.spans.length,
|
|
@@ -2067,7 +2184,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
2067
2184
|
}
|
|
2068
2185
|
appendSubSpanOps(scope, frame);
|
|
2069
2186
|
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
2070
|
-
if (
|
|
2187
|
+
if (canStore) {
|
|
2071
2188
|
const recording = {
|
|
2072
2189
|
returnValue: bodyResult,
|
|
2073
2190
|
finalAttributes: stripCacheAttributes(spanRecord.attributes),
|
|
@@ -2087,14 +2204,12 @@ async function traceSpanInternal(info, fn) {
|
|
|
2087
2204
|
spanName: info.name,
|
|
2088
2205
|
spanKind: info.kind,
|
|
2089
2206
|
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
2090
|
-
codeFingerprint: ctx.codeFingerprint,
|
|
2091
2207
|
recording: await serializeCacheRecording(recording)
|
|
2092
2208
|
};
|
|
2093
2209
|
await ctx.adapter.write(entry, {
|
|
2094
2210
|
rawKey: cacheOpts.key,
|
|
2095
2211
|
operationType: "span",
|
|
2096
|
-
operationName: info.name
|
|
2097
|
-
codeFingerprint: ctx.codeFingerprint
|
|
2212
|
+
operationName: info.name
|
|
2098
2213
|
});
|
|
2099
2214
|
}
|
|
2100
2215
|
return bodyResult;
|
|
@@ -2287,6 +2402,7 @@ const columnDefSchema = z.object({
|
|
|
2287
2402
|
passThreshold: z.number().optional(),
|
|
2288
2403
|
maxStars: z.number().int().min(2).optional(),
|
|
2289
2404
|
hideInTable: z.boolean().optional(),
|
|
2405
|
+
hideIfNoValue: z.boolean().optional(),
|
|
2290
2406
|
align: z.enum([
|
|
2291
2407
|
"left",
|
|
2292
2408
|
"center",
|
|
@@ -2430,6 +2546,10 @@ const traceCacheRefSchema = z.object({
|
|
|
2430
2546
|
namespace: z.string(),
|
|
2431
2547
|
key: z.string(),
|
|
2432
2548
|
status: cacheStatusSchema,
|
|
2549
|
+
/** Whether this ref attempted to read from cache. Defaults to true. */
|
|
2550
|
+
read: z.boolean().optional(),
|
|
2551
|
+
/** Whether this ref wrote a persisted cache entry. Defaults to true for misses/refreshes. */
|
|
2552
|
+
stored: z.boolean().optional(),
|
|
2433
2553
|
storedAt: z.string().optional(),
|
|
2434
2554
|
age: z.number().optional()
|
|
2435
2555
|
});
|
|
@@ -2442,7 +2562,6 @@ const cacheListItemSchema = z.object({
|
|
|
2442
2562
|
spanName: z.string().optional(),
|
|
2443
2563
|
spanKind: traceSpanKindSchema.optional(),
|
|
2444
2564
|
storedAt: z.string(),
|
|
2445
|
-
codeFingerprint: z.string(),
|
|
2446
2565
|
sizeBytes: z.number()
|
|
2447
2566
|
});
|
|
2448
2567
|
/** Zod schema for `SerializedCacheSpan`, defined lazily for recursion. */
|
|
@@ -2524,7 +2643,6 @@ const cacheEntrySchema = z.object({
|
|
|
2524
2643
|
spanName: z.string().optional(),
|
|
2525
2644
|
spanKind: traceSpanKindSchema.optional(),
|
|
2526
2645
|
storedAt: z.string(),
|
|
2527
|
-
codeFingerprint: z.string(),
|
|
2528
2646
|
recording: cacheRecordingSchema
|
|
2529
2647
|
});
|
|
2530
2648
|
/** Debug-only raw key metadata stored outside the reusable cache entry. */
|
|
@@ -2535,7 +2653,6 @@ const cacheDebugKeyEntrySchema = z.object({
|
|
|
2535
2653
|
operationType: cacheOperationTypeSchema,
|
|
2536
2654
|
operationName: z.string(),
|
|
2537
2655
|
storedAt: z.string(),
|
|
2538
|
-
codeFingerprint: z.string(),
|
|
2539
2656
|
rawKey: z.unknown()
|
|
2540
2657
|
});
|
|
2541
2658
|
/** Cache lookup response with optional debug-only raw key data. */
|
|
@@ -2627,6 +2744,16 @@ const evalChartTooltipExtraSchema = z.discriminatedUnion("source", [z.object({
|
|
|
2627
2744
|
const evalChartConfigSchema = z.object({
|
|
2628
2745
|
/** Optional heading shown above the chart frame in the UI. */
|
|
2629
2746
|
heading: z.string().optional(),
|
|
2747
|
+
/**
|
|
2748
|
+
* Hide this chart in the UI when none of its metrics has a numeric value in
|
|
2749
|
+
* the rendered history window.
|
|
2750
|
+
*/
|
|
2751
|
+
hideIfNoValue: z.boolean().optional(),
|
|
2752
|
+
/**
|
|
2753
|
+
* Drop consecutive history points whose plotted metrics and tooltip extras
|
|
2754
|
+
* have the same values as the previous kept point.
|
|
2755
|
+
*/
|
|
2756
|
+
dedupeConsecutiveValues: z.boolean().optional(),
|
|
2630
2757
|
type: evalChartTypeSchema,
|
|
2631
2758
|
/** At least one series must be declared. */
|
|
2632
2759
|
metrics: z.array(evalChartMetricSchema).min(1),
|
|
@@ -2652,6 +2779,122 @@ const evalChartConfigSchema = z.object({
|
|
|
2652
2779
|
*/
|
|
2653
2780
|
const evalChartsConfigSchema = z.array(evalChartConfigSchema);
|
|
2654
2781
|
//#endregion
|
|
2782
|
+
//#region ../shared/src/schemas/manualInput.ts
|
|
2783
|
+
/**
|
|
2784
|
+
* Common metadata shared by every manual-input field descriptor exposed to
|
|
2785
|
+
* the web UI. The runner builds these from the eval's authored Zod schema and
|
|
2786
|
+
* any per-field overrides, so the client never needs the schema itself.
|
|
2787
|
+
*/
|
|
2788
|
+
const manualInputFieldBaseSchema = z.object({
|
|
2789
|
+
/** Top-level key on the eval input object that this field writes to. */
|
|
2790
|
+
key: z.string(),
|
|
2791
|
+
/** Human-readable label rendered next to the field in the modal. */
|
|
2792
|
+
label: z.string(),
|
|
2793
|
+
/** Optional helper text rendered under the label. */
|
|
2794
|
+
description: z.string().optional(),
|
|
2795
|
+
/** Optional placeholder rendered inside the input element. */
|
|
2796
|
+
placeholder: z.string().optional(),
|
|
2797
|
+
/** Whether the field must be filled before the run can be submitted. */
|
|
2798
|
+
required: z.boolean(),
|
|
2799
|
+
/**
|
|
2800
|
+
* Default value used to prefill the field. Type matches the underlying
|
|
2801
|
+
* widget kind (`string` for text/multiline/select, `number` for number,
|
|
2802
|
+
* `boolean` for boolean, JSON-serialisable for `json`).
|
|
2803
|
+
*/
|
|
2804
|
+
defaultValue: z.unknown().optional()
|
|
2805
|
+
});
|
|
2806
|
+
/** One option rendered by the `select` widget. */
|
|
2807
|
+
const manualInputSelectOptionSchema = z.object({
|
|
2808
|
+
value: z.string(),
|
|
2809
|
+
label: z.string()
|
|
2810
|
+
});
|
|
2811
|
+
/** Single line text widget descriptor. */
|
|
2812
|
+
const manualInputTextFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2813
|
+
kind: z.literal("text"),
|
|
2814
|
+
/** Optional minimum character length enforced client-side. */
|
|
2815
|
+
minLength: z.number().int().min(0).optional(),
|
|
2816
|
+
/** Optional maximum character length enforced client-side. */
|
|
2817
|
+
maxLength: z.number().int().min(0).optional()
|
|
2818
|
+
});
|
|
2819
|
+
/** Multi-line textarea widget descriptor. */
|
|
2820
|
+
const manualInputMultilineFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2821
|
+
kind: z.literal("multiline"),
|
|
2822
|
+
/** Optional minimum character length enforced client-side. */
|
|
2823
|
+
minLength: z.number().int().min(0).optional(),
|
|
2824
|
+
/** Optional maximum character length enforced client-side. */
|
|
2825
|
+
maxLength: z.number().int().min(0).optional(),
|
|
2826
|
+
/** Suggested number of visible textarea rows; UI may clamp this. */
|
|
2827
|
+
rows: z.number().int().min(1).optional()
|
|
2828
|
+
});
|
|
2829
|
+
/** Numeric input widget descriptor. */
|
|
2830
|
+
const manualInputNumberFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2831
|
+
kind: z.literal("number"),
|
|
2832
|
+
/** Optional inclusive lower bound. */
|
|
2833
|
+
min: z.number().optional(),
|
|
2834
|
+
/** Optional inclusive upper bound. */
|
|
2835
|
+
max: z.number().optional(),
|
|
2836
|
+
/** Optional UI step increment. */
|
|
2837
|
+
step: z.number().positive().optional(),
|
|
2838
|
+
/** Whether the value must be an integer. */
|
|
2839
|
+
integer: z.boolean().optional()
|
|
2840
|
+
});
|
|
2841
|
+
/** Boolean checkbox/toggle widget descriptor. */
|
|
2842
|
+
const manualInputBooleanFieldSchema = manualInputFieldBaseSchema.extend({ kind: z.literal("boolean") });
|
|
2843
|
+
/** Single-select dropdown widget descriptor. */
|
|
2844
|
+
const manualInputSelectFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2845
|
+
kind: z.literal("select"),
|
|
2846
|
+
options: z.array(manualInputSelectOptionSchema)
|
|
2847
|
+
});
|
|
2848
|
+
/** JSON textarea widget descriptor used for nested objects, arrays, and unions. */
|
|
2849
|
+
const manualInputJsonFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2850
|
+
kind: z.literal("json"),
|
|
2851
|
+
/** Suggested number of visible textarea rows; UI may clamp this. */
|
|
2852
|
+
rows: z.number().int().min(1).optional()
|
|
2853
|
+
});
|
|
2854
|
+
/**
|
|
2855
|
+
* File / image upload widget descriptor. The widget supports clicking to
|
|
2856
|
+
* pick a file, drag-and-drop onto the dropzone, and pasting an image from
|
|
2857
|
+
* the system clipboard. The submitted value references a staged file artifact.
|
|
2858
|
+
*/
|
|
2859
|
+
const manualInputFileFieldSchema = manualInputFieldBaseSchema.extend({
|
|
2860
|
+
kind: z.literal("file"),
|
|
2861
|
+
/**
|
|
2862
|
+
* Browser `accept` attribute (e.g. `image/*`, `image/png,image/jpeg`,
|
|
2863
|
+
* `.pdf`). When omitted the picker accepts any file type.
|
|
2864
|
+
*/
|
|
2865
|
+
accept: z.string().optional(),
|
|
2866
|
+
/** Optional client-side maximum file size in bytes. */
|
|
2867
|
+
maxSizeBytes: z.number().int().positive().optional()
|
|
2868
|
+
});
|
|
2869
|
+
/**
|
|
2870
|
+
* Discriminated union of all supported manual-input widget kinds. The web UI
|
|
2871
|
+
* dispatches to the matching field component based on `kind`.
|
|
2872
|
+
*/
|
|
2873
|
+
const manualInputFieldDescriptorSchema = z.discriminatedUnion("kind", [
|
|
2874
|
+
manualInputTextFieldSchema,
|
|
2875
|
+
manualInputMultilineFieldSchema,
|
|
2876
|
+
manualInputNumberFieldSchema,
|
|
2877
|
+
manualInputBooleanFieldSchema,
|
|
2878
|
+
manualInputSelectFieldSchema,
|
|
2879
|
+
manualInputJsonFieldSchema,
|
|
2880
|
+
manualInputFileFieldSchema
|
|
2881
|
+
]);
|
|
2882
|
+
/**
|
|
2883
|
+
* Wire-format descriptor attached to an `EvalSummary` when the eval declares
|
|
2884
|
+
* `manualInput`. Carries the ordered list of fields the modal renders and
|
|
2885
|
+
* basic context shown in the modal header.
|
|
2886
|
+
*/
|
|
2887
|
+
const manualInputDescriptorSchema = z.object({
|
|
2888
|
+
/** Optional title shown in the modal header. Defaults to the eval title. */
|
|
2889
|
+
title: z.string().optional(),
|
|
2890
|
+
/** Optional helper text shown above the form. */
|
|
2891
|
+
description: z.string().optional(),
|
|
2892
|
+
/** Optional submit button label. Defaults to `Run`. */
|
|
2893
|
+
submitLabel: z.string().optional(),
|
|
2894
|
+
/** Ordered list of fields rendered in the modal. */
|
|
2895
|
+
fields: z.array(manualInputFieldDescriptorSchema)
|
|
2896
|
+
});
|
|
2897
|
+
//#endregion
|
|
2655
2898
|
//#region ../shared/src/schemas/eval.ts
|
|
2656
2899
|
/** Freshness signal derived from the latest relevant run plus git state. */
|
|
2657
2900
|
const evalFreshnessStatusSchema = z.enum([
|
|
@@ -2667,17 +2910,31 @@ const evalStatAggregateSchema = z.enum([
|
|
|
2667
2910
|
"sum",
|
|
2668
2911
|
"last"
|
|
2669
2912
|
]);
|
|
2913
|
+
const hideIfNoValueShape = {
|
|
2914
|
+
/**
|
|
2915
|
+
* Hide this stat in the UI when the current run has no displayable value.
|
|
2916
|
+
* Missing values, `null`, and empty strings count as no value; `0` remains
|
|
2917
|
+
* visible.
|
|
2918
|
+
*/
|
|
2919
|
+
hideIfNoValue: z.boolean().optional() };
|
|
2670
2920
|
/**
|
|
2671
2921
|
* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
|
|
2672
2922
|
* `column` aggregates a score or numeric output column across the latest run.
|
|
2673
2923
|
*/
|
|
2674
2924
|
const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
2675
|
-
z.object({
|
|
2925
|
+
z.object({
|
|
2926
|
+
kind: z.literal("cases"),
|
|
2927
|
+
...hideIfNoValueShape
|
|
2928
|
+
}),
|
|
2676
2929
|
z.object({
|
|
2677
2930
|
kind: z.literal("passRate"),
|
|
2678
|
-
accent: z.boolean().optional()
|
|
2931
|
+
accent: z.boolean().optional(),
|
|
2932
|
+
...hideIfNoValueShape
|
|
2933
|
+
}),
|
|
2934
|
+
z.object({
|
|
2935
|
+
kind: z.literal("duration"),
|
|
2936
|
+
...hideIfNoValueShape
|
|
2679
2937
|
}),
|
|
2680
|
-
z.object({ kind: z.literal("duration") }),
|
|
2681
2938
|
z.object({
|
|
2682
2939
|
kind: z.literal("column"),
|
|
2683
2940
|
key: z.string(),
|
|
@@ -2686,7 +2943,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
2686
2943
|
format: columnFormatSchema.optional(),
|
|
2687
2944
|
/** Number presentation options applied when `format: 'number'`. */
|
|
2688
2945
|
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
2689
|
-
accent: z.boolean().optional()
|
|
2946
|
+
accent: z.boolean().optional(),
|
|
2947
|
+
...hideIfNoValueShape
|
|
2690
2948
|
})
|
|
2691
2949
|
]);
|
|
2692
2950
|
/** Ordered list of stats rendered in the EvalCard stats row. */
|
|
@@ -2734,7 +2992,13 @@ const evalSummarySchema = z.object({
|
|
|
2734
2992
|
* Ordered per-eval history chart configuration for the EvalCard. Opt-in:
|
|
2735
2993
|
* when omitted or empty, the UI renders no history chart at all.
|
|
2736
2994
|
*/
|
|
2737
|
-
charts: evalChartsConfigSchema.optional()
|
|
2995
|
+
charts: evalChartsConfigSchema.optional(),
|
|
2996
|
+
/**
|
|
2997
|
+
* Manual-input form descriptor when the eval declares `manualInput`. The
|
|
2998
|
+
* web UI renders these fields in a modal before kicking off a run; the
|
|
2999
|
+
* runner consumes the validated values as the case input.
|
|
3000
|
+
*/
|
|
3001
|
+
manualInput: manualInputDescriptorSchema.optional()
|
|
2738
3002
|
});
|
|
2739
3003
|
/** Schema for one case row in an eval run result table. */
|
|
2740
3004
|
const caseRowSchema = z.object({
|
|
@@ -2767,6 +3031,12 @@ const caseRowSchema = z.object({
|
|
|
2767
3031
|
});
|
|
2768
3032
|
/** Structured assertion failure metadata captured for one case run. */
|
|
2769
3033
|
const assertionFailureSchema = z.object({
|
|
3034
|
+
/**
|
|
3035
|
+
* Error class or category label rendered alongside the message (e.g.
|
|
3036
|
+
* `EvalAssertionError`, `OutputsSchemaError`). Optional for legacy entries
|
|
3037
|
+
* and synthetic failures without an originating Error.
|
|
3038
|
+
*/
|
|
3039
|
+
name: z.string().optional(),
|
|
2770
3040
|
/** Human-readable assertion failure message shown in the UI and artifacts. */
|
|
2771
3041
|
message: z.string(),
|
|
2772
3042
|
/** Stack trace captured from the originating error when available. */
|
|
@@ -2868,7 +3138,7 @@ const caseDetailSchema = z.object({
|
|
|
2868
3138
|
});
|
|
2869
3139
|
/** Schema for discovery problems that should be shown before running evals. */
|
|
2870
3140
|
const discoveryIssueSchema = z.object({
|
|
2871
|
-
type: z.enum(["duplicate-eval-id"]),
|
|
3141
|
+
type: z.enum(["duplicate-eval-id", "manual-input-with-cases"]),
|
|
2872
3142
|
severity: z.enum(["error"]),
|
|
2873
3143
|
filePath: z.string(),
|
|
2874
3144
|
evalId: z.string(),
|
|
@@ -2915,6 +3185,25 @@ const defaultConfigKeySchema = z.enum([
|
|
|
2915
3185
|
]);
|
|
2916
3186
|
/** Removal config for built-in eval-level outputs and UI metadata. */
|
|
2917
3187
|
const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
|
|
3188
|
+
const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
|
|
3189
|
+
/** Schema for keyed or object-returning trace-derived output config. */
|
|
3190
|
+
const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
|
|
3191
|
+
/** Schema for UI overrides on derived or scored columns. */
|
|
3192
|
+
const evalColumnOverrideSchema = z.object({
|
|
3193
|
+
label: z.string().optional(),
|
|
3194
|
+
format: columnFormatSchema.optional(),
|
|
3195
|
+
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
3196
|
+
hideInTable: z.boolean().optional(),
|
|
3197
|
+
hideIfNoValue: z.boolean().optional(),
|
|
3198
|
+
align: z.enum([
|
|
3199
|
+
"left",
|
|
3200
|
+
"center",
|
|
3201
|
+
"right"
|
|
3202
|
+
]).optional(),
|
|
3203
|
+
maxStars: z.number().int().min(2).optional()
|
|
3204
|
+
});
|
|
3205
|
+
/** Schema for column override maps keyed by output or score field name. */
|
|
3206
|
+
const evalColumnsSchema = z.record(z.string(), evalColumnOverrideSchema);
|
|
2918
3207
|
/** Render formats supported by an LLM-call metric in the UI. */
|
|
2919
3208
|
const llmCallMetricFormatSchema = z.enum([
|
|
2920
3209
|
"string",
|
|
@@ -2992,18 +3281,9 @@ const apiCallMetricSchema = z.object({
|
|
|
2992
3281
|
placements: z.array(apiCallMetricPlacementSchema).nonempty().optional()
|
|
2993
3282
|
});
|
|
2994
3283
|
/**
|
|
2995
|
-
* Schema for
|
|
2996
|
-
* from token counts.
|
|
3284
|
+
* Schema for pricing rates used to derive LLM-call costs from token counts.
|
|
2997
3285
|
*/
|
|
2998
|
-
const
|
|
2999
|
-
/** Exact model name read from the configured `attributes.model` path. */
|
|
3000
|
-
model: z.string().min(1),
|
|
3001
|
-
/**
|
|
3002
|
-
* Optional provider discriminator read from `attributes.provider`. When set,
|
|
3003
|
-
* the entry only applies to calls from that provider; provider-specific
|
|
3004
|
-
* entries take precedence over generic entries for the same model.
|
|
3005
|
-
*/
|
|
3006
|
-
provider: z.string().min(1).optional(),
|
|
3286
|
+
const llmCallPricingRateSchema = z.object({
|
|
3007
3287
|
/** USD per one million non-cached input tokens. */
|
|
3008
3288
|
inputUsdPerMillion: z.number().nonnegative().optional(),
|
|
3009
3289
|
/** USD per one million output tokens. */
|
|
@@ -3017,6 +3297,23 @@ const llmCallPricingSchema = z.object({
|
|
|
3017
3297
|
/** USD per one million reasoning tokens when reported separately. */
|
|
3018
3298
|
reasoningUsdPerMillion: z.number().nonnegative().optional()
|
|
3019
3299
|
});
|
|
3300
|
+
/**
|
|
3301
|
+
* Schema for one model's pricing config. The object key is the exact model
|
|
3302
|
+
* name. Use `providers` when a model has provider-specific rates in addition
|
|
3303
|
+
* to, or instead of, generic model rates.
|
|
3304
|
+
*/
|
|
3305
|
+
const llmCallPricingSchema = llmCallPricingRateSchema.extend({
|
|
3306
|
+
/**
|
|
3307
|
+
* Optional provider discriminator read from `attributes.provider`. When set,
|
|
3308
|
+
* the top-level entry only applies to calls from that provider.
|
|
3309
|
+
*/
|
|
3310
|
+
provider: z.string().min(1).optional(),
|
|
3311
|
+
/**
|
|
3312
|
+
* Provider-specific pricing for the model. Provider entries take precedence
|
|
3313
|
+
* over generic rates for the same model.
|
|
3314
|
+
*/
|
|
3315
|
+
providers: z.record(z.string().min(1), llmCallPricingRateSchema).optional()
|
|
3316
|
+
});
|
|
3020
3317
|
/** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
|
|
3021
3318
|
const llmCallsConfigSchema = z.object({
|
|
3022
3319
|
/** Span kinds treated as LLM calls. Defaults to `['llm']`. */
|
|
@@ -3053,10 +3350,10 @@ const llmCallsConfigSchema = z.object({
|
|
|
3053
3350
|
*/
|
|
3054
3351
|
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
3055
3352
|
/**
|
|
3056
|
-
* Model
|
|
3057
|
-
*
|
|
3353
|
+
* Model-keyed pricing registry used to calculate LLM-call costs from token
|
|
3354
|
+
* counts. Built-in LLM cost fields are only derived from this registry.
|
|
3058
3355
|
*/
|
|
3059
|
-
pricing: z.
|
|
3356
|
+
pricing: z.record(z.string().min(1), llmCallPricingSchema).optional(),
|
|
3060
3357
|
/** Custom user-defined metrics surfaced on each LLM call. */
|
|
3061
3358
|
metrics: z.array(llmCallMetricSchema).optional()
|
|
3062
3359
|
});
|
|
@@ -3172,6 +3469,33 @@ function resolveApiCallMetric(metric) {
|
|
|
3172
3469
|
placements: metric.placements ? [...metric.placements] : ["body"]
|
|
3173
3470
|
};
|
|
3174
3471
|
}
|
|
3472
|
+
function hasPricingRates(pricing) {
|
|
3473
|
+
return pricing.inputUsdPerMillion !== void 0 || pricing.outputUsdPerMillion !== void 0 || pricing.cachedInputUsdPerMillion !== void 0 || pricing.cacheCreationInputUsdPerMillion !== void 0 || pricing.cacheCreationInput1hUsdPerMillion !== void 0 || pricing.reasoningUsdPerMillion !== void 0;
|
|
3474
|
+
}
|
|
3475
|
+
function copyPricingRates(pricing) {
|
|
3476
|
+
return {
|
|
3477
|
+
inputUsdPerMillion: pricing.inputUsdPerMillion,
|
|
3478
|
+
outputUsdPerMillion: pricing.outputUsdPerMillion,
|
|
3479
|
+
cachedInputUsdPerMillion: pricing.cachedInputUsdPerMillion,
|
|
3480
|
+
cacheCreationInputUsdPerMillion: pricing.cacheCreationInputUsdPerMillion,
|
|
3481
|
+
cacheCreationInput1hUsdPerMillion: pricing.cacheCreationInput1hUsdPerMillion,
|
|
3482
|
+
reasoningUsdPerMillion: pricing.reasoningUsdPerMillion
|
|
3483
|
+
};
|
|
3484
|
+
}
|
|
3485
|
+
function resolveLlmCallPricingEntries(model, pricing) {
|
|
3486
|
+
const entries = [];
|
|
3487
|
+
if (hasPricingRates(pricing)) entries.push({
|
|
3488
|
+
model,
|
|
3489
|
+
provider: pricing.provider,
|
|
3490
|
+
...copyPricingRates(pricing)
|
|
3491
|
+
});
|
|
3492
|
+
for (const [provider, providerPricing] of Object.entries(pricing.providers ?? {})) entries.push({
|
|
3493
|
+
model,
|
|
3494
|
+
provider,
|
|
3495
|
+
...copyPricingRates(providerPricing)
|
|
3496
|
+
});
|
|
3497
|
+
return entries;
|
|
3498
|
+
}
|
|
3175
3499
|
/**
|
|
3176
3500
|
* Resolve the user-authored LLM-calls config to a fully-defaulted shape used
|
|
3177
3501
|
* by the UI to derive the LLM calls tab.
|
|
@@ -3182,7 +3506,7 @@ function resolveApiCallMetric(metric) {
|
|
|
3182
3506
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
3183
3507
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
3184
3508
|
* - Missing `pricing` defaults to an empty registry; built-in costs are only
|
|
3185
|
-
* derived from configured pricing and token counts.
|
|
3509
|
+
* derived from configured model-keyed pricing and token counts.
|
|
3186
3510
|
*/
|
|
3187
3511
|
function resolveLlmCallsConfig(input) {
|
|
3188
3512
|
return {
|
|
@@ -3193,16 +3517,7 @@ function resolveLlmCallsConfig(input) {
|
|
|
3193
3517
|
},
|
|
3194
3518
|
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
3195
3519
|
metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
|
|
3196
|
-
pricing: (input?.pricing ??
|
|
3197
|
-
model: p.model,
|
|
3198
|
-
provider: p.provider,
|
|
3199
|
-
inputUsdPerMillion: p.inputUsdPerMillion,
|
|
3200
|
-
outputUsdPerMillion: p.outputUsdPerMillion,
|
|
3201
|
-
cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
|
|
3202
|
-
cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
|
|
3203
|
-
cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
|
|
3204
|
-
reasoningUsdPerMillion: p.reasoningUsdPerMillion
|
|
3205
|
-
}))
|
|
3520
|
+
pricing: Object.entries(input?.pricing ?? {}).flatMap(([model, pricing]) => resolveLlmCallPricingEntries(model, pricing))
|
|
3206
3521
|
};
|
|
3207
3522
|
}
|
|
3208
3523
|
/**
|
|
@@ -3236,6 +3551,9 @@ const agentEvalsConfigSchema = z.object({
|
|
|
3236
3551
|
staleAfterDays: z.number().optional(),
|
|
3237
3552
|
allowCliRunAll: z.boolean().optional(),
|
|
3238
3553
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
3554
|
+
columns: evalColumnsSchema.optional(),
|
|
3555
|
+
deriveFromTracing: evalDeriveConfigSchema.optional(),
|
|
3556
|
+
stats: evalStatsConfigSchema.optional(),
|
|
3239
3557
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
3240
3558
|
removeDefaultConfig: removeDefaultConfigSchema.optional(),
|
|
3241
3559
|
apiCalls: apiCallsConfigSchema.optional(),
|
|
@@ -3888,6 +4206,11 @@ function readNumber(attributes, key) {
|
|
|
3888
4206
|
const value = attributes[key];
|
|
3889
4207
|
return typeof value === "number" && Number.isFinite(value) ? value : void 0;
|
|
3890
4208
|
}
|
|
4209
|
+
function readBoolean(attributes, key) {
|
|
4210
|
+
if (!isRecord$2(attributes)) return void 0;
|
|
4211
|
+
const value = attributes[key];
|
|
4212
|
+
return typeof value === "boolean" ? value : void 0;
|
|
4213
|
+
}
|
|
3891
4214
|
function readArray(attributes, key) {
|
|
3892
4215
|
if (!isRecord$2(attributes)) return [];
|
|
3893
4216
|
const value = attributes[key];
|
|
@@ -3916,12 +4239,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3916
4239
|
const namespace = readString(span.attributes, "cache.namespace");
|
|
3917
4240
|
if (key !== void 0 && namespace !== void 0) {
|
|
3918
4241
|
const isHit = status === "hit";
|
|
4242
|
+
const stored = isHit ? true : readBoolean(span.attributes, "cache.stored") !== false;
|
|
3919
4243
|
entries.push({
|
|
3920
4244
|
id: span.id,
|
|
3921
4245
|
source: "span",
|
|
3922
4246
|
origin: "span",
|
|
3923
|
-
action: isHit ? "hit" : "added",
|
|
4247
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3924
4248
|
status,
|
|
4249
|
+
stored,
|
|
3925
4250
|
name: span.name,
|
|
3926
4251
|
namespace,
|
|
3927
4252
|
key,
|
|
@@ -3938,12 +4263,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3938
4263
|
const ref = parsed.data;
|
|
3939
4264
|
if (ref.status === "bypass") continue;
|
|
3940
4265
|
const isHit = ref.status === "hit";
|
|
4266
|
+
const stored = isHit ? true : ref.stored !== false;
|
|
3941
4267
|
entries.push({
|
|
3942
4268
|
id: `${span.id}:value:${String(index)}`,
|
|
3943
4269
|
source: "value",
|
|
3944
4270
|
origin: "span",
|
|
3945
|
-
action: isHit ? "hit" : "added",
|
|
4271
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3946
4272
|
status: ref.status,
|
|
4273
|
+
stored,
|
|
3947
4274
|
name: ref.name,
|
|
3948
4275
|
namespace: ref.namespace,
|
|
3949
4276
|
key: ref.key,
|
|
@@ -3956,12 +4283,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3956
4283
|
for (const [index, ref] of caseCacheRefs.entries()) {
|
|
3957
4284
|
if (ref.status === "bypass") continue;
|
|
3958
4285
|
const isHit = ref.status === "hit";
|
|
4286
|
+
const stored = isHit ? true : ref.stored !== false;
|
|
3959
4287
|
entries.push({
|
|
3960
4288
|
id: `case:value:${String(index)}`,
|
|
3961
4289
|
source: "value",
|
|
3962
4290
|
origin: "caseRoot",
|
|
3963
|
-
action: isHit ? "hit" : "added",
|
|
4291
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3964
4292
|
status: ref.status,
|
|
4293
|
+
stored,
|
|
3965
4294
|
name: ref.name,
|
|
3966
4295
|
namespace: ref.namespace,
|
|
3967
4296
|
key: ref.key,
|
|
@@ -3987,6 +4316,7 @@ function isCacheHitEntry(entry) {
|
|
|
3987
4316
|
}
|
|
3988
4317
|
z.enum([
|
|
3989
4318
|
"discovery.updated",
|
|
4319
|
+
"config.reload",
|
|
3990
4320
|
"run.started",
|
|
3991
4321
|
"run.summary",
|
|
3992
4322
|
"case.started",
|
|
@@ -4006,6 +4336,19 @@ const sseEnvelopeSchema = z.object({
|
|
|
4006
4336
|
});
|
|
4007
4337
|
//#endregion
|
|
4008
4338
|
//#region ../shared/src/schemas/api.ts
|
|
4339
|
+
/** Lifecycle state for an app config reload triggered by `agent-evals.config.ts`. */
|
|
4340
|
+
const configReloadStatusSchema = z.enum([
|
|
4341
|
+
"idle",
|
|
4342
|
+
"pending",
|
|
4343
|
+
"reloading"
|
|
4344
|
+
]);
|
|
4345
|
+
/** UI/API-visible state for config reloads in `agent-evals app`. */
|
|
4346
|
+
const configReloadStateSchema = z.object({
|
|
4347
|
+
status: configReloadStatusSchema,
|
|
4348
|
+
activeRunCount: z.number().int().min(0),
|
|
4349
|
+
lastChangedAt: z.string().nullable(),
|
|
4350
|
+
lastReloadedAt: z.string().nullable()
|
|
4351
|
+
});
|
|
4009
4352
|
/** Schema for the API request that starts a new eval run. */
|
|
4010
4353
|
const createRunRequestSchema = z.object({
|
|
4011
4354
|
target: z.object({
|
|
@@ -4026,14 +4369,22 @@ const createRunRequestSchema = z.object({
|
|
|
4026
4369
|
* Optional cache controls for the run. When omitted, the cache is used in
|
|
4027
4370
|
* its default read-through / write-on-miss mode.
|
|
4028
4371
|
*/
|
|
4029
|
-
cache: z.object({ mode: cacheModeSchema.default("use") }).optional()
|
|
4372
|
+
cache: z.object({ mode: cacheModeSchema.default("use") }).optional(),
|
|
4373
|
+
/**
|
|
4374
|
+
* Manual-input values keyed by eval `key` (workspace-relative file path
|
|
4375
|
+
* plus authored eval id). Required for any targeted eval that declares
|
|
4376
|
+
* `manualInput` in its definition; the server validates each entry against
|
|
4377
|
+
* the eval's authored Zod schema before starting the run.
|
|
4378
|
+
*/
|
|
4379
|
+
manualInputs: z.record(z.string(), z.unknown()).optional()
|
|
4030
4380
|
});
|
|
4031
4381
|
/** Schema for updating a UI-authored manual score on one persisted case. */
|
|
4032
4382
|
const updateManualScoreRequestSchema = z.object({ value: z.number().min(0).max(1).nullable() });
|
|
4033
4383
|
//#endregion
|
|
4034
4384
|
//#region ../runner/src/cacheStore.ts
|
|
4035
4385
|
const defaultMaxEntriesPerNamespace = 100;
|
|
4036
|
-
const cacheSerializationMarker = "
|
|
4386
|
+
const cacheSerializationMarker = "__aecs";
|
|
4387
|
+
const legacyCacheSerializationMarker = "__agentEvalsCacheSerialization";
|
|
4037
4388
|
const supportedCacheSerializationVersion = "json-safe-v1";
|
|
4038
4389
|
/**
|
|
4039
4390
|
* Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`.
|
|
@@ -4118,7 +4469,6 @@ function createFsCacheStore(options) {
|
|
|
4118
4469
|
spanName: entry.spanName,
|
|
4119
4470
|
spanKind: entry.spanKind,
|
|
4120
4471
|
storedAt: entry.storedAt,
|
|
4121
|
-
codeFingerprint: entry.codeFingerprint,
|
|
4122
4472
|
sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
|
|
4123
4473
|
});
|
|
4124
4474
|
}
|
|
@@ -4247,7 +4597,7 @@ async function readCacheFilePath(filePath) {
|
|
|
4247
4597
|
function usesSupportedCacheSerialization(value) {
|
|
4248
4598
|
if (Array.isArray(value)) return value.every(usesSupportedCacheSerialization);
|
|
4249
4599
|
if (!isRecordLike(value)) return true;
|
|
4250
|
-
if (Object.hasOwn(value,
|
|
4600
|
+
for (const marker of [cacheSerializationMarker, legacyCacheSerializationMarker]) if (Object.hasOwn(value, marker) && value[marker] !== supportedCacheSerializationVersion) return false;
|
|
4251
4601
|
return Object.values(value).every(usesSupportedCacheSerialization);
|
|
4252
4602
|
}
|
|
4253
4603
|
async function writeOrRemoveCacheFile(cacheDir, cacheFile) {
|
|
@@ -4291,7 +4641,6 @@ async function writeDebugKeyEntry(params) {
|
|
|
4291
4641
|
operationType: debugKey.operationType,
|
|
4292
4642
|
operationName: debugKey.operationName,
|
|
4293
4643
|
storedAt: entry.storedAt,
|
|
4294
|
-
codeFingerprint: debugKey.codeFingerprint,
|
|
4295
4644
|
rawKey: debugKey.rawKey
|
|
4296
4645
|
};
|
|
4297
4646
|
await writeDebugKeyFile(debugDir, {
|
|
@@ -4507,6 +4856,7 @@ function getScoreOverride(def) {
|
|
|
4507
4856
|
format: def.format,
|
|
4508
4857
|
numberFormat: def.numberFormat,
|
|
4509
4858
|
hideInTable: def.hideInTable,
|
|
4859
|
+
hideIfNoValue: def.hideIfNoValue,
|
|
4510
4860
|
align: def.align,
|
|
4511
4861
|
maxStars: def.maxStars
|
|
4512
4862
|
};
|
|
@@ -4519,6 +4869,7 @@ function mergeOverrides(base, override) {
|
|
|
4519
4869
|
format: override.format ?? base.format,
|
|
4520
4870
|
numberFormat: override.numberFormat ?? base.numberFormat,
|
|
4521
4871
|
hideInTable: override.hideInTable ?? base.hideInTable,
|
|
4872
|
+
hideIfNoValue: override.hideIfNoValue ?? base.hideIfNoValue,
|
|
4522
4873
|
align: override.align ?? base.align,
|
|
4523
4874
|
maxStars: override.maxStars ?? base.maxStars
|
|
4524
4875
|
};
|
|
@@ -4633,6 +4984,7 @@ function createColumnDef(params) {
|
|
|
4633
4984
|
if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
|
|
4634
4985
|
if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
|
|
4635
4986
|
if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
|
|
4987
|
+
if (override?.hideIfNoValue !== void 0) def.hideIfNoValue = override.hideIfNoValue;
|
|
4636
4988
|
if (override?.align !== void 0) def.align = override.align;
|
|
4637
4989
|
if (!isScore) return def;
|
|
4638
4990
|
def.isScore = true;
|
|
@@ -4676,7 +5028,9 @@ async function loadConfig() {
|
|
|
4676
5028
|
const configPath = resolve(process.cwd(), "agent-evals.config.ts");
|
|
4677
5029
|
if (!existsSync(configPath)) return defaultConfig;
|
|
4678
5030
|
try {
|
|
4679
|
-
const
|
|
5031
|
+
const configUrl = pathToFileURL(configPath);
|
|
5032
|
+
configUrl.searchParams.set("v", randomUUID());
|
|
5033
|
+
const imported = await import(configUrl.href);
|
|
4680
5034
|
const configModule = configModuleSchema.parse(imported);
|
|
4681
5035
|
const userConfig = configModule.default ?? configModule.config;
|
|
4682
5036
|
if (!userConfig) return defaultConfig;
|
|
@@ -4717,60 +5071,70 @@ const DEFAULT_COLUMNS = {
|
|
|
4717
5071
|
label: "API Calls",
|
|
4718
5072
|
format: "number",
|
|
4719
5073
|
numberFormat: countNumberFormat,
|
|
4720
|
-
align: "right"
|
|
5074
|
+
align: "right",
|
|
5075
|
+
hideIfNoValue: true
|
|
4721
5076
|
},
|
|
4722
5077
|
costUsd: {
|
|
4723
5078
|
label: "Cost",
|
|
4724
5079
|
format: "number",
|
|
4725
5080
|
numberFormat: costNumberFormat,
|
|
4726
|
-
align: "right"
|
|
5081
|
+
align: "right",
|
|
5082
|
+
hideIfNoValue: true
|
|
4727
5083
|
},
|
|
4728
5084
|
llmTurns: {
|
|
4729
5085
|
label: "LLM Turns",
|
|
4730
5086
|
format: "number",
|
|
4731
5087
|
numberFormat: countNumberFormat,
|
|
4732
|
-
align: "right"
|
|
5088
|
+
align: "right",
|
|
5089
|
+
hideIfNoValue: true
|
|
4733
5090
|
},
|
|
4734
5091
|
inputTokens: {
|
|
4735
5092
|
label: "Input Tokens",
|
|
4736
5093
|
format: "number",
|
|
4737
5094
|
numberFormat: tokenNumberFormat,
|
|
4738
|
-
align: "right"
|
|
5095
|
+
align: "right",
|
|
5096
|
+
hideIfNoValue: true
|
|
4739
5097
|
},
|
|
4740
5098
|
outputTokens: {
|
|
4741
5099
|
label: "Output Tokens",
|
|
4742
5100
|
format: "number",
|
|
4743
5101
|
numberFormat: tokenNumberFormat,
|
|
4744
|
-
align: "right"
|
|
5102
|
+
align: "right",
|
|
5103
|
+
hideIfNoValue: true
|
|
4745
5104
|
},
|
|
4746
5105
|
totalTokens: {
|
|
4747
5106
|
label: "Total Tokens",
|
|
4748
5107
|
format: "number",
|
|
4749
5108
|
numberFormat: tokenNumberFormat,
|
|
4750
|
-
align: "right"
|
|
5109
|
+
align: "right",
|
|
5110
|
+
hideIfNoValue: true
|
|
4751
5111
|
},
|
|
4752
5112
|
cachedInputTokens: {
|
|
4753
5113
|
label: "Cached Input Tokens",
|
|
4754
5114
|
format: "number",
|
|
4755
5115
|
numberFormat: tokenNumberFormat,
|
|
4756
|
-
align: "right"
|
|
5116
|
+
align: "right",
|
|
5117
|
+
hideIfNoValue: true
|
|
4757
5118
|
},
|
|
4758
5119
|
cacheCreationInputTokens: {
|
|
4759
5120
|
label: "Cache Write Tokens",
|
|
4760
5121
|
format: "number",
|
|
4761
5122
|
numberFormat: tokenNumberFormat,
|
|
4762
|
-
align: "right"
|
|
5123
|
+
align: "right",
|
|
5124
|
+
hideIfNoValue: true
|
|
4763
5125
|
},
|
|
4764
5126
|
reasoningTokens: {
|
|
4765
5127
|
label: "Reasoning Tokens",
|
|
4766
5128
|
format: "number",
|
|
4767
5129
|
numberFormat: tokenNumberFormat,
|
|
4768
|
-
align: "right"
|
|
5130
|
+
align: "right",
|
|
5131
|
+
hideIfNoValue: true
|
|
4769
5132
|
},
|
|
4770
5133
|
llmDurationMs: {
|
|
4771
5134
|
label: "LLM Duration",
|
|
4772
5135
|
format: "duration",
|
|
4773
|
-
align: "right"
|
|
5136
|
+
align: "right",
|
|
5137
|
+
hideIfNoValue: true
|
|
4774
5138
|
}
|
|
4775
5139
|
};
|
|
4776
5140
|
function resolveRemovedKeys(globalRemove, evalRemove) {
|
|
@@ -4783,9 +5147,16 @@ function getActiveDefaultConfigKeys(params) {
|
|
|
4783
5147
|
}
|
|
4784
5148
|
function mergeDefaultColumns(params) {
|
|
4785
5149
|
const activeKeys = getActiveDefaultConfigKeys(params);
|
|
4786
|
-
if (activeKeys.length === 0)
|
|
5150
|
+
if (activeKeys.length === 0) {
|
|
5151
|
+
const merged = {
|
|
5152
|
+
...params.globalColumns,
|
|
5153
|
+
...params.columns
|
|
5154
|
+
};
|
|
5155
|
+
return Object.keys(merged).length > 0 ? merged : void 0;
|
|
5156
|
+
}
|
|
4787
5157
|
return {
|
|
4788
5158
|
...Object.fromEntries(activeKeys.map((key) => [key, DEFAULT_COLUMNS[key]])),
|
|
5159
|
+
...params.globalColumns,
|
|
4789
5160
|
...params.columns
|
|
4790
5161
|
};
|
|
4791
5162
|
}
|
|
@@ -4797,30 +5168,38 @@ function appendDefaultStats(params) {
|
|
|
4797
5168
|
key: "apiCalls",
|
|
4798
5169
|
label: "API Calls",
|
|
4799
5170
|
aggregate: "avg",
|
|
4800
|
-
numberFormat: countNumberFormat
|
|
5171
|
+
numberFormat: countNumberFormat,
|
|
5172
|
+
hideIfNoValue: true
|
|
4801
5173
|
});
|
|
4802
5174
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4803
5175
|
kind: "column",
|
|
4804
5176
|
key: "costUsd",
|
|
4805
5177
|
label: "LLM Cost",
|
|
4806
5178
|
aggregate: "avg",
|
|
4807
|
-
numberFormat: costNumberFormat
|
|
5179
|
+
numberFormat: costNumberFormat,
|
|
5180
|
+
hideIfNoValue: true
|
|
4808
5181
|
});
|
|
4809
5182
|
if (activeKeys.has("totalTokens")) defaults.push({
|
|
4810
5183
|
kind: "column",
|
|
4811
5184
|
key: "totalTokens",
|
|
4812
5185
|
label: "Tokens",
|
|
4813
5186
|
aggregate: "avg",
|
|
4814
|
-
numberFormat: tokenNumberFormat
|
|
5187
|
+
numberFormat: tokenNumberFormat,
|
|
5188
|
+
hideIfNoValue: true
|
|
4815
5189
|
});
|
|
4816
5190
|
if (activeKeys.has("llmTurns")) defaults.push({
|
|
4817
5191
|
kind: "column",
|
|
4818
5192
|
key: "llmTurns",
|
|
4819
5193
|
label: "LLM Turns",
|
|
4820
5194
|
aggregate: "avg",
|
|
4821
|
-
numberFormat: countNumberFormat
|
|
5195
|
+
numberFormat: countNumberFormat,
|
|
5196
|
+
hideIfNoValue: true
|
|
4822
5197
|
});
|
|
4823
|
-
const merged = [
|
|
5198
|
+
const merged = [
|
|
5199
|
+
...params.globalStats ?? [],
|
|
5200
|
+
...params.stats ?? [],
|
|
5201
|
+
...defaults
|
|
5202
|
+
];
|
|
4824
5203
|
return merged.length > 0 ? merged : void 0;
|
|
4825
5204
|
}
|
|
4826
5205
|
function appendDefaultCharts(params) {
|
|
@@ -4828,6 +5207,8 @@ function appendDefaultCharts(params) {
|
|
|
4828
5207
|
const defaults = [];
|
|
4829
5208
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4830
5209
|
heading: "LLM Cost",
|
|
5210
|
+
hideIfNoValue: true,
|
|
5211
|
+
dedupeConsecutiveValues: true,
|
|
4831
5212
|
type: "area",
|
|
4832
5213
|
metrics: [{
|
|
4833
5214
|
source: "column",
|
|
@@ -4837,7 +5218,7 @@ function appendDefaultCharts(params) {
|
|
|
4837
5218
|
color: "warning"
|
|
4838
5219
|
}]
|
|
4839
5220
|
});
|
|
4840
|
-
const
|
|
5221
|
+
const inputTokenMetrics = [
|
|
4841
5222
|
activeKeys.has("inputTokens") ? {
|
|
4842
5223
|
source: "column",
|
|
4843
5224
|
key: "inputTokens",
|
|
@@ -4845,13 +5226,6 @@ function appendDefaultCharts(params) {
|
|
|
4845
5226
|
label: "Input",
|
|
4846
5227
|
color: "accent"
|
|
4847
5228
|
} : null,
|
|
4848
|
-
activeKeys.has("outputTokens") ? {
|
|
4849
|
-
source: "column",
|
|
4850
|
-
key: "outputTokens",
|
|
4851
|
-
aggregate: "avg",
|
|
4852
|
-
label: "Output",
|
|
4853
|
-
color: "success"
|
|
4854
|
-
} : null,
|
|
4855
5229
|
activeKeys.has("cachedInputTokens") ? {
|
|
4856
5230
|
source: "column",
|
|
4857
5231
|
key: "cachedInputTokens",
|
|
@@ -4867,16 +5241,25 @@ function appendDefaultCharts(params) {
|
|
|
4867
5241
|
color: "warning"
|
|
4868
5242
|
} : null
|
|
4869
5243
|
].filter((metric) => metric !== null);
|
|
4870
|
-
if (
|
|
4871
|
-
heading: "LLM Tokens",
|
|
5244
|
+
if (inputTokenMetrics.length > 0) defaults.push({
|
|
5245
|
+
heading: "LLM Input Tokens",
|
|
5246
|
+
hideIfNoValue: true,
|
|
5247
|
+
dedupeConsecutiveValues: true,
|
|
4872
5248
|
type: "bar",
|
|
4873
|
-
metrics:
|
|
4874
|
-
|
|
5249
|
+
metrics: inputTokenMetrics
|
|
5250
|
+
});
|
|
5251
|
+
if (activeKeys.has("outputTokens")) defaults.push({
|
|
5252
|
+
heading: "LLM Output Tokens",
|
|
5253
|
+
hideIfNoValue: true,
|
|
5254
|
+
dedupeConsecutiveValues: true,
|
|
5255
|
+
type: "bar",
|
|
5256
|
+
metrics: [{
|
|
4875
5257
|
source: "column",
|
|
4876
|
-
key: "
|
|
5258
|
+
key: "outputTokens",
|
|
4877
5259
|
aggregate: "avg",
|
|
4878
|
-
label: "
|
|
4879
|
-
|
|
5260
|
+
label: "Output",
|
|
5261
|
+
color: "success"
|
|
5262
|
+
}]
|
|
4880
5263
|
});
|
|
4881
5264
|
const merged = [...params.charts ?? [], ...defaults];
|
|
4882
5265
|
return merged.length > 0 ? merged : void 0;
|
|
@@ -4885,11 +5268,13 @@ function resolveEvalDefaultConfig(params) {
|
|
|
4885
5268
|
const evalRemove = params.evalDef.removeDefaultConfig;
|
|
4886
5269
|
return {
|
|
4887
5270
|
columns: mergeDefaultColumns({
|
|
5271
|
+
globalColumns: params.globalColumns,
|
|
4888
5272
|
columns: params.evalDef.columns,
|
|
4889
5273
|
globalRemove: params.globalRemove,
|
|
4890
5274
|
evalRemove
|
|
4891
5275
|
}),
|
|
4892
5276
|
stats: appendDefaultStats({
|
|
5277
|
+
globalStats: params.globalStats,
|
|
4893
5278
|
stats: params.evalDef.stats,
|
|
4894
5279
|
globalRemove: params.globalRemove,
|
|
4895
5280
|
evalRemove
|
|
@@ -5144,6 +5529,371 @@ function getRunFreshnessTimestamp(manifest) {
|
|
|
5144
5529
|
return manifest.endedAt ?? manifest.startedAt;
|
|
5145
5530
|
}
|
|
5146
5531
|
//#endregion
|
|
5532
|
+
//#region ../runner/src/manualInput/walker.ts
|
|
5533
|
+
function isObject(value) {
|
|
5534
|
+
return typeof value === "object" && value !== null;
|
|
5535
|
+
}
|
|
5536
|
+
function getZodDef(schema) {
|
|
5537
|
+
if (!isObject(schema)) return null;
|
|
5538
|
+
const zodHolder = schema._zod;
|
|
5539
|
+
if (!isObject(zodHolder)) return null;
|
|
5540
|
+
const def = zodHolder.def;
|
|
5541
|
+
if (!isObject(def)) return null;
|
|
5542
|
+
if (typeof def.type !== "string") return null;
|
|
5543
|
+
return {
|
|
5544
|
+
...def,
|
|
5545
|
+
type: def.type
|
|
5546
|
+
};
|
|
5547
|
+
}
|
|
5548
|
+
function getDescription(schema) {
|
|
5549
|
+
if (!isObject(schema)) return void 0;
|
|
5550
|
+
const description = schema.description;
|
|
5551
|
+
return typeof description === "string" ? description : void 0;
|
|
5552
|
+
}
|
|
5553
|
+
function getInnerSchema(def) {
|
|
5554
|
+
return def.innerType;
|
|
5555
|
+
}
|
|
5556
|
+
function getChecks(def) {
|
|
5557
|
+
const checks = def.checks;
|
|
5558
|
+
if (!Array.isArray(checks)) return [];
|
|
5559
|
+
const out = [];
|
|
5560
|
+
for (const check of checks) {
|
|
5561
|
+
if (!isObject(check)) continue;
|
|
5562
|
+
const zodHolder = check._zod;
|
|
5563
|
+
if (!isObject(zodHolder)) continue;
|
|
5564
|
+
const checkDef = zodHolder.def;
|
|
5565
|
+
if (!isObject(checkDef)) continue;
|
|
5566
|
+
if (typeof checkDef.check !== "string") continue;
|
|
5567
|
+
out.push({
|
|
5568
|
+
...checkDef,
|
|
5569
|
+
check: checkDef.check
|
|
5570
|
+
});
|
|
5571
|
+
}
|
|
5572
|
+
return out;
|
|
5573
|
+
}
|
|
5574
|
+
function findCheck(checks, name) {
|
|
5575
|
+
return checks.find((check) => check.check === name);
|
|
5576
|
+
}
|
|
5577
|
+
function unwrap(schema) {
|
|
5578
|
+
let current = schema;
|
|
5579
|
+
let required = true;
|
|
5580
|
+
let defaultValue = void 0;
|
|
5581
|
+
for (let depth = 0; depth < 8; depth += 1) {
|
|
5582
|
+
const def = getZodDef(current);
|
|
5583
|
+
if (!def) return null;
|
|
5584
|
+
if (def.type === "optional" || def.type === "nullable") {
|
|
5585
|
+
required = false;
|
|
5586
|
+
current = getInnerSchema(def);
|
|
5587
|
+
continue;
|
|
5588
|
+
}
|
|
5589
|
+
if (def.type === "nullish") {
|
|
5590
|
+
required = false;
|
|
5591
|
+
current = getInnerSchema(def);
|
|
5592
|
+
continue;
|
|
5593
|
+
}
|
|
5594
|
+
if (def.type === "default" || def.type === "prefault") {
|
|
5595
|
+
const raw = def.defaultValue;
|
|
5596
|
+
if (typeof raw === "function") defaultValue = Reflect.apply(raw, void 0, []);
|
|
5597
|
+
else defaultValue = raw;
|
|
5598
|
+
current = getInnerSchema(def);
|
|
5599
|
+
continue;
|
|
5600
|
+
}
|
|
5601
|
+
if (def.type === "readonly" || def.type === "pipe") {
|
|
5602
|
+
current = getInnerSchema(def) ?? def.in;
|
|
5603
|
+
continue;
|
|
5604
|
+
}
|
|
5605
|
+
return {
|
|
5606
|
+
schema: current,
|
|
5607
|
+
def,
|
|
5608
|
+
required,
|
|
5609
|
+
defaultValue
|
|
5610
|
+
};
|
|
5611
|
+
}
|
|
5612
|
+
return null;
|
|
5613
|
+
}
|
|
5614
|
+
function humaniseKey(key) {
|
|
5615
|
+
const spaced = key.replace(/([a-z0-9])([A-Z])/g, "$1 $2").replace(/[_-]+/g, " ").trim();
|
|
5616
|
+
if (!spaced) return key;
|
|
5617
|
+
const lowered = spaced.toLowerCase();
|
|
5618
|
+
return lowered.charAt(0).toUpperCase() + lowered.slice(1);
|
|
5619
|
+
}
|
|
5620
|
+
function normaliseSelectOptions(raw) {
|
|
5621
|
+
if (!raw) return void 0;
|
|
5622
|
+
return raw.map((entry) => {
|
|
5623
|
+
if (typeof entry === "string") return {
|
|
5624
|
+
value: entry,
|
|
5625
|
+
label: entry
|
|
5626
|
+
};
|
|
5627
|
+
return {
|
|
5628
|
+
value: entry.value,
|
|
5629
|
+
label: entry.label ?? entry.value
|
|
5630
|
+
};
|
|
5631
|
+
});
|
|
5632
|
+
}
|
|
5633
|
+
function enumOptionsFromEntries(def) {
|
|
5634
|
+
const entries = def.entries;
|
|
5635
|
+
if (!isObject(entries)) return null;
|
|
5636
|
+
const out = [];
|
|
5637
|
+
for (const [label, value] of Object.entries(entries)) if (typeof value === "string") out.push({
|
|
5638
|
+
value,
|
|
5639
|
+
label
|
|
5640
|
+
});
|
|
5641
|
+
else if (typeof value === "number") out.push({
|
|
5642
|
+
value: String(value),
|
|
5643
|
+
label
|
|
5644
|
+
});
|
|
5645
|
+
else return null;
|
|
5646
|
+
return out;
|
|
5647
|
+
}
|
|
5648
|
+
function literalUnionOptions(def) {
|
|
5649
|
+
const options = def.options;
|
|
5650
|
+
if (!Array.isArray(options)) return null;
|
|
5651
|
+
const out = [];
|
|
5652
|
+
for (const option of options) {
|
|
5653
|
+
const optDef = getZodDef(option);
|
|
5654
|
+
if (optDef?.type !== "literal") return null;
|
|
5655
|
+
const values = optDef.values;
|
|
5656
|
+
if (!Array.isArray(values) || values.length !== 1) return null;
|
|
5657
|
+
const value = values[0];
|
|
5658
|
+
if (typeof value === "string") out.push({
|
|
5659
|
+
value,
|
|
5660
|
+
label: value
|
|
5661
|
+
});
|
|
5662
|
+
else if (typeof value === "number") {
|
|
5663
|
+
const stringValue = String(value);
|
|
5664
|
+
out.push({
|
|
5665
|
+
value: stringValue,
|
|
5666
|
+
label: stringValue
|
|
5667
|
+
});
|
|
5668
|
+
} else return null;
|
|
5669
|
+
}
|
|
5670
|
+
return out.length > 0 ? out : null;
|
|
5671
|
+
}
|
|
5672
|
+
function literalSelectOptions(def) {
|
|
5673
|
+
const values = def.values;
|
|
5674
|
+
if (!Array.isArray(values)) return null;
|
|
5675
|
+
const out = [];
|
|
5676
|
+
for (const value of values) if (typeof value === "string") out.push({
|
|
5677
|
+
value,
|
|
5678
|
+
label: value
|
|
5679
|
+
});
|
|
5680
|
+
else if (typeof value === "number") {
|
|
5681
|
+
const stringValue = String(value);
|
|
5682
|
+
out.push({
|
|
5683
|
+
value: stringValue,
|
|
5684
|
+
label: stringValue
|
|
5685
|
+
});
|
|
5686
|
+
} else return null;
|
|
5687
|
+
return out;
|
|
5688
|
+
}
|
|
5689
|
+
function readStringChecks(def) {
|
|
5690
|
+
const checks = getChecks(def);
|
|
5691
|
+
const out = {};
|
|
5692
|
+
const min = findCheck(checks, "min_length");
|
|
5693
|
+
if (min && typeof min.minimum === "number") out.minLength = min.minimum;
|
|
5694
|
+
const max = findCheck(checks, "max_length");
|
|
5695
|
+
if (max && typeof max.maximum === "number") out.maxLength = max.maximum;
|
|
5696
|
+
return out;
|
|
5697
|
+
}
|
|
5698
|
+
const integerNumberFormats = new Set([
|
|
5699
|
+
"int",
|
|
5700
|
+
"safeint",
|
|
5701
|
+
"int32",
|
|
5702
|
+
"uint32",
|
|
5703
|
+
"int64",
|
|
5704
|
+
"uint64"
|
|
5705
|
+
]);
|
|
5706
|
+
function readNumberChecks(def) {
|
|
5707
|
+
const checks = getChecks(def);
|
|
5708
|
+
const out = {};
|
|
5709
|
+
const gt = findCheck(checks, "greater_than");
|
|
5710
|
+
if (gt && typeof gt.value === "number" && gt.inclusive === true) out.min = gt.value;
|
|
5711
|
+
const lt = findCheck(checks, "less_than");
|
|
5712
|
+
if (lt && typeof lt.value === "number" && lt.inclusive === true) out.max = lt.value;
|
|
5713
|
+
const format = findCheck(checks, "number_format");
|
|
5714
|
+
if (format && typeof format.format === "string" && integerNumberFormats.has(format.format)) out.integer = true;
|
|
5715
|
+
return out;
|
|
5716
|
+
}
|
|
5717
|
+
function buildField(key, fieldSchema, override) {
|
|
5718
|
+
const unwrapped = unwrap(fieldSchema);
|
|
5719
|
+
if (!unwrapped) return Result.err(/* @__PURE__ */ new Error(`manualInput: field "${key}" uses an unsupported Zod schema (could not introspect)`));
|
|
5720
|
+
const inner = unwrapped.def;
|
|
5721
|
+
const description = override?.description ?? getDescription(unwrapped.schema);
|
|
5722
|
+
const base = {
|
|
5723
|
+
key,
|
|
5724
|
+
label: override?.label ?? humaniseKey(key),
|
|
5725
|
+
description,
|
|
5726
|
+
placeholder: override?.placeholder,
|
|
5727
|
+
required: unwrapped.required,
|
|
5728
|
+
defaultValue: override?.defaultValue !== void 0 ? override.defaultValue : unwrapped.defaultValue
|
|
5729
|
+
};
|
|
5730
|
+
if (override?.asJson === true) {
|
|
5731
|
+
const rows = override.rows;
|
|
5732
|
+
return Result.ok({
|
|
5733
|
+
...base,
|
|
5734
|
+
kind: "json",
|
|
5735
|
+
rows
|
|
5736
|
+
});
|
|
5737
|
+
}
|
|
5738
|
+
if (override?.asFile === true) return Result.ok({
|
|
5739
|
+
...base,
|
|
5740
|
+
kind: "file",
|
|
5741
|
+
accept: override.accept,
|
|
5742
|
+
maxSizeBytes: override.maxSizeBytes
|
|
5743
|
+
});
|
|
5744
|
+
const overrideOptions = normaliseSelectOptions(override?.options);
|
|
5745
|
+
if (overrideOptions) return Result.ok({
|
|
5746
|
+
...base,
|
|
5747
|
+
kind: "select",
|
|
5748
|
+
options: overrideOptions
|
|
5749
|
+
});
|
|
5750
|
+
switch (inner.type) {
|
|
5751
|
+
case "string": {
|
|
5752
|
+
const checks = readStringChecks(inner);
|
|
5753
|
+
if (override?.multiline === true) return Result.ok({
|
|
5754
|
+
...base,
|
|
5755
|
+
kind: "multiline",
|
|
5756
|
+
rows: override.rows,
|
|
5757
|
+
minLength: checks.minLength,
|
|
5758
|
+
maxLength: checks.maxLength
|
|
5759
|
+
});
|
|
5760
|
+
return Result.ok({
|
|
5761
|
+
...base,
|
|
5762
|
+
kind: "text",
|
|
5763
|
+
minLength: checks.minLength,
|
|
5764
|
+
maxLength: checks.maxLength
|
|
5765
|
+
});
|
|
5766
|
+
}
|
|
5767
|
+
case "number":
|
|
5768
|
+
case "int":
|
|
5769
|
+
case "bigint": {
|
|
5770
|
+
const checks = readNumberChecks(inner);
|
|
5771
|
+
return Result.ok({
|
|
5772
|
+
...base,
|
|
5773
|
+
kind: "number",
|
|
5774
|
+
min: checks.min,
|
|
5775
|
+
max: checks.max,
|
|
5776
|
+
integer: checks.integer
|
|
5777
|
+
});
|
|
5778
|
+
}
|
|
5779
|
+
case "boolean": return Result.ok({
|
|
5780
|
+
...base,
|
|
5781
|
+
kind: "boolean"
|
|
5782
|
+
});
|
|
5783
|
+
case "enum": {
|
|
5784
|
+
const options = enumOptionsFromEntries(inner);
|
|
5785
|
+
if (options) return Result.ok({
|
|
5786
|
+
...base,
|
|
5787
|
+
kind: "select",
|
|
5788
|
+
options
|
|
5789
|
+
});
|
|
5790
|
+
return Result.ok({
|
|
5791
|
+
...base,
|
|
5792
|
+
kind: "json",
|
|
5793
|
+
rows: override?.rows
|
|
5794
|
+
});
|
|
5795
|
+
}
|
|
5796
|
+
case "literal": {
|
|
5797
|
+
const options = literalSelectOptions(inner);
|
|
5798
|
+
if (options && options.length > 0) return Result.ok({
|
|
5799
|
+
...base,
|
|
5800
|
+
kind: "select",
|
|
5801
|
+
options
|
|
5802
|
+
});
|
|
5803
|
+
return Result.ok({
|
|
5804
|
+
...base,
|
|
5805
|
+
kind: "json",
|
|
5806
|
+
rows: override?.rows
|
|
5807
|
+
});
|
|
5808
|
+
}
|
|
5809
|
+
case "union": {
|
|
5810
|
+
const options = literalUnionOptions(inner);
|
|
5811
|
+
if (options) return Result.ok({
|
|
5812
|
+
...base,
|
|
5813
|
+
kind: "select",
|
|
5814
|
+
options
|
|
5815
|
+
});
|
|
5816
|
+
return Result.ok({
|
|
5817
|
+
...base,
|
|
5818
|
+
kind: "json",
|
|
5819
|
+
rows: override?.rows
|
|
5820
|
+
});
|
|
5821
|
+
}
|
|
5822
|
+
default: return Result.ok({
|
|
5823
|
+
...base,
|
|
5824
|
+
kind: "json",
|
|
5825
|
+
rows: override?.rows
|
|
5826
|
+
});
|
|
5827
|
+
}
|
|
5828
|
+
}
|
|
5829
|
+
function getObjectShape(schema) {
|
|
5830
|
+
const def = getZodDef(schema);
|
|
5831
|
+
if (!def) return null;
|
|
5832
|
+
if (def.type !== "object") return null;
|
|
5833
|
+
const shape = def.shape;
|
|
5834
|
+
if (!isObject(shape)) return null;
|
|
5835
|
+
return shape;
|
|
5836
|
+
}
|
|
5837
|
+
/**
|
|
5838
|
+
* Walk an eval's `manualInput` configuration and produce the wire-format
|
|
5839
|
+
* descriptor consumed by the web UI. The schema must resolve to a top-level
|
|
5840
|
+
* `z.object(...)`; nested objects, arrays, unions, and other unsupported
|
|
5841
|
+
* shapes inside fields fall back to the JSON textarea widget.
|
|
5842
|
+
*
|
|
5843
|
+
* Returns a `Result` so the caller (eval discovery) can surface a discovery
|
|
5844
|
+
* issue without throwing when the schema is incompatible.
|
|
5845
|
+
*/
|
|
5846
|
+
function buildManualInputDescriptor(config) {
|
|
5847
|
+
const shape = getObjectShape(config.schema);
|
|
5848
|
+
if (!shape) return Result.err(/* @__PURE__ */ new Error("manualInput.schema must be a top-level z.object(...). Wrap nested types in an object schema."));
|
|
5849
|
+
const overrides = {};
|
|
5850
|
+
const rawOverrides = config.fields;
|
|
5851
|
+
if (rawOverrides) {
|
|
5852
|
+
for (const [key, override] of Object.entries(rawOverrides)) if (override) overrides[key] = override;
|
|
5853
|
+
}
|
|
5854
|
+
const fields = [];
|
|
5855
|
+
for (const [key, fieldSchema] of Object.entries(shape)) {
|
|
5856
|
+
const fieldResult = buildField(key, fieldSchema, overrides[key]);
|
|
5857
|
+
if (fieldResult.error) return fieldResult.errorResult();
|
|
5858
|
+
fields.push(fieldResult.value);
|
|
5859
|
+
}
|
|
5860
|
+
return Result.ok({
|
|
5861
|
+
title: config.title,
|
|
5862
|
+
description: config.description,
|
|
5863
|
+
submitLabel: config.submitLabel,
|
|
5864
|
+
fields
|
|
5865
|
+
});
|
|
5866
|
+
}
|
|
5867
|
+
/**
|
|
5868
|
+
* Resolve an eval's `manualInput` Zod schema against a raw user submission.
|
|
5869
|
+
* Returns the parsed value typed against the eval's `TInput` generic, or a
|
|
5870
|
+
* structured `Error` carrying the Zod issues for the caller to surface.
|
|
5871
|
+
*/
|
|
5872
|
+
function parseManualInputValues(config, raw) {
|
|
5873
|
+
const parsed = config.schema.safeParse(raw);
|
|
5874
|
+
if (!parsed.success) return Result.err(new ManualInputValidationError(parsed.error.issues.map(formatIssue)));
|
|
5875
|
+
return Result.ok(parsed.data);
|
|
5876
|
+
}
|
|
5877
|
+
/**
|
|
5878
|
+
* Error thrown / returned when manual-input values fail validation against
|
|
5879
|
+
* the eval's `manualInput.schema`. Carries the structured Zod issues so the
|
|
5880
|
+
* CLI and HTTP layers can surface them per-field.
|
|
5881
|
+
*/
|
|
5882
|
+
var ManualInputValidationError = class extends Error {
|
|
5883
|
+
issues;
|
|
5884
|
+
constructor(issues) {
|
|
5885
|
+
super(issues.length === 0 ? "manualInput validation failed" : `manualInput validation failed: ${issues.map((issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message).join("; ")}`);
|
|
5886
|
+
this.name = "ManualInputValidationError";
|
|
5887
|
+
this.issues = issues;
|
|
5888
|
+
}
|
|
5889
|
+
};
|
|
5890
|
+
function formatIssue(issue) {
|
|
5891
|
+
return {
|
|
5892
|
+
path: issue.path.map((segment) => typeof segment === "string" || typeof segment === "number" ? String(segment) : "").filter((segment) => segment !== "").join("."),
|
|
5893
|
+
message: issue.message
|
|
5894
|
+
};
|
|
5895
|
+
}
|
|
5896
|
+
//#endregion
|
|
5147
5897
|
//#region ../runner/src/outputArtifacts.ts
|
|
5148
5898
|
const mimeTypeExtensionMap = {
|
|
5149
5899
|
"application/json": ".json",
|
|
@@ -5227,6 +5977,65 @@ function isFile(value) {
|
|
|
5227
5977
|
return value instanceof File;
|
|
5228
5978
|
}
|
|
5229
5979
|
//#endregion
|
|
5980
|
+
//#region ../runner/src/traceDisplay.ts
|
|
5981
|
+
function isRecord$1(value) {
|
|
5982
|
+
return typeof value === "object" && value !== null;
|
|
5983
|
+
}
|
|
5984
|
+
function mergeNestedAttribute(value, path, attributeValue) {
|
|
5985
|
+
const root = value === void 0 ? {} : { ...value };
|
|
5986
|
+
const parts = path.split(".");
|
|
5987
|
+
let current = root;
|
|
5988
|
+
for (const [index, part] of parts.entries()) {
|
|
5989
|
+
if (index === parts.length - 1) {
|
|
5990
|
+
current[part] = attributeValue;
|
|
5991
|
+
continue;
|
|
5992
|
+
}
|
|
5993
|
+
const nextValue = current[part];
|
|
5994
|
+
const nextRecord = isRecord$1(nextValue) ? { ...nextValue } : {};
|
|
5995
|
+
current[part] = nextRecord;
|
|
5996
|
+
current = nextRecord;
|
|
5997
|
+
}
|
|
5998
|
+
return root;
|
|
5999
|
+
}
|
|
6000
|
+
function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
|
|
6001
|
+
const merged = /* @__PURE__ */ new Map();
|
|
6002
|
+
for (const attribute of globalTraceDisplay?.attributes ?? []) merged.set(attribute.key ?? attribute.path, attribute);
|
|
6003
|
+
for (const attribute of evalTraceDisplay?.attributes ?? []) merged.set(attribute.key ?? attribute.path, attribute);
|
|
6004
|
+
const resolvedAttributes = [];
|
|
6005
|
+
const transformedTrace = spans.map((span) => ({
|
|
6006
|
+
...span,
|
|
6007
|
+
attributes: span.attributes === void 0 ? void 0 : { ...span.attributes }
|
|
6008
|
+
}));
|
|
6009
|
+
for (const attribute of merged.values()) {
|
|
6010
|
+
const resolvedPath = attribute.transform ? `__display.${attribute.key ?? attribute.path}` : attribute.path;
|
|
6011
|
+
resolvedAttributes.push({
|
|
6012
|
+
key: attribute.key,
|
|
6013
|
+
path: resolvedPath,
|
|
6014
|
+
label: attribute.label,
|
|
6015
|
+
format: attribute.format,
|
|
6016
|
+
numberFormat: attribute.numberFormat,
|
|
6017
|
+
placements: attribute.placements,
|
|
6018
|
+
scope: attribute.scope,
|
|
6019
|
+
mode: attribute.mode
|
|
6020
|
+
});
|
|
6021
|
+
if (!attribute.transform) continue;
|
|
6022
|
+
for (const span of transformedTrace) {
|
|
6023
|
+
const sourceValue = getNestedAttribute(span.attributes, attribute.path);
|
|
6024
|
+
if (sourceValue === void 0) continue;
|
|
6025
|
+
const transformedValue = attribute.transform({
|
|
6026
|
+
value: sourceValue,
|
|
6027
|
+
span
|
|
6028
|
+
});
|
|
6029
|
+
if (transformedValue === void 0) continue;
|
|
6030
|
+
span.attributes = mergeNestedAttribute(span.attributes, resolvedPath, transformedValue);
|
|
6031
|
+
}
|
|
6032
|
+
}
|
|
6033
|
+
return {
|
|
6034
|
+
trace: transformedTrace,
|
|
6035
|
+
traceDisplay: { attributes: resolvedAttributes }
|
|
6036
|
+
};
|
|
6037
|
+
}
|
|
6038
|
+
//#endregion
|
|
5230
6039
|
//#region ../runner/src/runMaintenance.ts
|
|
5231
6040
|
async function persistRunState(runState) {
|
|
5232
6041
|
await writeFile(join(runState.runDir, "summary.json"), JSON.stringify(runState.summary, null, 2));
|
|
@@ -5551,65 +6360,6 @@ function stripTerminalControlCodes(value) {
|
|
|
5551
6360
|
return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
|
|
5552
6361
|
}
|
|
5553
6362
|
//#endregion
|
|
5554
|
-
//#region ../runner/src/traceDisplay.ts
|
|
5555
|
-
function isRecord$1(value) {
|
|
5556
|
-
return typeof value === "object" && value !== null;
|
|
5557
|
-
}
|
|
5558
|
-
function mergeNestedAttribute(value, path, attributeValue) {
|
|
5559
|
-
const root = value === void 0 ? {} : { ...value };
|
|
5560
|
-
const parts = path.split(".");
|
|
5561
|
-
let current = root;
|
|
5562
|
-
for (const [index, part] of parts.entries()) {
|
|
5563
|
-
if (index === parts.length - 1) {
|
|
5564
|
-
current[part] = attributeValue;
|
|
5565
|
-
continue;
|
|
5566
|
-
}
|
|
5567
|
-
const nextValue = current[part];
|
|
5568
|
-
const nextRecord = isRecord$1(nextValue) ? { ...nextValue } : {};
|
|
5569
|
-
current[part] = nextRecord;
|
|
5570
|
-
current = nextRecord;
|
|
5571
|
-
}
|
|
5572
|
-
return root;
|
|
5573
|
-
}
|
|
5574
|
-
function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
|
|
5575
|
-
const merged = /* @__PURE__ */ new Map();
|
|
5576
|
-
for (const attribute of globalTraceDisplay?.attributes ?? []) merged.set(attribute.key ?? attribute.path, attribute);
|
|
5577
|
-
for (const attribute of evalTraceDisplay?.attributes ?? []) merged.set(attribute.key ?? attribute.path, attribute);
|
|
5578
|
-
const resolvedAttributes = [];
|
|
5579
|
-
const transformedTrace = spans.map((span) => ({
|
|
5580
|
-
...span,
|
|
5581
|
-
attributes: span.attributes === void 0 ? void 0 : { ...span.attributes }
|
|
5582
|
-
}));
|
|
5583
|
-
for (const attribute of merged.values()) {
|
|
5584
|
-
const resolvedPath = attribute.transform ? `__display.${attribute.key ?? attribute.path}` : attribute.path;
|
|
5585
|
-
resolvedAttributes.push({
|
|
5586
|
-
key: attribute.key,
|
|
5587
|
-
path: resolvedPath,
|
|
5588
|
-
label: attribute.label,
|
|
5589
|
-
format: attribute.format,
|
|
5590
|
-
numberFormat: attribute.numberFormat,
|
|
5591
|
-
placements: attribute.placements,
|
|
5592
|
-
scope: attribute.scope,
|
|
5593
|
-
mode: attribute.mode
|
|
5594
|
-
});
|
|
5595
|
-
if (!attribute.transform) continue;
|
|
5596
|
-
for (const span of transformedTrace) {
|
|
5597
|
-
const sourceValue = getNestedAttribute(span.attributes, attribute.path);
|
|
5598
|
-
if (sourceValue === void 0) continue;
|
|
5599
|
-
const transformedValue = attribute.transform({
|
|
5600
|
-
value: sourceValue,
|
|
5601
|
-
span
|
|
5602
|
-
});
|
|
5603
|
-
if (transformedValue === void 0) continue;
|
|
5604
|
-
span.attributes = mergeNestedAttribute(span.attributes, resolvedPath, transformedValue);
|
|
5605
|
-
}
|
|
5606
|
-
}
|
|
5607
|
-
return {
|
|
5608
|
-
trace: transformedTrace,
|
|
5609
|
-
traceDisplay: { attributes: resolvedAttributes }
|
|
5610
|
-
};
|
|
5611
|
-
}
|
|
5612
|
-
//#endregion
|
|
5613
6363
|
//#region ../runner/src/runExecution.ts
|
|
5614
6364
|
function filterEvalCases(cases, caseIds) {
|
|
5615
6365
|
if (!caseIds || caseIds.length === 0) return cases;
|
|
@@ -5639,8 +6389,54 @@ function buildScopedEvalIdPrefix(params) {
|
|
|
5639
6389
|
async function callWithUnknownResult(fn, args) {
|
|
5640
6390
|
return await Reflect.apply(fn, void 0, args);
|
|
5641
6391
|
}
|
|
6392
|
+
async function callUnknownFunction(fn, args) {
|
|
6393
|
+
if (typeof fn !== "function") throw new Error("Expected a function");
|
|
6394
|
+
return await Reflect.apply(fn, void 0, args);
|
|
6395
|
+
}
|
|
6396
|
+
function assignDerivedOutputs(params) {
|
|
6397
|
+
for (const [key, value] of Object.entries(params.derived)) {
|
|
6398
|
+
if (key in params.outputs) continue;
|
|
6399
|
+
params.outputs[key] = value;
|
|
6400
|
+
}
|
|
6401
|
+
}
|
|
6402
|
+
async function resolveDeriveFromTracingConfig(params) {
|
|
6403
|
+
const ctx = {
|
|
6404
|
+
trace: params.traceTree,
|
|
6405
|
+
input: params.evalCase.input,
|
|
6406
|
+
case: params.evalCase
|
|
6407
|
+
};
|
|
6408
|
+
if (typeof params.deriveFromTracing === "function") {
|
|
6409
|
+
const derived = await callUnknownFunction(params.deriveFromTracing, [ctx]);
|
|
6410
|
+
if (!isRecord(derived)) throw new Error("deriveFromTracing must return an object");
|
|
6411
|
+
return derived;
|
|
6412
|
+
}
|
|
6413
|
+
const derived = {};
|
|
6414
|
+
for (const [key, compute] of Object.entries(params.deriveFromTracing)) {
|
|
6415
|
+
const value = await callUnknownFunction(compute, [ctx]);
|
|
6416
|
+
if (value !== void 0) derived[key] = value;
|
|
6417
|
+
}
|
|
6418
|
+
return derived;
|
|
6419
|
+
}
|
|
6420
|
+
async function runDeriveFromTracingConfig(params) {
|
|
6421
|
+
if (params.deriveFromTracing === void 0) return;
|
|
6422
|
+
const { deriveFromTracing } = params;
|
|
6423
|
+
try {
|
|
6424
|
+
const derived = await runInExistingEvalScope(params.scope, "derive", async () => await resolveDeriveFromTracingConfig({
|
|
6425
|
+
deriveFromTracing,
|
|
6426
|
+
traceTree: params.traceTree,
|
|
6427
|
+
evalCase: params.evalCase
|
|
6428
|
+
}));
|
|
6429
|
+
assignDerivedOutputs({
|
|
6430
|
+
outputs: params.scope.outputs,
|
|
6431
|
+
derived
|
|
6432
|
+
});
|
|
6433
|
+
} catch (e) {
|
|
6434
|
+
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
6435
|
+
params.scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
6436
|
+
}
|
|
6437
|
+
}
|
|
5642
6438
|
async function runCase(params) {
|
|
5643
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode,
|
|
6439
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5644
6440
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
5645
6441
|
evalId,
|
|
5646
6442
|
evalFilePath,
|
|
@@ -5672,7 +6468,8 @@ async function runCase(params) {
|
|
|
5672
6468
|
adapter: cacheAdapter,
|
|
5673
6469
|
mode: cacheMode,
|
|
5674
6470
|
evalId,
|
|
5675
|
-
|
|
6471
|
+
read: evalDef.cache?.read,
|
|
6472
|
+
store: evalDef.cache?.store
|
|
5676
6473
|
} : void 0,
|
|
5677
6474
|
startTime: evalDef.startTime,
|
|
5678
6475
|
freezeTime: evalDef.freezeTime
|
|
@@ -5685,22 +6482,19 @@ async function runCase(params) {
|
|
|
5685
6482
|
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
5686
6483
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5687
6484
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5688
|
-
if (!nonAssertError
|
|
5689
|
-
|
|
5690
|
-
|
|
5691
|
-
|
|
5692
|
-
|
|
5693
|
-
|
|
5694
|
-
|
|
5695
|
-
|
|
5696
|
-
|
|
5697
|
-
|
|
5698
|
-
|
|
5699
|
-
|
|
5700
|
-
}
|
|
5701
|
-
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
5702
|
-
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
5703
|
-
}
|
|
6485
|
+
if (!nonAssertError) {
|
|
6486
|
+
await runDeriveFromTracingConfig({
|
|
6487
|
+
deriveFromTracing: globalDeriveFromTracing,
|
|
6488
|
+
scope,
|
|
6489
|
+
traceTree,
|
|
6490
|
+
evalCase
|
|
6491
|
+
});
|
|
6492
|
+
await runDeriveFromTracingConfig({
|
|
6493
|
+
deriveFromTracing: evalDef.deriveFromTracing,
|
|
6494
|
+
scope,
|
|
6495
|
+
traceTree,
|
|
6496
|
+
evalCase
|
|
6497
|
+
});
|
|
5704
6498
|
}
|
|
5705
6499
|
if (!nonAssertError) addDefaultOutputs({
|
|
5706
6500
|
outputs: scope.outputs,
|
|
@@ -5717,7 +6511,7 @@ async function runCase(params) {
|
|
|
5717
6511
|
...scope.outputs,
|
|
5718
6512
|
...parsedOutputs.data
|
|
5719
6513
|
};
|
|
5720
|
-
else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error)));
|
|
6514
|
+
else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
|
|
5721
6515
|
}
|
|
5722
6516
|
const scoreResults = /* @__PURE__ */ new Map();
|
|
5723
6517
|
const scoringTraces = {};
|
|
@@ -5740,7 +6534,8 @@ async function runCase(params) {
|
|
|
5740
6534
|
adapter: cacheAdapter,
|
|
5741
6535
|
mode: cacheMode,
|
|
5742
6536
|
evalId: `${evalId}__score__${key}`,
|
|
5743
|
-
|
|
6537
|
+
read: evalDef.cache?.read,
|
|
6538
|
+
store: evalDef.cache?.store
|
|
5744
6539
|
} : void 0,
|
|
5745
6540
|
startTime: scoreStartTime,
|
|
5746
6541
|
freezeTime: evalDef.freezeTime
|
|
@@ -5795,6 +6590,7 @@ async function runCase(params) {
|
|
|
5795
6590
|
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
|
|
5796
6591
|
const columns = {};
|
|
5797
6592
|
const columnOverrides = mergeDefaultColumns({
|
|
6593
|
+
globalColumns,
|
|
5798
6594
|
columns: evalDef.columns,
|
|
5799
6595
|
globalRemove: globalRemoveDefaultConfig,
|
|
5800
6596
|
evalRemove: evalDef.removeDefaultConfig
|
|
@@ -5858,14 +6654,17 @@ function formatOutputsSchemaError(error) {
|
|
|
5858
6654
|
const issueLines = error.issues.map((issue) => {
|
|
5859
6655
|
return `${issue.path.length > 0 ? issue.path.join(".") : "<root>"}: ${issue.message}`;
|
|
5860
6656
|
});
|
|
5861
|
-
if (issueLines.length === 0) return "
|
|
5862
|
-
return
|
|
6657
|
+
if (issueLines.length === 0) return "outputs did not match the configured schema";
|
|
6658
|
+
return issueLines.join("\n");
|
|
5863
6659
|
}
|
|
5864
|
-
function toAssertionFailure(message, error = void 0) {
|
|
5865
|
-
|
|
6660
|
+
function toAssertionFailure(message, error = void 0, nameOverride = void 0) {
|
|
6661
|
+
const name = nameOverride ?? error?.name;
|
|
6662
|
+
const stack = error?.stack ? stripTerminalControlCodes(error.stack) : void 0;
|
|
6663
|
+
return {
|
|
6664
|
+
...name !== void 0 ? { name } : {},
|
|
5866
6665
|
message,
|
|
5867
|
-
stack:
|
|
5868
|
-
}
|
|
6666
|
+
...stack !== void 0 ? { stack } : {}
|
|
6667
|
+
};
|
|
5869
6668
|
}
|
|
5870
6669
|
//#endregion
|
|
5871
6670
|
//#region ../runner/src/runQueue.ts
|
|
@@ -6095,15 +6894,15 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6095
6894
|
const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
|
|
6096
6895
|
for (const evalMeta of targetEvals) {
|
|
6097
6896
|
const evalFilePath = evalMeta.sourceFilePath;
|
|
6098
|
-
let
|
|
6897
|
+
let sourceFingerprint = "";
|
|
6099
6898
|
try {
|
|
6100
|
-
|
|
6899
|
+
sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
|
|
6101
6900
|
} catch {
|
|
6102
|
-
|
|
6901
|
+
sourceFingerprint = "";
|
|
6103
6902
|
}
|
|
6104
|
-
if (
|
|
6105
|
-
runState.manifest.evalSourceFingerprints[evalMeta.key] =
|
|
6106
|
-
evalMeta.sourceFingerprint =
|
|
6903
|
+
if (sourceFingerprint.length > 0) {
|
|
6904
|
+
runState.manifest.evalSourceFingerprints[evalMeta.key] = sourceFingerprint;
|
|
6905
|
+
evalMeta.sourceFingerprint = sourceFingerprint;
|
|
6107
6906
|
} else {
|
|
6108
6907
|
delete runState.manifest.evalSourceFingerprints[evalMeta.key];
|
|
6109
6908
|
evalMeta.sourceFingerprint = null;
|
|
@@ -6112,7 +6911,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6112
6911
|
const registry = getEvalRegistry();
|
|
6113
6912
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
6114
6913
|
await runInEvalRuntimeScope("env", async () => {
|
|
6115
|
-
await loadEvalModule(evalFilePath,
|
|
6914
|
+
await loadEvalModule(evalFilePath, sourceFingerprint);
|
|
6116
6915
|
});
|
|
6117
6916
|
});
|
|
6118
6917
|
const entry = registry.get(evalMeta.id);
|
|
@@ -6126,8 +6925,24 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6126
6925
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
6127
6926
|
await runInEvalRuntimeScope("cases", async () => {
|
|
6128
6927
|
await entry.use(async (evalDef) => {
|
|
6129
|
-
|
|
6130
|
-
|
|
6928
|
+
if (evalDef.manualInput && evalDef.cases !== void 0) throw new Error(`Eval "${evalMeta.id}" cannot declare both "cases" and "manualInput". Remove one of them.`);
|
|
6929
|
+
let manualInputCase = null;
|
|
6930
|
+
if (evalDef.manualInput) {
|
|
6931
|
+
const rawValue = request.manualInputs?.[evalMeta.key];
|
|
6932
|
+
if (rawValue === void 0) throw new Error(`Eval "${evalMeta.id}" requires manual input. Provide it via the run modal in the web UI or "--input" / "--input-file" on the CLI.`);
|
|
6933
|
+
const parsed = parseManualInputValues(evalDef.manualInput, rawValue);
|
|
6934
|
+
if (parsed.error) {
|
|
6935
|
+
const formatted = parsed.error.issues.map((issue) => issue.path ? `${issue.path}: ${issue.message}` : issue.message).join("; ");
|
|
6936
|
+
throw new Error(`Invalid manual input for eval "${evalMeta.id}": ${formatted}`);
|
|
6937
|
+
}
|
|
6938
|
+
manualInputCase = {
|
|
6939
|
+
id: `${evalMeta.id}-manual`,
|
|
6940
|
+
input: parsed.value
|
|
6941
|
+
};
|
|
6942
|
+
}
|
|
6943
|
+
const evalCases = manualInputCase ? [manualInputCase] : await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime });
|
|
6944
|
+
const runnableCases = manualInputCase ? evalCases : resolveRunnableEvalCases({
|
|
6945
|
+
cases: evalCases,
|
|
6131
6946
|
evalId: evalMeta.id
|
|
6132
6947
|
});
|
|
6133
6948
|
const duplicateCaseIds = findDuplicateCaseIds(runnableCases);
|
|
@@ -6136,6 +6951,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6136
6951
|
runState.summary.totalCases += cases.length;
|
|
6137
6952
|
const defaultConfig = resolveEvalDefaultConfig({
|
|
6138
6953
|
evalDef,
|
|
6954
|
+
globalColumns: config.columns,
|
|
6955
|
+
globalStats: config.stats,
|
|
6139
6956
|
globalRemove: config.removeDefaultConfig
|
|
6140
6957
|
});
|
|
6141
6958
|
const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
@@ -6181,6 +6998,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6181
6998
|
evalKey: evalMeta.key,
|
|
6182
6999
|
evalCase,
|
|
6183
7000
|
globalTraceDisplay,
|
|
7001
|
+
globalColumns: config.columns,
|
|
7002
|
+
globalDeriveFromTracing: config.deriveFromTracing,
|
|
6184
7003
|
llmCallsConfig,
|
|
6185
7004
|
apiCallsConfig,
|
|
6186
7005
|
globalRemoveDefaultConfig: config.removeDefaultConfig,
|
|
@@ -6188,7 +7007,6 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6188
7007
|
startTime,
|
|
6189
7008
|
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
6190
7009
|
cacheMode,
|
|
6191
|
-
codeFingerprint,
|
|
6192
7010
|
moduleIsolation,
|
|
6193
7011
|
evalFilePath,
|
|
6194
7012
|
evalFileRelativePath: evalMeta.filePath,
|
|
@@ -6343,4 +7161,4 @@ function toLastRunStatus(status) {
|
|
|
6343
7161
|
return status === "pending" ? null : status;
|
|
6344
7162
|
}
|
|
6345
7163
|
//#endregion
|
|
6346
|
-
export {
|
|
7164
|
+
export { defaultConfigKeySchema as $, incrementEvalOutput as $n, cacheEntryWithDebugKeySchema as $t, createRunRequestSchema as A, buildTraceTree as An, runLogPhaseSchema as At, getEvalDisplayStatus as B, repoFile as Bn, manualInputTextFieldSchema as Bt, loadConfig as C, columnKindSchema as Cn, evalStatAggregateSchema as Ct, createFsCacheStore as D, repoFileRefSchema as Dn, runLogEntrySchema as Dt, validateCharts as E, numberDisplayOptionsSchema as En, evalSummarySchema as Et, extractApiCalls as F, hashCacheKeySync as Fn, manualInputJsonFieldSchema as Ft, runSummarySchema as G, advanceEvalTime as Gn, evalChartConfigSchema as Gt, deriveStatusFromCaseRows as H, readManualInputFile as Hn, evalChartAxisSchema as Ht, extractLlmCalls as I, deserializeCacheRecording as In, manualInputMultilineFieldSchema as It, agentEvalsConfigSchema as J, evalAssert as Jn, evalChartTypeSchema as Jt, DEFAULT_API_CALLS_CONFIG as K, appendToEvalOutput as Kn, evalChartMetricSchema as Kt, applyDerivedCallAttributes as L, deserializeCacheValue as Ln, manualInputNumberFieldSchema as Lt, sseEnvelopeSchema as M, evalSpan as Mn, manualInputBooleanFieldSchema as Mt, extractCacheEntries as N, evalTracer as Nn, manualInputDescriptorSchema as Nt, configReloadStateSchema as O, runArtifactRefSchema as On, runLogLevelSchema as Ot, extractCacheHits as P, hashCacheKey as Pn, manualInputFieldDescriptorSchema as Pt, apiCallsConfigSchema as Q, getEvalStartTime as Qn, cacheEntrySchema as Qt, getNestedAttribute as R, serializeCacheRecording as Rn, manualInputSelectFieldSchema as Rt, resolveEvalDefaultConfig as S, columnFormatSchema as Sn, evalFreshnessStatusSchema as St, normalizeScoreDef as T, jsonCellSchema as Tn, evalStatsConfigSchema as Tt, deriveStatusFromChildStatuses as U, evalExpect as Un, evalChartBuiltinMetricSchema as Ut, deriveScopedSummaryFromCases as V, manualInputFileValueSchema as Vn, evalChartAggregateSchema as Vt, runManifestSchema as W, EvalAssertionError as Wn, evalChartColorSchema as Wt, apiCallMetricPlacementSchema as X, getCurrentScope as Xn, cacheDebugKeyEntrySchema as Xt, apiCallMetricFormatSchema as Y, evalLog as Yn, evalChartsConfigSchema as Yt, apiCallMetricSchema as Z, getEvalCaseInput as Zn, cacheDebugKeyFileSchema as Zt, buildManualInputDescriptor as _, traceSpanKindSchema as _n, getCaseRowEvalKey as _t, getLastRunStatuses as a, cacheRecordingSchema as an, runInExistingEvalScope as ar, llmCallMetricSchema as at, loadEvalModule as b, cellValueSchema as bn, caseRowSchema as bt, loadPersistedRunSnapshots as c, spanCacheOptionsSchema as cn, startEvalBackgroundJob as cr, llmCallsConfigSchema as ct, persistRunState as d, traceAttributeDisplayInputSchema as dn, resolveLlmCallsConfig as dt, cacheFileSchema as en, isInEvalScope as er, evalColumnOverrideSchema as et, recomputeEvalStatusesInRuns as f, traceAttributeDisplayPlacementSchema as fn, runLogsConfigSchema as ft, resolveArtifactPath as g, traceSpanErrorSchema as gn, getCaseRowCaseKey as gt, resolveTracePresentation as h, traceDisplayInputConfigSchema as hn, buildEvalKey as ht, generateRunId as i, cacheRecordingOpSchema as in, runInEvalScope as ir, llmCallMetricPlacementSchema as it, updateManualScoreRequestSchema as j, captureEvalSpanError as jn, scoreTraceSchema as jt, configReloadStatusSchema as k, z$1 as kn, runLogLocationSchema as kt, nextShortIdFromSnapshots as l, traceCacheRefSchema as ln, defineEval as lr, removeDefaultConfigSchema as lt, runTouchesEval as m, traceDisplayConfigSchema as mn, buildCaseKey as mt, getTargetEvalKeys as n, cacheModeSchema as nn, nextEvalId as nr, evalDeriveConfigSchema as nt, getLatestRunInfos as o, cacheStatusSchema as on, setEvalOutput as or, llmCallPricingRateSchema as ot, recomputePersistedCaseStatus as p, traceAttributeDisplaySchema as pn, trialSelectionModeSchema as pt, DEFAULT_LLM_CALLS_CONFIG as q, configureEvalRunLogs as qn, evalChartTooltipExtraSchema as qt, getTargetEvals as r, cacheOperationTypeSchema as rn, runInEvalRuntimeScope as rr, llmCallMetricFormatSchema as rt, loadPersistedRunSnapshot as s, serializedCacheSpanSchema as sn, setScopeCacheContext as sr, llmCallPricingSchema as st, executeRun as t, cacheListItemSchema as tn, mergeEvalOutput as tr, evalColumnsSchema as tt, persistCaseDetail as u, traceAttributeDisplayFormatSchema as un, getEvalRegistry as ur, resolveApiCallsConfig as ut, parseManualInputValues as v, traceSpanSchema as vn, assertionFailureSchema as vt, buildDeclaredColumnDefs as w, fileRefSchema as wn, evalStatItemSchema as wt, parseEvalDiscovery as x, columnDefSchema as xn, discoveryIssueSchema as xt, deriveEvalFreshness as y, traceSpanWarningSchema as yn, caseDetailSchema as yt, getEvalTitle as z, serializeCacheValue as zn, manualInputSelectOptionSchema as zt };
|