@ls-stack/agent-eval 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-ByMLOds2.mjs → app-mBbAN-Gt.mjs} +15 -3
- package/dist/apps/web/dist/assets/index-8VE7b6RK.css +1 -0
- package/dist/apps/web/dist/assets/index-Czer_MdN.js +118 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-DRkwWgTj.mjs → cli-BQwRbqsL.mjs} +75 -4
- package/dist/index.d.mts +342 -90
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-DB0dwGrd.mjs → runOrchestration-ClWYWPen.mjs} +446 -203
- package/dist/{runner-39KGoaM1.mjs → runner-BQn_xf36.mjs} +1 -1
- package/dist/{runner-DRINcaN_.mjs → runner-DbVB66h9.mjs} +2 -2
- package/dist/src-CuirVcPY.mjs +3 -0
- package/package.json +6 -4
- package/skills/agent-eval/SKILL.md +52 -20
- package/dist/apps/web/dist/assets/index-DOXT0Y9V.css +0 -1
- package/dist/apps/web/dist/assets/index-DR2haqvV.js +0 -118
- package/dist/bin.d.mts +0 -1
- package/dist/runChild.d.mts +0 -1
- package/dist/src-BwKm3sKU.mjs +0 -3
|
@@ -2,7 +2,7 @@ import { createRequire, registerHooks } from "node:module";
|
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
3
|
import { mkdir, readFile, readdir, rename, rm, stat, writeFile } from "node:fs/promises";
|
|
4
4
|
import { extname, isAbsolute, join, relative, resolve } from "node:path";
|
|
5
|
-
import { formatWithOptions, isDeepStrictEqual } from "node:util";
|
|
5
|
+
import { formatWithOptions, isDeepStrictEqual, stripVTControlCharacters } from "node:util";
|
|
6
6
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
7
7
|
import { z, z as z$1 } from "zod/v4";
|
|
8
8
|
import { Buffer as Buffer$1 } from "node:buffer";
|
|
@@ -29,6 +29,19 @@ function defineEval(definition) {
|
|
|
29
29
|
});
|
|
30
30
|
}
|
|
31
31
|
//#endregion
|
|
32
|
+
//#region ../sdk/src/stackFormatting.ts
|
|
33
|
+
const orphanedAnsiSgrPattern$1 = /\[(?:\d{1,3}(?:;\d{1,3})*)?m/g;
|
|
34
|
+
/**
|
|
35
|
+
* Remove terminal styling control codes from captured stack text.
|
|
36
|
+
*
|
|
37
|
+
* Some stack providers add ANSI SGR codes for terminal output. Persisted eval
|
|
38
|
+
* artifacts are rendered in the web UI, so stacks should be stored as plain
|
|
39
|
+
* text.
|
|
40
|
+
*/
|
|
41
|
+
function stripTerminalControlCodes$1(value) {
|
|
42
|
+
return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern$1, "");
|
|
43
|
+
}
|
|
44
|
+
//#endregion
|
|
32
45
|
//#region ../sdk/src/runtime.ts
|
|
33
46
|
const scopeStorage = new AsyncLocalStorage();
|
|
34
47
|
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
@@ -518,10 +531,13 @@ function recordOpIfActive(scope, op) {
|
|
|
518
531
|
if (top) top.ops.push(op);
|
|
519
532
|
}
|
|
520
533
|
function toAssertionFailure$1(message, error = void 0) {
|
|
521
|
-
|
|
534
|
+
const name = error?.name;
|
|
535
|
+
const stack = error?.stack ? stripTerminalControlCodes$1(error.stack) : void 0;
|
|
536
|
+
return {
|
|
537
|
+
...name !== void 0 ? { name } : {},
|
|
522
538
|
message,
|
|
523
|
-
stack:
|
|
524
|
-
}
|
|
539
|
+
...stack !== void 0 ? { stack } : {}
|
|
540
|
+
};
|
|
525
541
|
}
|
|
526
542
|
/**
|
|
527
543
|
* Record or replace an output value for the current case scope.
|
|
@@ -789,7 +805,8 @@ function repoFile(path, mimeType) {
|
|
|
789
805
|
}
|
|
790
806
|
//#endregion
|
|
791
807
|
//#region ../sdk/src/cacheSerialization.ts
|
|
792
|
-
const serializedCacheValueMarker = "
|
|
808
|
+
const serializedCacheValueMarker = "__aecs";
|
|
809
|
+
const legacySerializedCacheValueMarker = "__agentEvalsCacheSerialization";
|
|
793
810
|
const jsonSafeCacheValueVersion = "json-safe-v1";
|
|
794
811
|
const packedNumberArrayMinLength = 128;
|
|
795
812
|
const compressedStringMinBytes = 16 * 1024;
|
|
@@ -799,7 +816,7 @@ function isRecordLike$3(value) {
|
|
|
799
816
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
800
817
|
}
|
|
801
818
|
function isJsonSafeSerializedCacheValue(value) {
|
|
802
|
-
return isRecordLike$3(value) && value
|
|
819
|
+
return isRecordLike$3(value) && serializationMarkerValue(value) === jsonSafeCacheValueVersion && typeof value.type === "string";
|
|
803
820
|
}
|
|
804
821
|
function jsonSafeValue(type, value) {
|
|
805
822
|
return value === void 0 ? {
|
|
@@ -812,32 +829,39 @@ function jsonSafeValue(type, value) {
|
|
|
812
829
|
};
|
|
813
830
|
}
|
|
814
831
|
function hasSerializationMarkerKey(value) {
|
|
815
|
-
return Object.hasOwn(value, serializedCacheValueMarker);
|
|
832
|
+
return Object.hasOwn(value, serializedCacheValueMarker) || Object.hasOwn(value, legacySerializedCacheValueMarker);
|
|
833
|
+
}
|
|
834
|
+
function serializationMarkerValue(value) {
|
|
835
|
+
return value[serializedCacheValueMarker] ?? value[legacySerializedCacheValueMarker];
|
|
816
836
|
}
|
|
817
837
|
/**
|
|
818
838
|
* Serialize one cached value while keeping plain JSON as plain JSON.
|
|
819
839
|
*
|
|
820
|
-
* Rich runtime values use small tagged wrappers.
|
|
840
|
+
* Rich runtime values use small tagged wrappers. Undefined values are omitted
|
|
841
|
+
* by default; pass `preserveUndefined: true` to round-trip them explicitly.
|
|
821
842
|
*/
|
|
822
|
-
async function serializeCacheValue(value) {
|
|
823
|
-
return serializeJsonSafeValue(value, /* @__PURE__ */ new WeakSet(), 0);
|
|
843
|
+
async function serializeCacheValue(value, options = void 0) {
|
|
844
|
+
return serializeJsonSafeValue(value, /* @__PURE__ */ new WeakSet(), 0, normalizeCacheSerializationOptions(options));
|
|
824
845
|
}
|
|
825
846
|
/** Revive one cached value, while preserving legacy JSON-round-tripped data. */
|
|
826
847
|
function deserializeCacheValue(value) {
|
|
827
848
|
return deserializeJsonSafeValue(value);
|
|
828
849
|
}
|
|
829
850
|
/** Clone one value through the same serialization path used for cache data. */
|
|
830
|
-
async function cloneCacheValue(value) {
|
|
831
|
-
return deserializeCacheValue(await serializeCacheValue(value));
|
|
851
|
+
async function cloneCacheValue(value, options = void 0) {
|
|
852
|
+
return deserializeCacheValue(await serializeCacheValue(value, options));
|
|
853
|
+
}
|
|
854
|
+
function normalizeCacheSerializationOptions(options) {
|
|
855
|
+
return { preserveUndefined: options?.preserveUndefined === true };
|
|
832
856
|
}
|
|
833
|
-
async function serializeJsonSafeValue(value, refs, depth) {
|
|
834
|
-
if (value === void 0) return jsonSafeValue("Undefined");
|
|
857
|
+
async function serializeJsonSafeValue(value, refs, depth, config) {
|
|
858
|
+
if (value === void 0) return config.preserveUndefined ? jsonSafeValue("Undefined") : void 0;
|
|
835
859
|
if (typeof value === "bigint") return jsonSafeValue("BigInt", value.toString());
|
|
836
860
|
if (typeof value === "number") return serializeNumber(value);
|
|
837
861
|
if (typeof value === "string") return serializeString(value, depth);
|
|
838
862
|
if (value instanceof Date) return jsonSafeValue("Date", value.toISOString());
|
|
839
|
-
if (value instanceof Map) return serializeMap(value, refs, depth);
|
|
840
|
-
if (value instanceof Set) return serializeSet(value, refs, depth);
|
|
863
|
+
if (value instanceof Map) return serializeMap(value, refs, depth, config);
|
|
864
|
+
if (value instanceof Set) return serializeSet(value, refs, depth, config);
|
|
841
865
|
if (value instanceof RegExp) return jsonSafeValue("RegExp", {
|
|
842
866
|
flags: value.flags,
|
|
843
867
|
source: value.source
|
|
@@ -856,7 +880,7 @@ async function serializeJsonSafeValue(value, refs, depth) {
|
|
|
856
880
|
type: value.type
|
|
857
881
|
});
|
|
858
882
|
if (value instanceof ArrayBuffer) return jsonSafeValue("ArrayBuffer", bytesToBase64(new Uint8Array(value)));
|
|
859
|
-
if (value instanceof Error) return serializeError(value, refs, depth);
|
|
883
|
+
if (value instanceof Error) return serializeError(value, refs, depth, config);
|
|
860
884
|
if (!value || typeof value !== "object") return value;
|
|
861
885
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
862
886
|
refs.add(value);
|
|
@@ -869,12 +893,18 @@ async function serializeJsonSafeValue(value, refs, depth) {
|
|
|
869
893
|
}
|
|
870
894
|
}
|
|
871
895
|
const items = [];
|
|
872
|
-
for (const item of value)
|
|
896
|
+
for (const item of value) {
|
|
897
|
+
const serializedItem = await serializeJsonSafeValue(item, refs, depth + 1, config);
|
|
898
|
+
if (serializedItem !== void 0) items.push(serializedItem);
|
|
899
|
+
}
|
|
873
900
|
refs.delete(value);
|
|
874
901
|
return compressNestedJsonValue(items, depth) ?? items;
|
|
875
902
|
}
|
|
876
903
|
const entries = [];
|
|
877
|
-
for (const [key, entryValue] of Object.entries(value))
|
|
904
|
+
for (const [key, entryValue] of Object.entries(value)) {
|
|
905
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
906
|
+
if (serializedEntryValue !== void 0) entries.push([key, serializedEntryValue]);
|
|
907
|
+
}
|
|
878
908
|
refs.delete(value);
|
|
879
909
|
const serialized = hasSerializationMarkerKey(value) ? jsonSafeValue("Object", entries) : Object.fromEntries(entries);
|
|
880
910
|
return compressNestedJsonValue(serialized, depth) ?? serialized;
|
|
@@ -944,32 +974,40 @@ function compressNestedJsonValue(value, depth) {
|
|
|
944
974
|
function compressionIsWorthIt(value, rawSize) {
|
|
945
975
|
return Buffer$1.byteLength(JSON.stringify(value)) < rawSize * maxCompressedSizeRatio;
|
|
946
976
|
}
|
|
947
|
-
async function serializeMap(value, refs, depth) {
|
|
977
|
+
async function serializeMap(value, refs, depth, config) {
|
|
948
978
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
949
979
|
refs.add(value);
|
|
950
980
|
const entries = [];
|
|
951
|
-
for (const [key, entryValue] of value.entries())
|
|
981
|
+
for (const [key, entryValue] of value.entries()) {
|
|
982
|
+
const serializedKey = await serializeJsonSafeValue(key, refs, depth + 1, config);
|
|
983
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
984
|
+
if (serializedKey !== void 0 && serializedEntryValue !== void 0) entries.push([serializedKey, serializedEntryValue]);
|
|
985
|
+
}
|
|
952
986
|
refs.delete(value);
|
|
953
987
|
return jsonSafeValue("Map", entries);
|
|
954
988
|
}
|
|
955
|
-
async function serializeSet(value, refs, depth) {
|
|
989
|
+
async function serializeSet(value, refs, depth, config) {
|
|
956
990
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
957
991
|
refs.add(value);
|
|
958
992
|
const items = [];
|
|
959
|
-
for (const item of value.values())
|
|
993
|
+
for (const item of value.values()) {
|
|
994
|
+
const serializedItem = await serializeJsonSafeValue(item, refs, depth + 1, config);
|
|
995
|
+
if (serializedItem !== void 0) items.push(serializedItem);
|
|
996
|
+
}
|
|
960
997
|
refs.delete(value);
|
|
961
998
|
return jsonSafeValue("Set", items);
|
|
962
999
|
}
|
|
963
|
-
async function serializeError(value, refs, depth) {
|
|
1000
|
+
async function serializeError(value, refs, depth, config) {
|
|
964
1001
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
965
1002
|
refs.add(value);
|
|
966
1003
|
const props = [];
|
|
967
1004
|
for (const [key, entryValue] of Object.entries(value)) {
|
|
968
1005
|
if (key === "cause") continue;
|
|
969
|
-
|
|
1006
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
1007
|
+
if (serializedEntryValue !== void 0) props.push([key, serializedEntryValue]);
|
|
970
1008
|
}
|
|
971
1009
|
const serialized = jsonSafeValue("Error", {
|
|
972
|
-
cause: "cause" in value ? await serializeJsonSafeValue(value.cause, refs, depth + 1) : void 0,
|
|
1010
|
+
cause: "cause" in value ? await serializeJsonSafeValue(value.cause, refs, depth + 1, config) : void 0,
|
|
973
1011
|
message: value.message,
|
|
974
1012
|
name: value.name,
|
|
975
1013
|
props,
|
|
@@ -1110,33 +1148,36 @@ function deserializeError(value) {
|
|
|
1110
1148
|
});
|
|
1111
1149
|
return error;
|
|
1112
1150
|
}
|
|
1113
|
-
async function serializeRecordValues(record) {
|
|
1151
|
+
async function serializeRecordValues(record, config) {
|
|
1114
1152
|
const entries = [];
|
|
1115
|
-
for (const [key, value] of Object.entries(record))
|
|
1153
|
+
for (const [key, value] of Object.entries(record)) {
|
|
1154
|
+
const serializedValue = await serializeCacheValue(value, config);
|
|
1155
|
+
if (serializedValue !== void 0) entries.push([key, serializedValue]);
|
|
1156
|
+
}
|
|
1116
1157
|
return Object.fromEntries(entries);
|
|
1117
1158
|
}
|
|
1118
1159
|
function deserializeRecordValues(record) {
|
|
1119
1160
|
return Object.fromEntries(Object.entries(record).map(([key, value]) => [key, deserializeCacheValue(value)]));
|
|
1120
1161
|
}
|
|
1121
|
-
async function serializeCacheRecordingOp(op) {
|
|
1162
|
+
async function serializeCacheRecordingOp(op, config) {
|
|
1122
1163
|
switch (op.kind) {
|
|
1123
1164
|
case "setOutput":
|
|
1124
1165
|
case "appendOutput": return {
|
|
1125
1166
|
...op,
|
|
1126
|
-
value: await serializeCacheValue(op.value)
|
|
1167
|
+
value: await serializeCacheValue(op.value, config)
|
|
1127
1168
|
};
|
|
1128
1169
|
case "mergeOutput": return {
|
|
1129
1170
|
...op,
|
|
1130
|
-
patch: await serializeRecordValues(op.patch)
|
|
1171
|
+
patch: await serializeRecordValues(op.patch, config)
|
|
1131
1172
|
};
|
|
1132
1173
|
case "incrementOutput": return op;
|
|
1133
1174
|
case "checkpoint": return {
|
|
1134
1175
|
...op,
|
|
1135
|
-
data: await serializeCacheValue(op.data)
|
|
1176
|
+
data: await serializeCacheValue(op.data, config)
|
|
1136
1177
|
};
|
|
1137
1178
|
case "subSpan": return {
|
|
1138
1179
|
...op,
|
|
1139
|
-
span: await serializeCacheSpan(op.span)
|
|
1180
|
+
span: await serializeCacheSpan(op.span, config)
|
|
1140
1181
|
};
|
|
1141
1182
|
}
|
|
1142
1183
|
}
|
|
@@ -1162,11 +1203,11 @@ function deserializeCacheRecordingOp(op) {
|
|
|
1162
1203
|
};
|
|
1163
1204
|
}
|
|
1164
1205
|
}
|
|
1165
|
-
async function serializeCacheSpan(span) {
|
|
1206
|
+
async function serializeCacheSpan(span, config) {
|
|
1166
1207
|
return {
|
|
1167
1208
|
...span,
|
|
1168
|
-
attributes: span.attributes === void 0 ? void 0 : await serializeRecordValues(span.attributes),
|
|
1169
|
-
children: await Promise.all(span.children.map(serializeCacheSpan))
|
|
1209
|
+
attributes: span.attributes === void 0 ? void 0 : await serializeRecordValues(span.attributes, config),
|
|
1210
|
+
children: await Promise.all(span.children.map((child) => serializeCacheSpan(child, config)))
|
|
1170
1211
|
};
|
|
1171
1212
|
}
|
|
1172
1213
|
function deserializeCacheSpan(span) {
|
|
@@ -1176,13 +1217,19 @@ function deserializeCacheSpan(span) {
|
|
|
1176
1217
|
children: span.children.map(deserializeCacheSpan)
|
|
1177
1218
|
};
|
|
1178
1219
|
}
|
|
1179
|
-
/**
|
|
1180
|
-
|
|
1220
|
+
/**
|
|
1221
|
+
* Serialize all rich values captured in a cache recording before persistence.
|
|
1222
|
+
*
|
|
1223
|
+
* Undefined values are omitted by default; pass `preserveUndefined: true` to
|
|
1224
|
+
* retain the legacy explicit undefined wrappers in the recording payload.
|
|
1225
|
+
*/
|
|
1226
|
+
async function serializeCacheRecording(recording, options = void 0) {
|
|
1227
|
+
const config = normalizeCacheSerializationOptions(options);
|
|
1181
1228
|
return {
|
|
1182
1229
|
...recording,
|
|
1183
|
-
returnValue: await serializeCacheValue(recording.returnValue),
|
|
1184
|
-
finalAttributes: await serializeRecordValues(recording.finalAttributes),
|
|
1185
|
-
ops: await Promise.all(recording.ops.map(serializeCacheRecordingOp))
|
|
1230
|
+
returnValue: await serializeCacheValue(recording.returnValue, config),
|
|
1231
|
+
finalAttributes: await serializeRecordValues(recording.finalAttributes, config),
|
|
1232
|
+
ops: await Promise.all(recording.ops.map((op) => serializeCacheRecordingOp(op, config)))
|
|
1186
1233
|
};
|
|
1187
1234
|
}
|
|
1188
1235
|
/** Revive all rich values captured in a cache recording after lookup. */
|
|
@@ -1574,7 +1621,9 @@ function createTraceCache(generateSpanId) {
|
|
|
1574
1621
|
key: info.key
|
|
1575
1622
|
}, { serializeFileBytes: info.serializeFileBytes === true });
|
|
1576
1623
|
const activeSpan = scope.activeSpanStack.at(-1);
|
|
1577
|
-
|
|
1624
|
+
const canRead = cacheCtx.mode === "use" && cacheCtx.read !== false;
|
|
1625
|
+
const canStore = cacheCtx.mode !== "bypass" && cacheCtx.store !== false;
|
|
1626
|
+
if (canRead) {
|
|
1578
1627
|
const hit = await cacheCtx.adapter.lookup(namespace, keyHash);
|
|
1579
1628
|
if (hit) {
|
|
1580
1629
|
const storedAt = hit.storedAt;
|
|
@@ -1597,14 +1646,24 @@ function createTraceCache(generateSpanId) {
|
|
|
1597
1646
|
name: info.name,
|
|
1598
1647
|
namespace,
|
|
1599
1648
|
key: keyHash,
|
|
1600
|
-
status: "miss"
|
|
1649
|
+
status: "miss",
|
|
1650
|
+
...canStore ? {} : { stored: false }
|
|
1601
1651
|
});
|
|
1602
|
-
} else if (cacheCtx.mode === "
|
|
1652
|
+
} else if (cacheCtx.mode === "use" && canStore) recordCacheRef(scope, activeSpan, {
|
|
1653
|
+
type: "value",
|
|
1654
|
+
name: info.name,
|
|
1655
|
+
namespace,
|
|
1656
|
+
key: keyHash,
|
|
1657
|
+
status: "miss",
|
|
1658
|
+
read: false
|
|
1659
|
+
});
|
|
1660
|
+
else if (cacheCtx.mode === "refresh") recordCacheRef(scope, activeSpan, {
|
|
1603
1661
|
type: "value",
|
|
1604
1662
|
name: info.name,
|
|
1605
1663
|
namespace,
|
|
1606
1664
|
key: keyHash,
|
|
1607
|
-
status: "refresh"
|
|
1665
|
+
status: "refresh",
|
|
1666
|
+
...canStore ? {} : { stored: false }
|
|
1608
1667
|
});
|
|
1609
1668
|
else recordCacheRef(scope, activeSpan, {
|
|
1610
1669
|
type: "value",
|
|
@@ -1627,7 +1686,7 @@ function createTraceCache(generateSpanId) {
|
|
|
1627
1686
|
scope.recordingStack.pop();
|
|
1628
1687
|
}
|
|
1629
1688
|
appendSubSpanOps(scope, frame);
|
|
1630
|
-
if (
|
|
1689
|
+
if (canStore) {
|
|
1631
1690
|
const finalAttributes = diffNonCacheAttributes(beforeAttributes, await snapshotNonCacheAttributes(activeSpan));
|
|
1632
1691
|
const recording = {
|
|
1633
1692
|
returnValue: bodyResult,
|
|
@@ -1641,13 +1700,11 @@ function createTraceCache(generateSpanId) {
|
|
|
1641
1700
|
operationType: "value",
|
|
1642
1701
|
operationName: info.name,
|
|
1643
1702
|
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
1644
|
-
codeFingerprint: cacheCtx.codeFingerprint,
|
|
1645
1703
|
recording: await serializeCacheRecording(recording)
|
|
1646
1704
|
}, {
|
|
1647
1705
|
rawKey: info.key,
|
|
1648
1706
|
operationType: "value",
|
|
1649
|
-
operationName: info.name
|
|
1650
|
-
codeFingerprint: cacheCtx.codeFingerprint
|
|
1707
|
+
operationName: info.name
|
|
1651
1708
|
});
|
|
1652
1709
|
}
|
|
1653
1710
|
return bodyResult;
|
|
@@ -2018,11 +2075,13 @@ async function traceSpanInternal(info, fn) {
|
|
|
2018
2075
|
namespace,
|
|
2019
2076
|
key: cacheOpts.key
|
|
2020
2077
|
}, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
|
|
2078
|
+
const canRead = ctx.mode === "use" && ctx.read !== false;
|
|
2079
|
+
const canStore = ctx.mode !== "bypass" && ctx.store !== false;
|
|
2021
2080
|
mergeSpanAttributes(spanRecord, {
|
|
2022
2081
|
"cache.key": keyHash,
|
|
2023
2082
|
"cache.namespace": namespace
|
|
2024
2083
|
});
|
|
2025
|
-
if (
|
|
2084
|
+
if (canRead) {
|
|
2026
2085
|
const hit = await ctx.adapter.lookup(namespace, keyHash);
|
|
2027
2086
|
if (hit) {
|
|
2028
2087
|
const storedAt = hit.storedAt;
|
|
@@ -2037,8 +2096,18 @@ async function traceSpanInternal(info, fn) {
|
|
|
2037
2096
|
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
2038
2097
|
return recording.returnValue;
|
|
2039
2098
|
}
|
|
2040
|
-
mergeSpanAttributes(spanRecord, {
|
|
2041
|
-
|
|
2099
|
+
mergeSpanAttributes(spanRecord, {
|
|
2100
|
+
"cache.status": "miss",
|
|
2101
|
+
...canStore ? {} : { "cache.stored": false }
|
|
2102
|
+
});
|
|
2103
|
+
} else if (ctx.mode === "use" && canStore) mergeSpanAttributes(spanRecord, {
|
|
2104
|
+
"cache.status": "miss",
|
|
2105
|
+
"cache.read": false
|
|
2106
|
+
});
|
|
2107
|
+
else if (ctx.mode === "refresh") mergeSpanAttributes(spanRecord, {
|
|
2108
|
+
"cache.status": "refresh",
|
|
2109
|
+
...canStore ? {} : { "cache.stored": false }
|
|
2110
|
+
});
|
|
2042
2111
|
else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
|
|
2043
2112
|
const frame = {
|
|
2044
2113
|
baseSpanIndex: scope.spans.length,
|
|
@@ -2054,7 +2123,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
2054
2123
|
}
|
|
2055
2124
|
appendSubSpanOps(scope, frame);
|
|
2056
2125
|
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
2057
|
-
if (
|
|
2126
|
+
if (canStore) {
|
|
2058
2127
|
const recording = {
|
|
2059
2128
|
returnValue: bodyResult,
|
|
2060
2129
|
finalAttributes: stripCacheAttributes(spanRecord.attributes),
|
|
@@ -2074,14 +2143,12 @@ async function traceSpanInternal(info, fn) {
|
|
|
2074
2143
|
spanName: info.name,
|
|
2075
2144
|
spanKind: info.kind,
|
|
2076
2145
|
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
2077
|
-
codeFingerprint: ctx.codeFingerprint,
|
|
2078
2146
|
recording: await serializeCacheRecording(recording)
|
|
2079
2147
|
};
|
|
2080
2148
|
await ctx.adapter.write(entry, {
|
|
2081
2149
|
rawKey: cacheOpts.key,
|
|
2082
2150
|
operationType: "span",
|
|
2083
|
-
operationName: info.name
|
|
2084
|
-
codeFingerprint: ctx.codeFingerprint
|
|
2151
|
+
operationName: info.name
|
|
2085
2152
|
});
|
|
2086
2153
|
}
|
|
2087
2154
|
return bodyResult;
|
|
@@ -2274,6 +2341,7 @@ const columnDefSchema = z.object({
|
|
|
2274
2341
|
passThreshold: z.number().optional(),
|
|
2275
2342
|
maxStars: z.number().int().min(2).optional(),
|
|
2276
2343
|
hideInTable: z.boolean().optional(),
|
|
2344
|
+
hideIfNoValue: z.boolean().optional(),
|
|
2277
2345
|
align: z.enum([
|
|
2278
2346
|
"left",
|
|
2279
2347
|
"center",
|
|
@@ -2417,6 +2485,10 @@ const traceCacheRefSchema = z.object({
|
|
|
2417
2485
|
namespace: z.string(),
|
|
2418
2486
|
key: z.string(),
|
|
2419
2487
|
status: cacheStatusSchema,
|
|
2488
|
+
/** Whether this ref attempted to read from cache. Defaults to true. */
|
|
2489
|
+
read: z.boolean().optional(),
|
|
2490
|
+
/** Whether this ref wrote a persisted cache entry. Defaults to true for misses/refreshes. */
|
|
2491
|
+
stored: z.boolean().optional(),
|
|
2420
2492
|
storedAt: z.string().optional(),
|
|
2421
2493
|
age: z.number().optional()
|
|
2422
2494
|
});
|
|
@@ -2429,7 +2501,6 @@ const cacheListItemSchema = z.object({
|
|
|
2429
2501
|
spanName: z.string().optional(),
|
|
2430
2502
|
spanKind: traceSpanKindSchema.optional(),
|
|
2431
2503
|
storedAt: z.string(),
|
|
2432
|
-
codeFingerprint: z.string(),
|
|
2433
2504
|
sizeBytes: z.number()
|
|
2434
2505
|
});
|
|
2435
2506
|
/** Zod schema for `SerializedCacheSpan`, defined lazily for recursion. */
|
|
@@ -2511,7 +2582,6 @@ const cacheEntrySchema = z.object({
|
|
|
2511
2582
|
spanName: z.string().optional(),
|
|
2512
2583
|
spanKind: traceSpanKindSchema.optional(),
|
|
2513
2584
|
storedAt: z.string(),
|
|
2514
|
-
codeFingerprint: z.string(),
|
|
2515
2585
|
recording: cacheRecordingSchema
|
|
2516
2586
|
});
|
|
2517
2587
|
/** Debug-only raw key metadata stored outside the reusable cache entry. */
|
|
@@ -2522,7 +2592,6 @@ const cacheDebugKeyEntrySchema = z.object({
|
|
|
2522
2592
|
operationType: cacheOperationTypeSchema,
|
|
2523
2593
|
operationName: z.string(),
|
|
2524
2594
|
storedAt: z.string(),
|
|
2525
|
-
codeFingerprint: z.string(),
|
|
2526
2595
|
rawKey: z.unknown()
|
|
2527
2596
|
});
|
|
2528
2597
|
/** Cache lookup response with optional debug-only raw key data. */
|
|
@@ -2614,6 +2683,11 @@ const evalChartTooltipExtraSchema = z.discriminatedUnion("source", [z.object({
|
|
|
2614
2683
|
const evalChartConfigSchema = z.object({
|
|
2615
2684
|
/** Optional heading shown above the chart frame in the UI. */
|
|
2616
2685
|
heading: z.string().optional(),
|
|
2686
|
+
/**
|
|
2687
|
+
* Hide this chart in the UI when none of its metrics has a numeric value in
|
|
2688
|
+
* the rendered history window.
|
|
2689
|
+
*/
|
|
2690
|
+
hideIfNoValue: z.boolean().optional(),
|
|
2617
2691
|
type: evalChartTypeSchema,
|
|
2618
2692
|
/** At least one series must be declared. */
|
|
2619
2693
|
metrics: z.array(evalChartMetricSchema).min(1),
|
|
@@ -2654,17 +2728,31 @@ const evalStatAggregateSchema = z.enum([
|
|
|
2654
2728
|
"sum",
|
|
2655
2729
|
"last"
|
|
2656
2730
|
]);
|
|
2731
|
+
const hideIfNoValueShape = {
|
|
2732
|
+
/**
|
|
2733
|
+
* Hide this stat in the UI when the current run has no displayable value.
|
|
2734
|
+
* Missing values, `null`, and empty strings count as no value; `0` remains
|
|
2735
|
+
* visible.
|
|
2736
|
+
*/
|
|
2737
|
+
hideIfNoValue: z.boolean().optional() };
|
|
2657
2738
|
/**
|
|
2658
2739
|
* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
|
|
2659
2740
|
* `column` aggregates a score or numeric output column across the latest run.
|
|
2660
2741
|
*/
|
|
2661
2742
|
const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
2662
|
-
z.object({
|
|
2743
|
+
z.object({
|
|
2744
|
+
kind: z.literal("cases"),
|
|
2745
|
+
...hideIfNoValueShape
|
|
2746
|
+
}),
|
|
2663
2747
|
z.object({
|
|
2664
2748
|
kind: z.literal("passRate"),
|
|
2665
|
-
accent: z.boolean().optional()
|
|
2749
|
+
accent: z.boolean().optional(),
|
|
2750
|
+
...hideIfNoValueShape
|
|
2751
|
+
}),
|
|
2752
|
+
z.object({
|
|
2753
|
+
kind: z.literal("duration"),
|
|
2754
|
+
...hideIfNoValueShape
|
|
2666
2755
|
}),
|
|
2667
|
-
z.object({ kind: z.literal("duration") }),
|
|
2668
2756
|
z.object({
|
|
2669
2757
|
kind: z.literal("column"),
|
|
2670
2758
|
key: z.string(),
|
|
@@ -2673,7 +2761,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
2673
2761
|
format: columnFormatSchema.optional(),
|
|
2674
2762
|
/** Number presentation options applied when `format: 'number'`. */
|
|
2675
2763
|
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
2676
|
-
accent: z.boolean().optional()
|
|
2764
|
+
accent: z.boolean().optional(),
|
|
2765
|
+
...hideIfNoValueShape
|
|
2677
2766
|
})
|
|
2678
2767
|
]);
|
|
2679
2768
|
/** Ordered list of stats rendered in the EvalCard stats row. */
|
|
@@ -2754,6 +2843,12 @@ const caseRowSchema = z.object({
|
|
|
2754
2843
|
});
|
|
2755
2844
|
/** Structured assertion failure metadata captured for one case run. */
|
|
2756
2845
|
const assertionFailureSchema = z.object({
|
|
2846
|
+
/**
|
|
2847
|
+
* Error class or category label rendered alongside the message (e.g.
|
|
2848
|
+
* `EvalAssertionError`, `OutputsSchemaError`). Optional for legacy entries
|
|
2849
|
+
* and synthetic failures without an originating Error.
|
|
2850
|
+
*/
|
|
2851
|
+
name: z.string().optional(),
|
|
2757
2852
|
/** Human-readable assertion failure message shown in the UI and artifacts. */
|
|
2758
2853
|
message: z.string(),
|
|
2759
2854
|
/** Stack trace captured from the originating error when available. */
|
|
@@ -2902,6 +2997,25 @@ const defaultConfigKeySchema = z.enum([
|
|
|
2902
2997
|
]);
|
|
2903
2998
|
/** Removal config for built-in eval-level outputs and UI metadata. */
|
|
2904
2999
|
const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
|
|
3000
|
+
const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
|
|
3001
|
+
/** Schema for keyed or object-returning trace-derived output config. */
|
|
3002
|
+
const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
|
|
3003
|
+
/** Schema for UI overrides on derived or scored columns. */
|
|
3004
|
+
const evalColumnOverrideSchema = z.object({
|
|
3005
|
+
label: z.string().optional(),
|
|
3006
|
+
format: columnFormatSchema.optional(),
|
|
3007
|
+
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
3008
|
+
hideInTable: z.boolean().optional(),
|
|
3009
|
+
hideIfNoValue: z.boolean().optional(),
|
|
3010
|
+
align: z.enum([
|
|
3011
|
+
"left",
|
|
3012
|
+
"center",
|
|
3013
|
+
"right"
|
|
3014
|
+
]).optional(),
|
|
3015
|
+
maxStars: z.number().int().min(2).optional()
|
|
3016
|
+
});
|
|
3017
|
+
/** Schema for column override maps keyed by output or score field name. */
|
|
3018
|
+
const evalColumnsSchema = z.record(z.string(), evalColumnOverrideSchema);
|
|
2905
3019
|
/** Render formats supported by an LLM-call metric in the UI. */
|
|
2906
3020
|
const llmCallMetricFormatSchema = z.enum([
|
|
2907
3021
|
"string",
|
|
@@ -2979,18 +3093,9 @@ const apiCallMetricSchema = z.object({
|
|
|
2979
3093
|
placements: z.array(apiCallMetricPlacementSchema).nonempty().optional()
|
|
2980
3094
|
});
|
|
2981
3095
|
/**
|
|
2982
|
-
* Schema for
|
|
2983
|
-
* from token counts.
|
|
3096
|
+
* Schema for pricing rates used to derive LLM-call costs from token counts.
|
|
2984
3097
|
*/
|
|
2985
|
-
const
|
|
2986
|
-
/** Exact model name read from the configured `attributes.model` path. */
|
|
2987
|
-
model: z.string().min(1),
|
|
2988
|
-
/**
|
|
2989
|
-
* Optional provider discriminator read from `attributes.provider`. When set,
|
|
2990
|
-
* the entry only applies to calls from that provider; provider-specific
|
|
2991
|
-
* entries take precedence over generic entries for the same model.
|
|
2992
|
-
*/
|
|
2993
|
-
provider: z.string().min(1).optional(),
|
|
3098
|
+
const llmCallPricingRateSchema = z.object({
|
|
2994
3099
|
/** USD per one million non-cached input tokens. */
|
|
2995
3100
|
inputUsdPerMillion: z.number().nonnegative().optional(),
|
|
2996
3101
|
/** USD per one million output tokens. */
|
|
@@ -3004,6 +3109,23 @@ const llmCallPricingSchema = z.object({
|
|
|
3004
3109
|
/** USD per one million reasoning tokens when reported separately. */
|
|
3005
3110
|
reasoningUsdPerMillion: z.number().nonnegative().optional()
|
|
3006
3111
|
});
|
|
3112
|
+
/**
|
|
3113
|
+
* Schema for one model's pricing config. The object key is the exact model
|
|
3114
|
+
* name. Use `providers` when a model has provider-specific rates in addition
|
|
3115
|
+
* to, or instead of, generic model rates.
|
|
3116
|
+
*/
|
|
3117
|
+
const llmCallPricingSchema = llmCallPricingRateSchema.extend({
|
|
3118
|
+
/**
|
|
3119
|
+
* Optional provider discriminator read from `attributes.provider`. When set,
|
|
3120
|
+
* the top-level entry only applies to calls from that provider.
|
|
3121
|
+
*/
|
|
3122
|
+
provider: z.string().min(1).optional(),
|
|
3123
|
+
/**
|
|
3124
|
+
* Provider-specific pricing for the model. Provider entries take precedence
|
|
3125
|
+
* over generic rates for the same model.
|
|
3126
|
+
*/
|
|
3127
|
+
providers: z.record(z.string().min(1), llmCallPricingRateSchema).optional()
|
|
3128
|
+
});
|
|
3007
3129
|
/** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
|
|
3008
3130
|
const llmCallsConfigSchema = z.object({
|
|
3009
3131
|
/** Span kinds treated as LLM calls. Defaults to `['llm']`. */
|
|
@@ -3040,10 +3162,10 @@ const llmCallsConfigSchema = z.object({
|
|
|
3040
3162
|
*/
|
|
3041
3163
|
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
3042
3164
|
/**
|
|
3043
|
-
* Model
|
|
3044
|
-
*
|
|
3165
|
+
* Model-keyed pricing registry used to calculate LLM-call costs from token
|
|
3166
|
+
* counts. Built-in LLM cost fields are only derived from this registry.
|
|
3045
3167
|
*/
|
|
3046
|
-
pricing: z.
|
|
3168
|
+
pricing: z.record(z.string().min(1), llmCallPricingSchema).optional(),
|
|
3047
3169
|
/** Custom user-defined metrics surfaced on each LLM call. */
|
|
3048
3170
|
metrics: z.array(llmCallMetricSchema).optional()
|
|
3049
3171
|
});
|
|
@@ -3159,6 +3281,33 @@ function resolveApiCallMetric(metric) {
|
|
|
3159
3281
|
placements: metric.placements ? [...metric.placements] : ["body"]
|
|
3160
3282
|
};
|
|
3161
3283
|
}
|
|
3284
|
+
/**
 * Report whether a pricing object carries at least one per-million rate.
 *
 * @param {object} pricing - Pricing entry to inspect.
 * @returns {boolean} `true` when any of the six rate fields is not `undefined`.
 */
function hasPricingRates(pricing) {
  const rateFields = [
    "inputUsdPerMillion",
    "outputUsdPerMillion",
    "cachedInputUsdPerMillion",
    "cacheCreationInputUsdPerMillion",
    "cacheCreationInput1hUsdPerMillion",
    "reasoningUsdPerMillion"
  ];
  return rateFields.some((field) => pricing[field] !== void 0);
}
|
|
3287
|
+
/**
 * Extract only the per-million rate fields from a pricing object.
 *
 * The result always has all six rate keys, in a fixed order; rates missing
 * from the input come through as `undefined`. Any other property on the
 * input (such as `provider` or `providers`) is left out.
 *
 * @param {object} pricing - Pricing entry to copy rates from.
 * @returns {object} A new object holding just the rate fields.
 */
function copyPricingRates(pricing) {
  const {
    inputUsdPerMillion,
    outputUsdPerMillion,
    cachedInputUsdPerMillion,
    cacheCreationInputUsdPerMillion,
    cacheCreationInput1hUsdPerMillion,
    reasoningUsdPerMillion
  } = pricing;
  return {
    inputUsdPerMillion,
    outputUsdPerMillion,
    cachedInputUsdPerMillion,
    cacheCreationInputUsdPerMillion,
    cacheCreationInput1hUsdPerMillion,
    reasoningUsdPerMillion
  };
}
|
|
3297
|
+
/**
 * Flatten one model's pricing config into a list of pricing entries.
 *
 * A top-level entry (tagged with `pricing.provider`, which may be undefined)
 * is produced only when the config itself carries at least one rate. It is
 * followed by one entry per key of `pricing.providers`, in that object's own
 * key order.
 *
 * @param {string} model - Model name the entries belong to.
 * @param {object} pricing - Pricing config for that model.
 * @returns {object[]} Entries shaped `{ model, provider, ...rates }`.
 */
function resolveLlmCallPricingEntries(model, pricing) {
  const topLevelEntries = hasPricingRates(pricing)
    ? [{
      model,
      provider: pricing.provider,
      ...copyPricingRates(pricing)
    }]
    : [];
  const providerEntries = Object.entries(pricing.providers ?? {}).map(([provider, providerPricing]) => ({
    model,
    provider,
    ...copyPricingRates(providerPricing)
  }));
  return [...topLevelEntries, ...providerEntries];
}
|
|
3162
3311
|
/**
|
|
3163
3312
|
* Resolve the user-authored LLM-calls config to a fully-defaulted shape used
|
|
3164
3313
|
* by the UI to derive the LLM calls tab.
|
|
@@ -3169,7 +3318,7 @@ function resolveApiCallMetric(metric) {
|
|
|
3169
3318
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
3170
3319
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
3171
3320
|
* - Missing `pricing` defaults to an empty registry; built-in costs are only
|
|
3172
|
-
* derived from configured pricing and token counts.
|
|
3321
|
+
* derived from configured model-keyed pricing and token counts.
|
|
3173
3322
|
*/
|
|
3174
3323
|
function resolveLlmCallsConfig(input) {
|
|
3175
3324
|
return {
|
|
@@ -3180,16 +3329,7 @@ function resolveLlmCallsConfig(input) {
|
|
|
3180
3329
|
},
|
|
3181
3330
|
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
3182
3331
|
metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
|
|
3183
|
-
pricing: (input?.pricing ??
|
|
3184
|
-
model: p.model,
|
|
3185
|
-
provider: p.provider,
|
|
3186
|
-
inputUsdPerMillion: p.inputUsdPerMillion,
|
|
3187
|
-
outputUsdPerMillion: p.outputUsdPerMillion,
|
|
3188
|
-
cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
|
|
3189
|
-
cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
|
|
3190
|
-
cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
|
|
3191
|
-
reasoningUsdPerMillion: p.reasoningUsdPerMillion
|
|
3192
|
-
}))
|
|
3332
|
+
pricing: Object.entries(input?.pricing ?? {}).flatMap(([model, pricing]) => resolveLlmCallPricingEntries(model, pricing))
|
|
3193
3333
|
};
|
|
3194
3334
|
}
|
|
3195
3335
|
/**
|
|
@@ -3223,6 +3363,9 @@ const agentEvalsConfigSchema = z.object({
|
|
|
3223
3363
|
staleAfterDays: z.number().optional(),
|
|
3224
3364
|
allowCliRunAll: z.boolean().optional(),
|
|
3225
3365
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
3366
|
+
columns: evalColumnsSchema.optional(),
|
|
3367
|
+
deriveFromTracing: evalDeriveConfigSchema.optional(),
|
|
3368
|
+
stats: evalStatsConfigSchema.optional(),
|
|
3226
3369
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
3227
3370
|
removeDefaultConfig: removeDefaultConfigSchema.optional(),
|
|
3228
3371
|
apiCalls: apiCallsConfigSchema.optional(),
|
|
@@ -3875,6 +4018,11 @@ function readNumber(attributes, key) {
|
|
|
3875
4018
|
const value = attributes[key];
|
|
3876
4019
|
return typeof value === "number" && Number.isFinite(value) ? value : void 0;
|
|
3877
4020
|
}
|
|
4021
|
+
/**
 * Read a boolean attribute by key.
 *
 * @param {unknown} attributes - Candidate attribute bag.
 * @param {string} key - Attribute name to look up.
 * @returns {boolean | undefined} The stored value when `attributes` passes
 *   `isRecord$2` and the value is a boolean; otherwise `undefined`.
 */
function readBoolean(attributes, key) {
  if (isRecord$2(attributes)) {
    const candidate = attributes[key];
    if (typeof candidate === "boolean") return candidate;
  }
  return void 0;
}
|
|
3878
4026
|
function readArray(attributes, key) {
|
|
3879
4027
|
if (!isRecord$2(attributes)) return [];
|
|
3880
4028
|
const value = attributes[key];
|
|
@@ -3903,12 +4051,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3903
4051
|
const namespace = readString(span.attributes, "cache.namespace");
|
|
3904
4052
|
if (key !== void 0 && namespace !== void 0) {
|
|
3905
4053
|
const isHit = status === "hit";
|
|
4054
|
+
const stored = isHit ? true : readBoolean(span.attributes, "cache.stored") !== false;
|
|
3906
4055
|
entries.push({
|
|
3907
4056
|
id: span.id,
|
|
3908
4057
|
source: "span",
|
|
3909
4058
|
origin: "span",
|
|
3910
|
-
action: isHit ? "hit" : "added",
|
|
4059
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3911
4060
|
status,
|
|
4061
|
+
stored,
|
|
3912
4062
|
name: span.name,
|
|
3913
4063
|
namespace,
|
|
3914
4064
|
key,
|
|
@@ -3925,12 +4075,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3925
4075
|
const ref = parsed.data;
|
|
3926
4076
|
if (ref.status === "bypass") continue;
|
|
3927
4077
|
const isHit = ref.status === "hit";
|
|
4078
|
+
const stored = isHit ? true : ref.stored !== false;
|
|
3928
4079
|
entries.push({
|
|
3929
4080
|
id: `${span.id}:value:${String(index)}`,
|
|
3930
4081
|
source: "value",
|
|
3931
4082
|
origin: "span",
|
|
3932
|
-
action: isHit ? "hit" : "added",
|
|
4083
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3933
4084
|
status: ref.status,
|
|
4085
|
+
stored,
|
|
3934
4086
|
name: ref.name,
|
|
3935
4087
|
namespace: ref.namespace,
|
|
3936
4088
|
key: ref.key,
|
|
@@ -3943,12 +4095,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3943
4095
|
for (const [index, ref] of caseCacheRefs.entries()) {
|
|
3944
4096
|
if (ref.status === "bypass") continue;
|
|
3945
4097
|
const isHit = ref.status === "hit";
|
|
4098
|
+
const stored = isHit ? true : ref.stored !== false;
|
|
3946
4099
|
entries.push({
|
|
3947
4100
|
id: `case:value:${String(index)}`,
|
|
3948
4101
|
source: "value",
|
|
3949
4102
|
origin: "caseRoot",
|
|
3950
|
-
action: isHit ? "hit" : "added",
|
|
4103
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3951
4104
|
status: ref.status,
|
|
4105
|
+
stored,
|
|
3952
4106
|
name: ref.name,
|
|
3953
4107
|
namespace: ref.namespace,
|
|
3954
4108
|
key: ref.key,
|
|
@@ -4020,7 +4174,8 @@ const updateManualScoreRequestSchema = z.object({ value: z.number().min(0).max(1
|
|
|
4020
4174
|
//#endregion
|
|
4021
4175
|
//#region ../runner/src/cacheStore.ts
|
|
4022
4176
|
const defaultMaxEntriesPerNamespace = 100;
|
|
4023
|
-
const cacheSerializationMarker = "
|
|
4177
|
+
const cacheSerializationMarker = "__aecs";
|
|
4178
|
+
const legacyCacheSerializationMarker = "__agentEvalsCacheSerialization";
|
|
4024
4179
|
const supportedCacheSerializationVersion = "json-safe-v1";
|
|
4025
4180
|
/**
|
|
4026
4181
|
* Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`.
|
|
@@ -4105,7 +4260,6 @@ function createFsCacheStore(options) {
|
|
|
4105
4260
|
spanName: entry.spanName,
|
|
4106
4261
|
spanKind: entry.spanKind,
|
|
4107
4262
|
storedAt: entry.storedAt,
|
|
4108
|
-
codeFingerprint: entry.codeFingerprint,
|
|
4109
4263
|
sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
|
|
4110
4264
|
});
|
|
4111
4265
|
}
|
|
@@ -4234,7 +4388,7 @@ async function readCacheFilePath(filePath) {
|
|
|
4234
4388
|
function usesSupportedCacheSerialization(value) {
|
|
4235
4389
|
if (Array.isArray(value)) return value.every(usesSupportedCacheSerialization);
|
|
4236
4390
|
if (!isRecordLike(value)) return true;
|
|
4237
|
-
if (Object.hasOwn(value,
|
|
4391
|
+
for (const marker of [cacheSerializationMarker, legacyCacheSerializationMarker]) if (Object.hasOwn(value, marker) && value[marker] !== supportedCacheSerializationVersion) return false;
|
|
4238
4392
|
return Object.values(value).every(usesSupportedCacheSerialization);
|
|
4239
4393
|
}
|
|
4240
4394
|
async function writeOrRemoveCacheFile(cacheDir, cacheFile) {
|
|
@@ -4278,7 +4432,6 @@ async function writeDebugKeyEntry(params) {
|
|
|
4278
4432
|
operationType: debugKey.operationType,
|
|
4279
4433
|
operationName: debugKey.operationName,
|
|
4280
4434
|
storedAt: entry.storedAt,
|
|
4281
|
-
codeFingerprint: debugKey.codeFingerprint,
|
|
4282
4435
|
rawKey: debugKey.rawKey
|
|
4283
4436
|
};
|
|
4284
4437
|
await writeDebugKeyFile(debugDir, {
|
|
@@ -4494,6 +4647,7 @@ function getScoreOverride(def) {
|
|
|
4494
4647
|
format: def.format,
|
|
4495
4648
|
numberFormat: def.numberFormat,
|
|
4496
4649
|
hideInTable: def.hideInTable,
|
|
4650
|
+
hideIfNoValue: def.hideIfNoValue,
|
|
4497
4651
|
align: def.align,
|
|
4498
4652
|
maxStars: def.maxStars
|
|
4499
4653
|
};
|
|
@@ -4506,6 +4660,7 @@ function mergeOverrides(base, override) {
|
|
|
4506
4660
|
format: override.format ?? base.format,
|
|
4507
4661
|
numberFormat: override.numberFormat ?? base.numberFormat,
|
|
4508
4662
|
hideInTable: override.hideInTable ?? base.hideInTable,
|
|
4663
|
+
hideIfNoValue: override.hideIfNoValue ?? base.hideIfNoValue,
|
|
4509
4664
|
align: override.align ?? base.align,
|
|
4510
4665
|
maxStars: override.maxStars ?? base.maxStars
|
|
4511
4666
|
};
|
|
@@ -4620,6 +4775,7 @@ function createColumnDef(params) {
|
|
|
4620
4775
|
if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
|
|
4621
4776
|
if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
|
|
4622
4777
|
if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
|
|
4778
|
+
if (override?.hideIfNoValue !== void 0) def.hideIfNoValue = override.hideIfNoValue;
|
|
4623
4779
|
if (override?.align !== void 0) def.align = override.align;
|
|
4624
4780
|
if (!isScore) return def;
|
|
4625
4781
|
def.isScore = true;
|
|
@@ -4704,60 +4860,70 @@ const DEFAULT_COLUMNS = {
|
|
|
4704
4860
|
label: "API Calls",
|
|
4705
4861
|
format: "number",
|
|
4706
4862
|
numberFormat: countNumberFormat,
|
|
4707
|
-
align: "right"
|
|
4863
|
+
align: "right",
|
|
4864
|
+
hideIfNoValue: true
|
|
4708
4865
|
},
|
|
4709
4866
|
costUsd: {
|
|
4710
4867
|
label: "Cost",
|
|
4711
4868
|
format: "number",
|
|
4712
4869
|
numberFormat: costNumberFormat,
|
|
4713
|
-
align: "right"
|
|
4870
|
+
align: "right",
|
|
4871
|
+
hideIfNoValue: true
|
|
4714
4872
|
},
|
|
4715
4873
|
llmTurns: {
|
|
4716
4874
|
label: "LLM Turns",
|
|
4717
4875
|
format: "number",
|
|
4718
4876
|
numberFormat: countNumberFormat,
|
|
4719
|
-
align: "right"
|
|
4877
|
+
align: "right",
|
|
4878
|
+
hideIfNoValue: true
|
|
4720
4879
|
},
|
|
4721
4880
|
inputTokens: {
|
|
4722
4881
|
label: "Input Tokens",
|
|
4723
4882
|
format: "number",
|
|
4724
4883
|
numberFormat: tokenNumberFormat,
|
|
4725
|
-
align: "right"
|
|
4884
|
+
align: "right",
|
|
4885
|
+
hideIfNoValue: true
|
|
4726
4886
|
},
|
|
4727
4887
|
outputTokens: {
|
|
4728
4888
|
label: "Output Tokens",
|
|
4729
4889
|
format: "number",
|
|
4730
4890
|
numberFormat: tokenNumberFormat,
|
|
4731
|
-
align: "right"
|
|
4891
|
+
align: "right",
|
|
4892
|
+
hideIfNoValue: true
|
|
4732
4893
|
},
|
|
4733
4894
|
totalTokens: {
|
|
4734
4895
|
label: "Total Tokens",
|
|
4735
4896
|
format: "number",
|
|
4736
4897
|
numberFormat: tokenNumberFormat,
|
|
4737
|
-
align: "right"
|
|
4898
|
+
align: "right",
|
|
4899
|
+
hideIfNoValue: true
|
|
4738
4900
|
},
|
|
4739
4901
|
cachedInputTokens: {
|
|
4740
4902
|
label: "Cached Input Tokens",
|
|
4741
4903
|
format: "number",
|
|
4742
4904
|
numberFormat: tokenNumberFormat,
|
|
4743
|
-
align: "right"
|
|
4905
|
+
align: "right",
|
|
4906
|
+
hideIfNoValue: true
|
|
4744
4907
|
},
|
|
4745
4908
|
cacheCreationInputTokens: {
|
|
4746
4909
|
label: "Cache Write Tokens",
|
|
4747
4910
|
format: "number",
|
|
4748
4911
|
numberFormat: tokenNumberFormat,
|
|
4749
|
-
align: "right"
|
|
4912
|
+
align: "right",
|
|
4913
|
+
hideIfNoValue: true
|
|
4750
4914
|
},
|
|
4751
4915
|
reasoningTokens: {
|
|
4752
4916
|
label: "Reasoning Tokens",
|
|
4753
4917
|
format: "number",
|
|
4754
4918
|
numberFormat: tokenNumberFormat,
|
|
4755
|
-
align: "right"
|
|
4919
|
+
align: "right",
|
|
4920
|
+
hideIfNoValue: true
|
|
4756
4921
|
},
|
|
4757
4922
|
llmDurationMs: {
|
|
4758
4923
|
label: "LLM Duration",
|
|
4759
4924
|
format: "duration",
|
|
4760
|
-
align: "right"
|
|
4925
|
+
align: "right",
|
|
4926
|
+
hideIfNoValue: true
|
|
4761
4927
|
}
|
|
4762
4928
|
};
|
|
4763
4929
|
function resolveRemovedKeys(globalRemove, evalRemove) {
|
|
@@ -4770,9 +4936,16 @@ function getActiveDefaultConfigKeys(params) {
|
|
|
4770
4936
|
}
|
|
4771
4937
|
function mergeDefaultColumns(params) {
|
|
4772
4938
|
const activeKeys = getActiveDefaultConfigKeys(params);
|
|
4773
|
-
if (activeKeys.length === 0)
|
|
4939
|
+
if (activeKeys.length === 0) {
|
|
4940
|
+
const merged = {
|
|
4941
|
+
...params.globalColumns,
|
|
4942
|
+
...params.columns
|
|
4943
|
+
};
|
|
4944
|
+
return Object.keys(merged).length > 0 ? merged : void 0;
|
|
4945
|
+
}
|
|
4774
4946
|
return {
|
|
4775
4947
|
...Object.fromEntries(activeKeys.map((key) => [key, DEFAULT_COLUMNS[key]])),
|
|
4948
|
+
...params.globalColumns,
|
|
4776
4949
|
...params.columns
|
|
4777
4950
|
};
|
|
4778
4951
|
}
|
|
@@ -4784,30 +4957,38 @@ function appendDefaultStats(params) {
|
|
|
4784
4957
|
key: "apiCalls",
|
|
4785
4958
|
label: "API Calls",
|
|
4786
4959
|
aggregate: "avg",
|
|
4787
|
-
numberFormat: countNumberFormat
|
|
4960
|
+
numberFormat: countNumberFormat,
|
|
4961
|
+
hideIfNoValue: true
|
|
4788
4962
|
});
|
|
4789
4963
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4790
4964
|
kind: "column",
|
|
4791
4965
|
key: "costUsd",
|
|
4792
4966
|
label: "LLM Cost",
|
|
4793
4967
|
aggregate: "avg",
|
|
4794
|
-
numberFormat: costNumberFormat
|
|
4968
|
+
numberFormat: costNumberFormat,
|
|
4969
|
+
hideIfNoValue: true
|
|
4795
4970
|
});
|
|
4796
4971
|
if (activeKeys.has("totalTokens")) defaults.push({
|
|
4797
4972
|
kind: "column",
|
|
4798
4973
|
key: "totalTokens",
|
|
4799
4974
|
label: "Tokens",
|
|
4800
4975
|
aggregate: "avg",
|
|
4801
|
-
numberFormat: tokenNumberFormat
|
|
4976
|
+
numberFormat: tokenNumberFormat,
|
|
4977
|
+
hideIfNoValue: true
|
|
4802
4978
|
});
|
|
4803
4979
|
if (activeKeys.has("llmTurns")) defaults.push({
|
|
4804
4980
|
kind: "column",
|
|
4805
4981
|
key: "llmTurns",
|
|
4806
4982
|
label: "LLM Turns",
|
|
4807
4983
|
aggregate: "avg",
|
|
4808
|
-
numberFormat: countNumberFormat
|
|
4984
|
+
numberFormat: countNumberFormat,
|
|
4985
|
+
hideIfNoValue: true
|
|
4809
4986
|
});
|
|
4810
|
-
const merged = [
|
|
4987
|
+
const merged = [
|
|
4988
|
+
...params.globalStats ?? [],
|
|
4989
|
+
...params.stats ?? [],
|
|
4990
|
+
...defaults
|
|
4991
|
+
];
|
|
4811
4992
|
return merged.length > 0 ? merged : void 0;
|
|
4812
4993
|
}
|
|
4813
4994
|
function appendDefaultCharts(params) {
|
|
@@ -4815,6 +4996,7 @@ function appendDefaultCharts(params) {
|
|
|
4815
4996
|
const defaults = [];
|
|
4816
4997
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4817
4998
|
heading: "LLM Cost",
|
|
4999
|
+
hideIfNoValue: true,
|
|
4818
5000
|
type: "area",
|
|
4819
5001
|
metrics: [{
|
|
4820
5002
|
source: "column",
|
|
@@ -4856,6 +5038,7 @@ function appendDefaultCharts(params) {
|
|
|
4856
5038
|
].filter((metric) => metric !== null);
|
|
4857
5039
|
if (tokenMetrics.length > 0) defaults.push({
|
|
4858
5040
|
heading: "LLM Tokens",
|
|
5041
|
+
hideIfNoValue: true,
|
|
4859
5042
|
type: "bar",
|
|
4860
5043
|
metrics: tokenMetrics,
|
|
4861
5044
|
tooltipExtras: activeKeys.has("totalTokens") ? [{
|
|
@@ -4872,11 +5055,13 @@ function resolveEvalDefaultConfig(params) {
|
|
|
4872
5055
|
const evalRemove = params.evalDef.removeDefaultConfig;
|
|
4873
5056
|
return {
|
|
4874
5057
|
columns: mergeDefaultColumns({
|
|
5058
|
+
globalColumns: params.globalColumns,
|
|
4875
5059
|
columns: params.evalDef.columns,
|
|
4876
5060
|
globalRemove: params.globalRemove,
|
|
4877
5061
|
evalRemove
|
|
4878
5062
|
}),
|
|
4879
5063
|
stats: appendDefaultStats({
|
|
5064
|
+
globalStats: params.globalStats,
|
|
4880
5065
|
stats: params.evalDef.stats,
|
|
4881
5066
|
globalRemove: params.globalRemove,
|
|
4882
5067
|
evalRemove
|
|
@@ -5214,6 +5399,65 @@ function isFile(value) {
|
|
|
5214
5399
|
return value instanceof File;
|
|
5215
5400
|
}
|
|
5216
5401
|
//#endregion
|
|
5402
|
+
//#region ../runner/src/traceDisplay.ts
|
|
5403
|
+
/**
 * Check whether a value is a non-null object (arrays included).
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} `true` for any `typeof "object"` value except `null`.
 */
function isRecord$1(value) {
  if (value === null) return false;
  return typeof value === "object";
}
|
|
5406
|
+
/**
 * Return a copy of `value` with `attributeValue` written at a dotted path.
 *
 * The root and every intermediate object along the path are shallow-copied,
 * so the input is not mutated. An intermediate that is missing or is not an
 * object is replaced by a fresh empty object.
 *
 * @param {object | undefined} value - Existing attributes, if any.
 * @param {string} path - Dot-separated destination path.
 * @param {unknown} attributeValue - Value to store at the final segment.
 * @returns {object} The new root object.
 */
function mergeNestedAttribute(value, path, attributeValue) {
  const root = value === void 0 ? {} : { ...value };
  const parentSegments = path.split(".");
  // `split` always yields at least one element, so the leaf is always defined.
  const leafSegment = parentSegments.pop();
  let cursor = root;
  for (const segment of parentSegments) {
    const existing = cursor[segment];
    const child = isRecord$1(existing) ? { ...existing } : {};
    cursor[segment] = child;
    cursor = child;
  }
  cursor[leafSegment] = attributeValue;
  return root;
}
|
|
5422
|
+
/**
 * Combine global and per-eval trace display settings and apply any attribute
 * transforms to a copy of the spans.
 *
 * Attributes are merged by `key ?? path`; an eval-level attribute replaces a
 * global one with the same identity while keeping the global's position.
 * An attribute with a `transform` is re-pointed at
 * `__display.<key ?? path>`. For every span whose source value is defined,
 * the transform result (when itself defined) is written at that display path.
 * Spans and their `attributes` objects are shallow-copied before any writes.
 *
 * @param {object[]} spans - Trace spans to present.
 * @param {object | undefined} globalTraceDisplay - Config-level display settings.
 * @param {object | undefined} evalTraceDisplay - Eval-level display settings.
 * @returns {{ trace: object[], traceDisplay: { attributes: object[] } }}
 */
function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
  const attributesById = /* @__PURE__ */ new Map();
  for (const display of [globalTraceDisplay, evalTraceDisplay]) {
    for (const attribute of display?.attributes ?? []) {
      attributesById.set(attribute.key ?? attribute.path, attribute);
    }
  }
  const trace = spans.map((span) => ({
    ...span,
    attributes: span.attributes === void 0 ? void 0 : { ...span.attributes }
  }));
  const attributes = [];
  for (const attribute of attributesById.values()) {
    const hasTransform = Boolean(attribute.transform);
    const displayPath = hasTransform ? `__display.${attribute.key ?? attribute.path}` : attribute.path;
    attributes.push({
      key: attribute.key,
      path: displayPath,
      label: attribute.label,
      format: attribute.format,
      numberFormat: attribute.numberFormat,
      placements: attribute.placements,
      scope: attribute.scope,
      mode: attribute.mode
    });
    if (hasTransform) {
      for (const span of trace) {
        const sourceValue = getNestedAttribute(span.attributes, attribute.path);
        if (sourceValue !== void 0) {
          // Called as a method so the transform keeps `attribute` as its receiver.
          const displayValue = attribute.transform({
            value: sourceValue,
            span
          });
          if (displayValue !== void 0) {
            span.attributes = mergeNestedAttribute(span.attributes, displayPath, displayValue);
          }
        }
      }
    }
  }
  return {
    trace,
    traceDisplay: { attributes }
  };
}
|
|
5460
|
+
//#endregion
|
|
5217
5461
|
//#region ../runner/src/runMaintenance.ts
|
|
5218
5462
|
async function persistRunState(runState) {
|
|
5219
5463
|
await writeFile(join(runState.runDir, "summary.json"), JSON.stringify(runState.summary, null, 2));
|
|
@@ -5532,63 +5776,10 @@ async function runWithModuleIsolation(context, fn) {
|
|
|
5532
5776
|
return await isolationStorage.run(context, fn);
|
|
5533
5777
|
}
|
|
5534
5778
|
//#endregion
|
|
5535
|
-
//#region ../runner/src/
|
|
5536
|
-
|
|
5537
|
-
|
|
5538
|
-
|
|
5539
|
-
function mergeNestedAttribute(value, path, attributeValue) {
|
|
5540
|
-
const root = value === void 0 ? {} : { ...value };
|
|
5541
|
-
const parts = path.split(".");
|
|
5542
|
-
let current = root;
|
|
5543
|
-
for (const [index, part] of parts.entries()) {
|
|
5544
|
-
if (index === parts.length - 1) {
|
|
5545
|
-
current[part] = attributeValue;
|
|
5546
|
-
continue;
|
|
5547
|
-
}
|
|
5548
|
-
const nextValue = current[part];
|
|
5549
|
-
const nextRecord = isRecord$1(nextValue) ? { ...nextValue } : {};
|
|
5550
|
-
current[part] = nextRecord;
|
|
5551
|
-
current = nextRecord;
|
|
5552
|
-
}
|
|
5553
|
-
return root;
|
|
5554
|
-
}
|
|
5555
|
-
function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
|
|
5556
|
-
const merged = /* @__PURE__ */ new Map();
|
|
5557
|
-
for (const attribute of globalTraceDisplay?.attributes ?? []) merged.set(attribute.key ?? attribute.path, attribute);
|
|
5558
|
-
for (const attribute of evalTraceDisplay?.attributes ?? []) merged.set(attribute.key ?? attribute.path, attribute);
|
|
5559
|
-
const resolvedAttributes = [];
|
|
5560
|
-
const transformedTrace = spans.map((span) => ({
|
|
5561
|
-
...span,
|
|
5562
|
-
attributes: span.attributes === void 0 ? void 0 : { ...span.attributes }
|
|
5563
|
-
}));
|
|
5564
|
-
for (const attribute of merged.values()) {
|
|
5565
|
-
const resolvedPath = attribute.transform ? `__display.${attribute.key ?? attribute.path}` : attribute.path;
|
|
5566
|
-
resolvedAttributes.push({
|
|
5567
|
-
key: attribute.key,
|
|
5568
|
-
path: resolvedPath,
|
|
5569
|
-
label: attribute.label,
|
|
5570
|
-
format: attribute.format,
|
|
5571
|
-
numberFormat: attribute.numberFormat,
|
|
5572
|
-
placements: attribute.placements,
|
|
5573
|
-
scope: attribute.scope,
|
|
5574
|
-
mode: attribute.mode
|
|
5575
|
-
});
|
|
5576
|
-
if (!attribute.transform) continue;
|
|
5577
|
-
for (const span of transformedTrace) {
|
|
5578
|
-
const sourceValue = getNestedAttribute(span.attributes, attribute.path);
|
|
5579
|
-
if (sourceValue === void 0) continue;
|
|
5580
|
-
const transformedValue = attribute.transform({
|
|
5581
|
-
value: sourceValue,
|
|
5582
|
-
span
|
|
5583
|
-
});
|
|
5584
|
-
if (transformedValue === void 0) continue;
|
|
5585
|
-
span.attributes = mergeNestedAttribute(span.attributes, resolvedPath, transformedValue);
|
|
5586
|
-
}
|
|
5587
|
-
}
|
|
5588
|
-
return {
|
|
5589
|
-
trace: transformedTrace,
|
|
5590
|
-
traceDisplay: { attributes: resolvedAttributes }
|
|
5591
|
-
};
|
|
5779
|
+
//#region ../runner/src/stackFormatting.ts
|
|
5780
|
+
/**
 * Matches a bracketed SGR parameter sequence that has no leading escape
 * character, such as `[m`, `[0m` or `[1;31m`.
 */
const orphanedAnsiSgrPattern = /\[(?:\d{1,3}(?:;\d{1,3})*)?m/g;
/**
 * Remove terminal control sequences from a string.
 *
 * Runs Node's `stripVTControlCharacters` first, then deletes any remaining
 * escape-less SGR fragments matched by `orphanedAnsiSgrPattern`.
 *
 * @param {string} value - Text that may contain terminal control codes.
 * @returns {string} The text with those sequences removed.
 */
function stripTerminalControlCodes(value) {
  const withoutEscapes = stripVTControlCharacters(value);
  return withoutEscapes.replace(orphanedAnsiSgrPattern, "");
}
|
|
5593
5784
|
//#endregion
|
|
5594
5785
|
//#region ../runner/src/runExecution.ts
|
|
@@ -5620,8 +5811,54 @@ function buildScopedEvalIdPrefix(params) {
|
|
|
5620
5811
|
/**
 * Invoke `fn` with the given argument list and an undefined receiver,
 * awaiting whatever it returns.
 *
 * @param {Function} fn - Function to call.
 * @param {unknown[]} args - Arguments to pass.
 * @returns {Promise<unknown>} The awaited result of the call.
 */
async function callWithUnknownResult(fn, args) {
  const outcome = await Reflect.apply(fn, void 0, args);
  return outcome;
}
|
|
5814
|
+
/**
 * Call a value that is expected, but not guaranteed, to be a function.
 *
 * @param {unknown} fn - Candidate function.
 * @param {unknown[]} args - Arguments to pass.
 * @returns {Promise<unknown>} The awaited result of the call.
 * @throws {Error} "Expected a function" when `fn` is not callable.
 */
async function callUnknownFunction(fn, args) {
  if (typeof fn === "function") {
    const outcome = await Reflect.apply(fn, void 0, args);
    return outcome;
  }
  throw new Error("Expected a function");
}
|
|
5818
|
+
/**
 * Copy derived values into `outputs` without overwriting keys already set.
 *
 * Only own properties of `outputs` count as "already set". An inherited
 * member such as `toString` or `constructor` does not block a derived value
 * with the same name.
 *
 * @param {{ outputs: object, derived: object }} params
 * @param {object} params.outputs - Target object, mutated in place.
 * @param {object} params.derived - Values to add where no own key exists.
 * @returns {void}
 */
function assignDerivedOutputs(params) {
  for (const [key, value] of Object.entries(params.derived)) {
    if (Object.hasOwn(params.outputs, key)) continue;
    // defineProperty creates a plain own data property even for `__proto__`,
    // where ordinary assignment would invoke the prototype setter instead.
    Object.defineProperty(params.outputs, key, {
      value,
      writable: true,
      enumerable: true,
      configurable: true
    });
  }
}
|
|
5824
|
+
/**
 * Evaluate a `deriveFromTracing` config against a case's trace.
 *
 * The config is either a single function returning an object of derived
 * values, or an object whose values are called one at a time, in key order.
 * Each callable receives `{ trace, input, case }`. In the object form,
 * results that are `undefined` are omitted.
 *
 * @param {object} params
 * @param {Function | object} params.deriveFromTracing - Derivation config.
 * @param {unknown} params.traceTree - Trace passed to callables as `trace`.
 * @param {object} params.evalCase - Case passed as `case`; its `input` is passed as `input`.
 * @returns {Promise<object>} The derived values.
 * @throws {Error} When the function form returns something `isRecord` rejects,
 *   or when an entry is not a function.
 */
async function resolveDeriveFromTracingConfig(params) {
  const { deriveFromTracing, traceTree, evalCase } = params;
  const ctx = {
    trace: traceTree,
    input: evalCase.input,
    case: evalCase
  };
  if (typeof deriveFromTracing !== "function") {
    const collected = {};
    // Awaited one by one so the computations run in key order.
    for (const [key, compute] of Object.entries(deriveFromTracing)) {
      const value = await callUnknownFunction(compute, [ctx]);
      if (value === void 0) continue;
      collected[key] = value;
    }
    return collected;
  }
  const result = await callUnknownFunction(deriveFromTracing, [ctx]);
  if (isRecord(result)) return result;
  throw new Error("deriveFromTracing must return an object");
}
|
|
5842
|
+
/**
 * Run one `deriveFromTracing` config for a case and fold its results into the
 * scope's outputs.
 *
 * Does nothing when no config is supplied. The derivation runs inside the
 * existing eval scope under the "derive" label. Any error it throws is
 * recorded on `scope.assertionFailures` rather than propagated.
 *
 * @param {object} params
 * @param {Function | object | undefined} params.deriveFromTracing - Config to run.
 * @param {object} params.scope - Eval scope providing `outputs` and `assertionFailures`.
 * @param {unknown} params.traceTree - Trace handed to the derivation.
 * @param {object} params.evalCase - Case handed to the derivation.
 * @returns {Promise<void>}
 */
async function runDeriveFromTracingConfig(params) {
  const { deriveFromTracing, scope } = params;
  if (deriveFromTracing === void 0) return;
  try {
    const derive = async () => {
      return await resolveDeriveFromTracingConfig({
        deriveFromTracing,
        traceTree: params.traceTree,
        evalCase: params.evalCase
      });
    };
    const derived = await runInExistingEvalScope(scope, "derive", derive);
    assignDerivedOutputs({
      outputs: scope.outputs,
      derived
    });
  } catch (e) {
    const isError = e instanceof Error;
    const message = `deriveFromTracing threw: ${isError ? e.message : String(e)}`;
    scope.assertionFailures.push(toAssertionFailure(message, isError ? e : void 0));
  }
}
|
|
5623
5860
|
async function runCase(params) {
|
|
5624
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode,
|
|
5861
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5625
5862
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
5626
5863
|
evalId,
|
|
5627
5864
|
evalFilePath,
|
|
@@ -5653,7 +5890,8 @@ async function runCase(params) {
|
|
|
5653
5890
|
adapter: cacheAdapter,
|
|
5654
5891
|
mode: cacheMode,
|
|
5655
5892
|
evalId,
|
|
5656
|
-
|
|
5893
|
+
read: evalDef.cache?.read,
|
|
5894
|
+
store: evalDef.cache?.store
|
|
5657
5895
|
} : void 0,
|
|
5658
5896
|
startTime: evalDef.startTime,
|
|
5659
5897
|
freezeTime: evalDef.freezeTime
|
|
@@ -5666,22 +5904,19 @@ async function runCase(params) {
|
|
|
5666
5904
|
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
5667
5905
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5668
5906
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5669
|
-
if (!nonAssertError
|
|
5670
|
-
|
|
5671
|
-
|
|
5672
|
-
|
|
5673
|
-
|
|
5674
|
-
|
|
5675
|
-
|
|
5676
|
-
|
|
5677
|
-
|
|
5678
|
-
|
|
5679
|
-
|
|
5680
|
-
|
|
5681
|
-
}
|
|
5682
|
-
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
5683
|
-
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
5684
|
-
}
|
|
5907
|
+
if (!nonAssertError) {
|
|
5908
|
+
await runDeriveFromTracingConfig({
|
|
5909
|
+
deriveFromTracing: globalDeriveFromTracing,
|
|
5910
|
+
scope,
|
|
5911
|
+
traceTree,
|
|
5912
|
+
evalCase
|
|
5913
|
+
});
|
|
5914
|
+
await runDeriveFromTracingConfig({
|
|
5915
|
+
deriveFromTracing: evalDef.deriveFromTracing,
|
|
5916
|
+
scope,
|
|
5917
|
+
traceTree,
|
|
5918
|
+
evalCase
|
|
5919
|
+
});
|
|
5685
5920
|
}
|
|
5686
5921
|
if (!nonAssertError) addDefaultOutputs({
|
|
5687
5922
|
outputs: scope.outputs,
|
|
@@ -5698,7 +5933,7 @@ async function runCase(params) {
|
|
|
5698
5933
|
...scope.outputs,
|
|
5699
5934
|
...parsedOutputs.data
|
|
5700
5935
|
};
|
|
5701
|
-
else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error)));
|
|
5936
|
+
else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
|
|
5702
5937
|
}
|
|
5703
5938
|
const scoreResults = /* @__PURE__ */ new Map();
|
|
5704
5939
|
const scoringTraces = {};
|
|
@@ -5721,7 +5956,8 @@ async function runCase(params) {
|
|
|
5721
5956
|
adapter: cacheAdapter,
|
|
5722
5957
|
mode: cacheMode,
|
|
5723
5958
|
evalId: `${evalId}__score__${key}`,
|
|
5724
|
-
|
|
5959
|
+
read: evalDef.cache?.read,
|
|
5960
|
+
store: evalDef.cache?.store
|
|
5725
5961
|
} : void 0,
|
|
5726
5962
|
startTime: scoreStartTime,
|
|
5727
5963
|
freezeTime: evalDef.freezeTime
|
|
@@ -5776,6 +6012,7 @@ async function runCase(params) {
|
|
|
5776
6012
|
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
|
|
5777
6013
|
const columns = {};
|
|
5778
6014
|
const columnOverrides = mergeDefaultColumns({
|
|
6015
|
+
globalColumns,
|
|
5779
6016
|
columns: evalDef.columns,
|
|
5780
6017
|
globalRemove: globalRemoveDefaultConfig,
|
|
5781
6018
|
evalRemove: evalDef.removeDefaultConfig
|
|
@@ -5839,14 +6076,17 @@ function formatOutputsSchemaError(error) {
|
|
|
5839
6076
|
const issueLines = error.issues.map((issue) => {
|
|
5840
6077
|
return `${issue.path.length > 0 ? issue.path.join(".") : "<root>"}: ${issue.message}`;
|
|
5841
6078
|
});
|
|
5842
|
-
if (issueLines.length === 0) return "
|
|
5843
|
-
return
|
|
6079
|
+
if (issueLines.length === 0) return "outputs did not match the configured schema";
|
|
6080
|
+
return issueLines.join("\n");
|
|
5844
6081
|
}
|
|
5845
|
-
function toAssertionFailure(message, error = void 0) {
|
|
5846
|
-
|
|
6082
|
+
function toAssertionFailure(message, error = void 0, nameOverride = void 0) {
|
|
6083
|
+
const name = nameOverride ?? error?.name;
|
|
6084
|
+
const stack = error?.stack ? stripTerminalControlCodes(error.stack) : void 0;
|
|
6085
|
+
return {
|
|
6086
|
+
...name !== void 0 ? { name } : {},
|
|
5847
6087
|
message,
|
|
5848
|
-
stack:
|
|
5849
|
-
}
|
|
6088
|
+
...stack !== void 0 ? { stack } : {}
|
|
6089
|
+
};
|
|
5850
6090
|
}
|
|
5851
6091
|
//#endregion
|
|
5852
6092
|
//#region ../runner/src/runQueue.ts
|
|
@@ -6076,15 +6316,15 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6076
6316
|
const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
|
|
6077
6317
|
for (const evalMeta of targetEvals) {
|
|
6078
6318
|
const evalFilePath = evalMeta.sourceFilePath;
|
|
6079
|
-
let
|
|
6319
|
+
let sourceFingerprint = "";
|
|
6080
6320
|
try {
|
|
6081
|
-
|
|
6321
|
+
sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
|
|
6082
6322
|
} catch {
|
|
6083
|
-
|
|
6323
|
+
sourceFingerprint = "";
|
|
6084
6324
|
}
|
|
6085
|
-
if (
|
|
6086
|
-
runState.manifest.evalSourceFingerprints[evalMeta.key] =
|
|
6087
|
-
evalMeta.sourceFingerprint =
|
|
6325
|
+
if (sourceFingerprint.length > 0) {
|
|
6326
|
+
runState.manifest.evalSourceFingerprints[evalMeta.key] = sourceFingerprint;
|
|
6327
|
+
evalMeta.sourceFingerprint = sourceFingerprint;
|
|
6088
6328
|
} else {
|
|
6089
6329
|
delete runState.manifest.evalSourceFingerprints[evalMeta.key];
|
|
6090
6330
|
evalMeta.sourceFingerprint = null;
|
|
@@ -6093,7 +6333,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6093
6333
|
const registry = getEvalRegistry();
|
|
6094
6334
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
6095
6335
|
await runInEvalRuntimeScope("env", async () => {
|
|
6096
|
-
await loadEvalModule(evalFilePath,
|
|
6336
|
+
await loadEvalModule(evalFilePath, sourceFingerprint);
|
|
6097
6337
|
});
|
|
6098
6338
|
});
|
|
6099
6339
|
const entry = registry.get(evalMeta.id);
|
|
@@ -6117,6 +6357,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6117
6357
|
runState.summary.totalCases += cases.length;
|
|
6118
6358
|
const defaultConfig = resolveEvalDefaultConfig({
|
|
6119
6359
|
evalDef,
|
|
6360
|
+
globalColumns: config.columns,
|
|
6361
|
+
globalStats: config.stats,
|
|
6120
6362
|
globalRemove: config.removeDefaultConfig
|
|
6121
6363
|
});
|
|
6122
6364
|
const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
@@ -6162,6 +6404,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6162
6404
|
evalKey: evalMeta.key,
|
|
6163
6405
|
evalCase,
|
|
6164
6406
|
globalTraceDisplay,
|
|
6407
|
+
globalColumns: config.columns,
|
|
6408
|
+
globalDeriveFromTracing: config.deriveFromTracing,
|
|
6165
6409
|
llmCallsConfig,
|
|
6166
6410
|
apiCallsConfig,
|
|
6167
6411
|
globalRemoveDefaultConfig: config.removeDefaultConfig,
|
|
@@ -6169,7 +6413,6 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6169
6413
|
startTime,
|
|
6170
6414
|
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
6171
6415
|
cacheMode,
|
|
6172
|
-
codeFingerprint,
|
|
6173
6416
|
moduleIsolation,
|
|
6174
6417
|
evalFilePath,
|
|
6175
6418
|
evalFileRelativePath: evalMeta.filePath,
|
|
@@ -6324,4 +6567,4 @@ function toLastRunStatus(status) {
|
|
|
6324
6567
|
return status === "pending" ? null : status;
|
|
6325
6568
|
}
|
|
6326
6569
|
//#endregion
|
|
6327
|
-
export {
|
|
6570
|
+
export { llmCallMetricFormatSchema as $, traceAttributeDisplayPlacementSchema as $t, extractCacheHits as A, advanceEvalTime as An, evalChartBuiltinMetricSchema as At, runManifestSchema as B, mergeEvalOutput as Bn, cacheEntryWithDebugKeySchema as Bt, normalizeScoreDef as C, deserializeCacheRecording as Cn, runLogEntrySchema as Ct, updateManualScoreRequestSchema as D, repoFile as Dn, scoreTraceSchema as Dt, createRunRequestSchema as E, serializeCacheValue as En, runLogPhaseSchema as Et, getEvalTitle as F, getCurrentScope as Fn, evalChartTypeSchema as Ft, apiCallMetricFormatSchema as G, setEvalOutput as Gn, cacheRecordingOpSchema as Gt, DEFAULT_API_CALLS_CONFIG as H, runInEvalRuntimeScope as Hn, cacheListItemSchema as Ht, getEvalDisplayStatus as I, getEvalCaseInput as In, evalChartsConfigSchema as It, apiCallsConfigSchema as J, defineEval as Jn, serializedCacheSpanSchema as Jt, apiCallMetricPlacementSchema as K, setScopeCacheContext as Kn, cacheRecordingSchema as Kt, deriveScopedSummaryFromCases as L, getEvalStartTime as Ln, cacheDebugKeyEntrySchema as Lt, extractLlmCalls as M, configureEvalRunLogs as Mn, evalChartConfigSchema as Mt, applyDerivedCallAttributes as N, evalAssert as Nn, evalChartMetricSchema as Nt, sseEnvelopeSchema as O, evalExpect as On, evalChartAggregateSchema as Ot, getNestedAttribute as P, evalLog as Pn, evalChartTooltipExtraSchema as Pt, evalDeriveConfigSchema as Q, traceAttributeDisplayInputSchema as Qt, deriveStatusFromCaseRows as R, incrementEvalOutput as Rn, cacheDebugKeyFileSchema as Rt, buildDeclaredColumnDefs as S, hashCacheKeySync as Sn, evalSummarySchema as St, createFsCacheStore as T, serializeCacheRecording as Tn, runLogLocationSchema as Tt, DEFAULT_LLM_CALLS_CONFIG as U, runInEvalScope as Un, cacheModeSchema as Ut, runSummarySchema as V, nextEvalId as Vn, cacheFileSchema as Vt, agentEvalsConfigSchema as W, runInExistingEvalScope as Wn, cacheOperationTypeSchema as Wt, evalColumnOverrideSchema as X, traceCacheRefSchema as Xt, 
defaultConfigKeySchema as Y, getEvalRegistry as Yn, spanCacheOptionsSchema as Yt, evalColumnsSchema as Z, traceAttributeDisplayFormatSchema as Zt, deriveEvalFreshness as _, buildTraceTree as _n, discoveryIssueSchema as _t, getLastRunStatuses as a, traceSpanSchema as an, removeDefaultConfigSchema as at, resolveEvalDefaultConfig as b, evalTracer as bn, evalStatItemSchema as bt, loadPersistedRunSnapshots as c, columnDefSchema as cn, runLogsConfigSchema as ct, persistRunState as d, fileRefSchema as dn, buildEvalKey as dt, traceAttributeDisplaySchema as en, llmCallMetricPlacementSchema as et, recomputeEvalStatusesInRuns as f, jsonCellSchema as fn, getCaseRowCaseKey as ft, resolveArtifactPath as g, z$1 as gn, caseRowSchema as gt, resolveTracePresentation as h, runArtifactRefSchema as hn, caseDetailSchema as ht, generateRunId as i, traceSpanKindSchema as in, llmCallsConfigSchema as it, extractApiCalls as j, appendToEvalOutput as jn, evalChartColorSchema as jt, extractCacheEntries as k, EvalAssertionError as kn, evalChartAxisSchema as kt, nextShortIdFromSnapshots as l, columnFormatSchema as ln, trialSelectionModeSchema as lt, runTouchesEval as m, repoFileRefSchema as mn, assertionFailureSchema as mt, getTargetEvalKeys as n, traceDisplayInputConfigSchema as nn, llmCallPricingRateSchema as nt, getLatestRunInfos as o, traceSpanWarningSchema as on, resolveApiCallsConfig as ot, recomputePersistedCaseStatus as p, numberDisplayOptionsSchema as pn, getCaseRowEvalKey as pt, apiCallMetricSchema as q, startEvalBackgroundJob as qn, cacheStatusSchema as qt, getTargetEvals as r, traceSpanErrorSchema as rn, llmCallPricingSchema as rt, loadPersistedRunSnapshot as s, cellValueSchema as sn, resolveLlmCallsConfig as st, executeRun as t, traceDisplayConfigSchema as tn, llmCallMetricSchema as tt, persistCaseDetail as u, columnKindSchema as un, buildCaseKey as ut, loadEvalModule as v, captureEvalSpanError as vn, evalFreshnessStatusSchema as vt, validateCharts as w, deserializeCacheValue as wn, 
runLogLevelSchema as wt, loadConfig as x, hashCacheKey as xn, evalStatsConfigSchema as xt, parseEvalDiscovery as y, evalSpan as yn, evalStatAggregateSchema as yt, deriveStatusFromChildStatuses as z, isInEvalScope as zn, cacheEntrySchema as zt };
|