@ls-stack/agent-eval 0.27.1 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-CJj1yPPD.mjs → app-mBbAN-Gt.mjs} +15 -3
- package/dist/apps/web/dist/assets/index-8VE7b6RK.css +1 -0
- package/dist/apps/web/dist/assets/index-Czer_MdN.js +118 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-Clf8xUFa.mjs → cli-BQwRbqsL.mjs} +75 -4
- package/dist/index.d.mts +342 -90
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-FEvBwwJI.mjs → runOrchestration-ClWYWPen.mjs} +428 -204
- package/dist/{runner-KbDKLSU4.mjs → runner-BQn_xf36.mjs} +1 -1
- package/dist/{runner-zqKwTlNj.mjs → runner-DbVB66h9.mjs} +2 -2
- package/dist/src-CuirVcPY.mjs +3 -0
- package/package.json +6 -4
- package/skills/agent-eval/SKILL.md +52 -20
- package/dist/apps/web/dist/assets/index-6YqV9t4k.js +0 -118
- package/dist/apps/web/dist/assets/index-C-OiMSQD.css +0 -1
- package/dist/bin.d.mts +0 -1
- package/dist/runChild.d.mts +0 -1
- package/dist/src-BBwT7_cy.mjs +0 -3
|
@@ -531,10 +531,13 @@ function recordOpIfActive(scope, op) {
|
|
|
531
531
|
if (top) top.ops.push(op);
|
|
532
532
|
}
|
|
533
533
|
function toAssertionFailure$1(message, error = void 0) {
|
|
534
|
-
|
|
534
|
+
const name = error?.name;
|
|
535
|
+
const stack = error?.stack ? stripTerminalControlCodes$1(error.stack) : void 0;
|
|
536
|
+
return {
|
|
537
|
+
...name !== void 0 ? { name } : {},
|
|
535
538
|
message,
|
|
536
|
-
stack:
|
|
537
|
-
}
|
|
539
|
+
...stack !== void 0 ? { stack } : {}
|
|
540
|
+
};
|
|
538
541
|
}
|
|
539
542
|
/**
|
|
540
543
|
* Record or replace an output value for the current case scope.
|
|
@@ -802,7 +805,8 @@ function repoFile(path, mimeType) {
|
|
|
802
805
|
}
|
|
803
806
|
//#endregion
|
|
804
807
|
//#region ../sdk/src/cacheSerialization.ts
|
|
805
|
-
const serializedCacheValueMarker = "
|
|
808
|
+
const serializedCacheValueMarker = "__aecs";
|
|
809
|
+
const legacySerializedCacheValueMarker = "__agentEvalsCacheSerialization";
|
|
806
810
|
const jsonSafeCacheValueVersion = "json-safe-v1";
|
|
807
811
|
const packedNumberArrayMinLength = 128;
|
|
808
812
|
const compressedStringMinBytes = 16 * 1024;
|
|
@@ -812,7 +816,7 @@ function isRecordLike$3(value) {
|
|
|
812
816
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
813
817
|
}
|
|
814
818
|
function isJsonSafeSerializedCacheValue(value) {
|
|
815
|
-
return isRecordLike$3(value) && value
|
|
819
|
+
return isRecordLike$3(value) && serializationMarkerValue(value) === jsonSafeCacheValueVersion && typeof value.type === "string";
|
|
816
820
|
}
|
|
817
821
|
function jsonSafeValue(type, value) {
|
|
818
822
|
return value === void 0 ? {
|
|
@@ -825,32 +829,39 @@ function jsonSafeValue(type, value) {
|
|
|
825
829
|
};
|
|
826
830
|
}
|
|
827
831
|
function hasSerializationMarkerKey(value) {
|
|
828
|
-
return Object.hasOwn(value, serializedCacheValueMarker);
|
|
832
|
+
return Object.hasOwn(value, serializedCacheValueMarker) || Object.hasOwn(value, legacySerializedCacheValueMarker);
|
|
833
|
+
}
|
|
834
|
+
function serializationMarkerValue(value) {
|
|
835
|
+
return value[serializedCacheValueMarker] ?? value[legacySerializedCacheValueMarker];
|
|
829
836
|
}
|
|
830
837
|
/**
|
|
831
838
|
* Serialize one cached value while keeping plain JSON as plain JSON.
|
|
832
839
|
*
|
|
833
|
-
* Rich runtime values use small tagged wrappers.
|
|
840
|
+
* Rich runtime values use small tagged wrappers. Undefined values are omitted
|
|
841
|
+
* by default; pass `preserveUndefined: true` to round-trip them explicitly.
|
|
834
842
|
*/
|
|
835
|
-
async function serializeCacheValue(value) {
|
|
836
|
-
return serializeJsonSafeValue(value, /* @__PURE__ */ new WeakSet(), 0);
|
|
843
|
+
async function serializeCacheValue(value, options = void 0) {
|
|
844
|
+
return serializeJsonSafeValue(value, /* @__PURE__ */ new WeakSet(), 0, normalizeCacheSerializationOptions(options));
|
|
837
845
|
}
|
|
838
846
|
/** Revive one cached value, while preserving legacy JSON-round-tripped data. */
|
|
839
847
|
function deserializeCacheValue(value) {
|
|
840
848
|
return deserializeJsonSafeValue(value);
|
|
841
849
|
}
|
|
842
850
|
/** Clone one value through the same serialization path used for cache data. */
|
|
843
|
-
async function cloneCacheValue(value) {
|
|
844
|
-
return deserializeCacheValue(await serializeCacheValue(value));
|
|
851
|
+
async function cloneCacheValue(value, options = void 0) {
|
|
852
|
+
return deserializeCacheValue(await serializeCacheValue(value, options));
|
|
853
|
+
}
|
|
854
|
+
function normalizeCacheSerializationOptions(options) {
|
|
855
|
+
return { preserveUndefined: options?.preserveUndefined === true };
|
|
845
856
|
}
|
|
846
|
-
async function serializeJsonSafeValue(value, refs, depth) {
|
|
847
|
-
if (value === void 0) return jsonSafeValue("Undefined");
|
|
857
|
+
async function serializeJsonSafeValue(value, refs, depth, config) {
|
|
858
|
+
if (value === void 0) return config.preserveUndefined ? jsonSafeValue("Undefined") : void 0;
|
|
848
859
|
if (typeof value === "bigint") return jsonSafeValue("BigInt", value.toString());
|
|
849
860
|
if (typeof value === "number") return serializeNumber(value);
|
|
850
861
|
if (typeof value === "string") return serializeString(value, depth);
|
|
851
862
|
if (value instanceof Date) return jsonSafeValue("Date", value.toISOString());
|
|
852
|
-
if (value instanceof Map) return serializeMap(value, refs, depth);
|
|
853
|
-
if (value instanceof Set) return serializeSet(value, refs, depth);
|
|
863
|
+
if (value instanceof Map) return serializeMap(value, refs, depth, config);
|
|
864
|
+
if (value instanceof Set) return serializeSet(value, refs, depth, config);
|
|
854
865
|
if (value instanceof RegExp) return jsonSafeValue("RegExp", {
|
|
855
866
|
flags: value.flags,
|
|
856
867
|
source: value.source
|
|
@@ -869,7 +880,7 @@ async function serializeJsonSafeValue(value, refs, depth) {
|
|
|
869
880
|
type: value.type
|
|
870
881
|
});
|
|
871
882
|
if (value instanceof ArrayBuffer) return jsonSafeValue("ArrayBuffer", bytesToBase64(new Uint8Array(value)));
|
|
872
|
-
if (value instanceof Error) return serializeError(value, refs, depth);
|
|
883
|
+
if (value instanceof Error) return serializeError(value, refs, depth, config);
|
|
873
884
|
if (!value || typeof value !== "object") return value;
|
|
874
885
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
875
886
|
refs.add(value);
|
|
@@ -882,12 +893,18 @@ async function serializeJsonSafeValue(value, refs, depth) {
|
|
|
882
893
|
}
|
|
883
894
|
}
|
|
884
895
|
const items = [];
|
|
885
|
-
for (const item of value)
|
|
896
|
+
for (const item of value) {
|
|
897
|
+
const serializedItem = await serializeJsonSafeValue(item, refs, depth + 1, config);
|
|
898
|
+
if (serializedItem !== void 0) items.push(serializedItem);
|
|
899
|
+
}
|
|
886
900
|
refs.delete(value);
|
|
887
901
|
return compressNestedJsonValue(items, depth) ?? items;
|
|
888
902
|
}
|
|
889
903
|
const entries = [];
|
|
890
|
-
for (const [key, entryValue] of Object.entries(value))
|
|
904
|
+
for (const [key, entryValue] of Object.entries(value)) {
|
|
905
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
906
|
+
if (serializedEntryValue !== void 0) entries.push([key, serializedEntryValue]);
|
|
907
|
+
}
|
|
891
908
|
refs.delete(value);
|
|
892
909
|
const serialized = hasSerializationMarkerKey(value) ? jsonSafeValue("Object", entries) : Object.fromEntries(entries);
|
|
893
910
|
return compressNestedJsonValue(serialized, depth) ?? serialized;
|
|
@@ -957,32 +974,40 @@ function compressNestedJsonValue(value, depth) {
|
|
|
957
974
|
function compressionIsWorthIt(value, rawSize) {
|
|
958
975
|
return Buffer$1.byteLength(JSON.stringify(value)) < rawSize * maxCompressedSizeRatio;
|
|
959
976
|
}
|
|
960
|
-
async function serializeMap(value, refs, depth) {
|
|
977
|
+
async function serializeMap(value, refs, depth, config) {
|
|
961
978
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
962
979
|
refs.add(value);
|
|
963
980
|
const entries = [];
|
|
964
|
-
for (const [key, entryValue] of value.entries())
|
|
981
|
+
for (const [key, entryValue] of value.entries()) {
|
|
982
|
+
const serializedKey = await serializeJsonSafeValue(key, refs, depth + 1, config);
|
|
983
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
984
|
+
if (serializedKey !== void 0 && serializedEntryValue !== void 0) entries.push([serializedKey, serializedEntryValue]);
|
|
985
|
+
}
|
|
965
986
|
refs.delete(value);
|
|
966
987
|
return jsonSafeValue("Map", entries);
|
|
967
988
|
}
|
|
968
|
-
async function serializeSet(value, refs, depth) {
|
|
989
|
+
async function serializeSet(value, refs, depth, config) {
|
|
969
990
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
970
991
|
refs.add(value);
|
|
971
992
|
const items = [];
|
|
972
|
-
for (const item of value.values())
|
|
993
|
+
for (const item of value.values()) {
|
|
994
|
+
const serializedItem = await serializeJsonSafeValue(item, refs, depth + 1, config);
|
|
995
|
+
if (serializedItem !== void 0) items.push(serializedItem);
|
|
996
|
+
}
|
|
973
997
|
refs.delete(value);
|
|
974
998
|
return jsonSafeValue("Set", items);
|
|
975
999
|
}
|
|
976
|
-
async function serializeError(value, refs, depth) {
|
|
1000
|
+
async function serializeError(value, refs, depth, config) {
|
|
977
1001
|
if (refs.has(value)) throw new Error("Circular cache values are not supported");
|
|
978
1002
|
refs.add(value);
|
|
979
1003
|
const props = [];
|
|
980
1004
|
for (const [key, entryValue] of Object.entries(value)) {
|
|
981
1005
|
if (key === "cause") continue;
|
|
982
|
-
|
|
1006
|
+
const serializedEntryValue = await serializeJsonSafeValue(entryValue, refs, depth + 1, config);
|
|
1007
|
+
if (serializedEntryValue !== void 0) props.push([key, serializedEntryValue]);
|
|
983
1008
|
}
|
|
984
1009
|
const serialized = jsonSafeValue("Error", {
|
|
985
|
-
cause: "cause" in value ? await serializeJsonSafeValue(value.cause, refs, depth + 1) : void 0,
|
|
1010
|
+
cause: "cause" in value ? await serializeJsonSafeValue(value.cause, refs, depth + 1, config) : void 0,
|
|
986
1011
|
message: value.message,
|
|
987
1012
|
name: value.name,
|
|
988
1013
|
props,
|
|
@@ -1123,33 +1148,36 @@ function deserializeError(value) {
|
|
|
1123
1148
|
});
|
|
1124
1149
|
return error;
|
|
1125
1150
|
}
|
|
1126
|
-
async function serializeRecordValues(record) {
|
|
1151
|
+
async function serializeRecordValues(record, config) {
|
|
1127
1152
|
const entries = [];
|
|
1128
|
-
for (const [key, value] of Object.entries(record))
|
|
1153
|
+
for (const [key, value] of Object.entries(record)) {
|
|
1154
|
+
const serializedValue = await serializeCacheValue(value, config);
|
|
1155
|
+
if (serializedValue !== void 0) entries.push([key, serializedValue]);
|
|
1156
|
+
}
|
|
1129
1157
|
return Object.fromEntries(entries);
|
|
1130
1158
|
}
|
|
1131
1159
|
function deserializeRecordValues(record) {
|
|
1132
1160
|
return Object.fromEntries(Object.entries(record).map(([key, value]) => [key, deserializeCacheValue(value)]));
|
|
1133
1161
|
}
|
|
1134
|
-
async function serializeCacheRecordingOp(op) {
|
|
1162
|
+
async function serializeCacheRecordingOp(op, config) {
|
|
1135
1163
|
switch (op.kind) {
|
|
1136
1164
|
case "setOutput":
|
|
1137
1165
|
case "appendOutput": return {
|
|
1138
1166
|
...op,
|
|
1139
|
-
value: await serializeCacheValue(op.value)
|
|
1167
|
+
value: await serializeCacheValue(op.value, config)
|
|
1140
1168
|
};
|
|
1141
1169
|
case "mergeOutput": return {
|
|
1142
1170
|
...op,
|
|
1143
|
-
patch: await serializeRecordValues(op.patch)
|
|
1171
|
+
patch: await serializeRecordValues(op.patch, config)
|
|
1144
1172
|
};
|
|
1145
1173
|
case "incrementOutput": return op;
|
|
1146
1174
|
case "checkpoint": return {
|
|
1147
1175
|
...op,
|
|
1148
|
-
data: await serializeCacheValue(op.data)
|
|
1176
|
+
data: await serializeCacheValue(op.data, config)
|
|
1149
1177
|
};
|
|
1150
1178
|
case "subSpan": return {
|
|
1151
1179
|
...op,
|
|
1152
|
-
span: await serializeCacheSpan(op.span)
|
|
1180
|
+
span: await serializeCacheSpan(op.span, config)
|
|
1153
1181
|
};
|
|
1154
1182
|
}
|
|
1155
1183
|
}
|
|
@@ -1175,11 +1203,11 @@ function deserializeCacheRecordingOp(op) {
|
|
|
1175
1203
|
};
|
|
1176
1204
|
}
|
|
1177
1205
|
}
|
|
1178
|
-
async function serializeCacheSpan(span) {
|
|
1206
|
+
async function serializeCacheSpan(span, config) {
|
|
1179
1207
|
return {
|
|
1180
1208
|
...span,
|
|
1181
|
-
attributes: span.attributes === void 0 ? void 0 : await serializeRecordValues(span.attributes),
|
|
1182
|
-
children: await Promise.all(span.children.map(serializeCacheSpan))
|
|
1209
|
+
attributes: span.attributes === void 0 ? void 0 : await serializeRecordValues(span.attributes, config),
|
|
1210
|
+
children: await Promise.all(span.children.map((child) => serializeCacheSpan(child, config)))
|
|
1183
1211
|
};
|
|
1184
1212
|
}
|
|
1185
1213
|
function deserializeCacheSpan(span) {
|
|
@@ -1189,13 +1217,19 @@ function deserializeCacheSpan(span) {
|
|
|
1189
1217
|
children: span.children.map(deserializeCacheSpan)
|
|
1190
1218
|
};
|
|
1191
1219
|
}
|
|
1192
|
-
/**
|
|
1193
|
-
|
|
1220
|
+
/**
|
|
1221
|
+
* Serialize all rich values captured in a cache recording before persistence.
|
|
1222
|
+
*
|
|
1223
|
+
* Undefined values are omitted by default; pass `preserveUndefined: true` to
|
|
1224
|
+
* retain the legacy explicit undefined wrappers in the recording payload.
|
|
1225
|
+
*/
|
|
1226
|
+
async function serializeCacheRecording(recording, options = void 0) {
|
|
1227
|
+
const config = normalizeCacheSerializationOptions(options);
|
|
1194
1228
|
return {
|
|
1195
1229
|
...recording,
|
|
1196
|
-
returnValue: await serializeCacheValue(recording.returnValue),
|
|
1197
|
-
finalAttributes: await serializeRecordValues(recording.finalAttributes),
|
|
1198
|
-
ops: await Promise.all(recording.ops.map(serializeCacheRecordingOp))
|
|
1230
|
+
returnValue: await serializeCacheValue(recording.returnValue, config),
|
|
1231
|
+
finalAttributes: await serializeRecordValues(recording.finalAttributes, config),
|
|
1232
|
+
ops: await Promise.all(recording.ops.map((op) => serializeCacheRecordingOp(op, config)))
|
|
1199
1233
|
};
|
|
1200
1234
|
}
|
|
1201
1235
|
/** Revive all rich values captured in a cache recording after lookup. */
|
|
@@ -1587,7 +1621,9 @@ function createTraceCache(generateSpanId) {
|
|
|
1587
1621
|
key: info.key
|
|
1588
1622
|
}, { serializeFileBytes: info.serializeFileBytes === true });
|
|
1589
1623
|
const activeSpan = scope.activeSpanStack.at(-1);
|
|
1590
|
-
|
|
1624
|
+
const canRead = cacheCtx.mode === "use" && cacheCtx.read !== false;
|
|
1625
|
+
const canStore = cacheCtx.mode !== "bypass" && cacheCtx.store !== false;
|
|
1626
|
+
if (canRead) {
|
|
1591
1627
|
const hit = await cacheCtx.adapter.lookup(namespace, keyHash);
|
|
1592
1628
|
if (hit) {
|
|
1593
1629
|
const storedAt = hit.storedAt;
|
|
@@ -1610,14 +1646,24 @@ function createTraceCache(generateSpanId) {
|
|
|
1610
1646
|
name: info.name,
|
|
1611
1647
|
namespace,
|
|
1612
1648
|
key: keyHash,
|
|
1613
|
-
status: "miss"
|
|
1649
|
+
status: "miss",
|
|
1650
|
+
...canStore ? {} : { stored: false }
|
|
1614
1651
|
});
|
|
1615
|
-
} else if (cacheCtx.mode === "
|
|
1652
|
+
} else if (cacheCtx.mode === "use" && canStore) recordCacheRef(scope, activeSpan, {
|
|
1616
1653
|
type: "value",
|
|
1617
1654
|
name: info.name,
|
|
1618
1655
|
namespace,
|
|
1619
1656
|
key: keyHash,
|
|
1620
|
-
status: "
|
|
1657
|
+
status: "miss",
|
|
1658
|
+
read: false
|
|
1659
|
+
});
|
|
1660
|
+
else if (cacheCtx.mode === "refresh") recordCacheRef(scope, activeSpan, {
|
|
1661
|
+
type: "value",
|
|
1662
|
+
name: info.name,
|
|
1663
|
+
namespace,
|
|
1664
|
+
key: keyHash,
|
|
1665
|
+
status: "refresh",
|
|
1666
|
+
...canStore ? {} : { stored: false }
|
|
1621
1667
|
});
|
|
1622
1668
|
else recordCacheRef(scope, activeSpan, {
|
|
1623
1669
|
type: "value",
|
|
@@ -1640,7 +1686,7 @@ function createTraceCache(generateSpanId) {
|
|
|
1640
1686
|
scope.recordingStack.pop();
|
|
1641
1687
|
}
|
|
1642
1688
|
appendSubSpanOps(scope, frame);
|
|
1643
|
-
if (
|
|
1689
|
+
if (canStore) {
|
|
1644
1690
|
const finalAttributes = diffNonCacheAttributes(beforeAttributes, await snapshotNonCacheAttributes(activeSpan));
|
|
1645
1691
|
const recording = {
|
|
1646
1692
|
returnValue: bodyResult,
|
|
@@ -1654,13 +1700,11 @@ function createTraceCache(generateSpanId) {
|
|
|
1654
1700
|
operationType: "value",
|
|
1655
1701
|
operationName: info.name,
|
|
1656
1702
|
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
1657
|
-
codeFingerprint: cacheCtx.codeFingerprint,
|
|
1658
1703
|
recording: await serializeCacheRecording(recording)
|
|
1659
1704
|
}, {
|
|
1660
1705
|
rawKey: info.key,
|
|
1661
1706
|
operationType: "value",
|
|
1662
|
-
operationName: info.name
|
|
1663
|
-
codeFingerprint: cacheCtx.codeFingerprint
|
|
1707
|
+
operationName: info.name
|
|
1664
1708
|
});
|
|
1665
1709
|
}
|
|
1666
1710
|
return bodyResult;
|
|
@@ -2031,11 +2075,13 @@ async function traceSpanInternal(info, fn) {
|
|
|
2031
2075
|
namespace,
|
|
2032
2076
|
key: cacheOpts.key
|
|
2033
2077
|
}, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
|
|
2078
|
+
const canRead = ctx.mode === "use" && ctx.read !== false;
|
|
2079
|
+
const canStore = ctx.mode !== "bypass" && ctx.store !== false;
|
|
2034
2080
|
mergeSpanAttributes(spanRecord, {
|
|
2035
2081
|
"cache.key": keyHash,
|
|
2036
2082
|
"cache.namespace": namespace
|
|
2037
2083
|
});
|
|
2038
|
-
if (
|
|
2084
|
+
if (canRead) {
|
|
2039
2085
|
const hit = await ctx.adapter.lookup(namespace, keyHash);
|
|
2040
2086
|
if (hit) {
|
|
2041
2087
|
const storedAt = hit.storedAt;
|
|
@@ -2050,8 +2096,18 @@ async function traceSpanInternal(info, fn) {
|
|
|
2050
2096
|
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
2051
2097
|
return recording.returnValue;
|
|
2052
2098
|
}
|
|
2053
|
-
mergeSpanAttributes(spanRecord, {
|
|
2054
|
-
|
|
2099
|
+
mergeSpanAttributes(spanRecord, {
|
|
2100
|
+
"cache.status": "miss",
|
|
2101
|
+
...canStore ? {} : { "cache.stored": false }
|
|
2102
|
+
});
|
|
2103
|
+
} else if (ctx.mode === "use" && canStore) mergeSpanAttributes(spanRecord, {
|
|
2104
|
+
"cache.status": "miss",
|
|
2105
|
+
"cache.read": false
|
|
2106
|
+
});
|
|
2107
|
+
else if (ctx.mode === "refresh") mergeSpanAttributes(spanRecord, {
|
|
2108
|
+
"cache.status": "refresh",
|
|
2109
|
+
...canStore ? {} : { "cache.stored": false }
|
|
2110
|
+
});
|
|
2055
2111
|
else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
|
|
2056
2112
|
const frame = {
|
|
2057
2113
|
baseSpanIndex: scope.spans.length,
|
|
@@ -2067,7 +2123,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
2067
2123
|
}
|
|
2068
2124
|
appendSubSpanOps(scope, frame);
|
|
2069
2125
|
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
2070
|
-
if (
|
|
2126
|
+
if (canStore) {
|
|
2071
2127
|
const recording = {
|
|
2072
2128
|
returnValue: bodyResult,
|
|
2073
2129
|
finalAttributes: stripCacheAttributes(spanRecord.attributes),
|
|
@@ -2087,14 +2143,12 @@ async function traceSpanInternal(info, fn) {
|
|
|
2087
2143
|
spanName: info.name,
|
|
2088
2144
|
spanKind: info.kind,
|
|
2089
2145
|
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
2090
|
-
codeFingerprint: ctx.codeFingerprint,
|
|
2091
2146
|
recording: await serializeCacheRecording(recording)
|
|
2092
2147
|
};
|
|
2093
2148
|
await ctx.adapter.write(entry, {
|
|
2094
2149
|
rawKey: cacheOpts.key,
|
|
2095
2150
|
operationType: "span",
|
|
2096
|
-
operationName: info.name
|
|
2097
|
-
codeFingerprint: ctx.codeFingerprint
|
|
2151
|
+
operationName: info.name
|
|
2098
2152
|
});
|
|
2099
2153
|
}
|
|
2100
2154
|
return bodyResult;
|
|
@@ -2287,6 +2341,7 @@ const columnDefSchema = z.object({
|
|
|
2287
2341
|
passThreshold: z.number().optional(),
|
|
2288
2342
|
maxStars: z.number().int().min(2).optional(),
|
|
2289
2343
|
hideInTable: z.boolean().optional(),
|
|
2344
|
+
hideIfNoValue: z.boolean().optional(),
|
|
2290
2345
|
align: z.enum([
|
|
2291
2346
|
"left",
|
|
2292
2347
|
"center",
|
|
@@ -2430,6 +2485,10 @@ const traceCacheRefSchema = z.object({
|
|
|
2430
2485
|
namespace: z.string(),
|
|
2431
2486
|
key: z.string(),
|
|
2432
2487
|
status: cacheStatusSchema,
|
|
2488
|
+
/** Whether this ref attempted to read from cache. Defaults to true. */
|
|
2489
|
+
read: z.boolean().optional(),
|
|
2490
|
+
/** Whether this ref wrote a persisted cache entry. Defaults to true for misses/refreshes. */
|
|
2491
|
+
stored: z.boolean().optional(),
|
|
2433
2492
|
storedAt: z.string().optional(),
|
|
2434
2493
|
age: z.number().optional()
|
|
2435
2494
|
});
|
|
@@ -2442,7 +2501,6 @@ const cacheListItemSchema = z.object({
|
|
|
2442
2501
|
spanName: z.string().optional(),
|
|
2443
2502
|
spanKind: traceSpanKindSchema.optional(),
|
|
2444
2503
|
storedAt: z.string(),
|
|
2445
|
-
codeFingerprint: z.string(),
|
|
2446
2504
|
sizeBytes: z.number()
|
|
2447
2505
|
});
|
|
2448
2506
|
/** Zod schema for `SerializedCacheSpan`, defined lazily for recursion. */
|
|
@@ -2524,7 +2582,6 @@ const cacheEntrySchema = z.object({
|
|
|
2524
2582
|
spanName: z.string().optional(),
|
|
2525
2583
|
spanKind: traceSpanKindSchema.optional(),
|
|
2526
2584
|
storedAt: z.string(),
|
|
2527
|
-
codeFingerprint: z.string(),
|
|
2528
2585
|
recording: cacheRecordingSchema
|
|
2529
2586
|
});
|
|
2530
2587
|
/** Debug-only raw key metadata stored outside the reusable cache entry. */
|
|
@@ -2535,7 +2592,6 @@ const cacheDebugKeyEntrySchema = z.object({
|
|
|
2535
2592
|
operationType: cacheOperationTypeSchema,
|
|
2536
2593
|
operationName: z.string(),
|
|
2537
2594
|
storedAt: z.string(),
|
|
2538
|
-
codeFingerprint: z.string(),
|
|
2539
2595
|
rawKey: z.unknown()
|
|
2540
2596
|
});
|
|
2541
2597
|
/** Cache lookup response with optional debug-only raw key data. */
|
|
@@ -2627,6 +2683,11 @@ const evalChartTooltipExtraSchema = z.discriminatedUnion("source", [z.object({
|
|
|
2627
2683
|
const evalChartConfigSchema = z.object({
|
|
2628
2684
|
/** Optional heading shown above the chart frame in the UI. */
|
|
2629
2685
|
heading: z.string().optional(),
|
|
2686
|
+
/**
|
|
2687
|
+
* Hide this chart in the UI when none of its metrics has a numeric value in
|
|
2688
|
+
* the rendered history window.
|
|
2689
|
+
*/
|
|
2690
|
+
hideIfNoValue: z.boolean().optional(),
|
|
2630
2691
|
type: evalChartTypeSchema,
|
|
2631
2692
|
/** At least one series must be declared. */
|
|
2632
2693
|
metrics: z.array(evalChartMetricSchema).min(1),
|
|
@@ -2667,17 +2728,31 @@ const evalStatAggregateSchema = z.enum([
|
|
|
2667
2728
|
"sum",
|
|
2668
2729
|
"last"
|
|
2669
2730
|
]);
|
|
2731
|
+
const hideIfNoValueShape = {
|
|
2732
|
+
/**
|
|
2733
|
+
* Hide this stat in the UI when the current run has no displayable value.
|
|
2734
|
+
* Missing values, `null`, and empty strings count as no value; `0` remains
|
|
2735
|
+
* visible.
|
|
2736
|
+
*/
|
|
2737
|
+
hideIfNoValue: z.boolean().optional() };
|
|
2670
2738
|
/**
|
|
2671
2739
|
* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
|
|
2672
2740
|
* `column` aggregates a score or numeric output column across the latest run.
|
|
2673
2741
|
*/
|
|
2674
2742
|
const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
2675
|
-
z.object({
|
|
2743
|
+
z.object({
|
|
2744
|
+
kind: z.literal("cases"),
|
|
2745
|
+
...hideIfNoValueShape
|
|
2746
|
+
}),
|
|
2676
2747
|
z.object({
|
|
2677
2748
|
kind: z.literal("passRate"),
|
|
2678
|
-
accent: z.boolean().optional()
|
|
2749
|
+
accent: z.boolean().optional(),
|
|
2750
|
+
...hideIfNoValueShape
|
|
2751
|
+
}),
|
|
2752
|
+
z.object({
|
|
2753
|
+
kind: z.literal("duration"),
|
|
2754
|
+
...hideIfNoValueShape
|
|
2679
2755
|
}),
|
|
2680
|
-
z.object({ kind: z.literal("duration") }),
|
|
2681
2756
|
z.object({
|
|
2682
2757
|
kind: z.literal("column"),
|
|
2683
2758
|
key: z.string(),
|
|
@@ -2686,7 +2761,8 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
2686
2761
|
format: columnFormatSchema.optional(),
|
|
2687
2762
|
/** Number presentation options applied when `format: 'number'`. */
|
|
2688
2763
|
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
2689
|
-
accent: z.boolean().optional()
|
|
2764
|
+
accent: z.boolean().optional(),
|
|
2765
|
+
...hideIfNoValueShape
|
|
2690
2766
|
})
|
|
2691
2767
|
]);
|
|
2692
2768
|
/** Ordered list of stats rendered in the EvalCard stats row. */
|
|
@@ -2767,6 +2843,12 @@ const caseRowSchema = z.object({
|
|
|
2767
2843
|
});
|
|
2768
2844
|
/** Structured assertion failure metadata captured for one case run. */
|
|
2769
2845
|
const assertionFailureSchema = z.object({
|
|
2846
|
+
/**
|
|
2847
|
+
* Error class or category label rendered alongside the message (e.g.
|
|
2848
|
+
* `EvalAssertionError`, `OutputsSchemaError`). Optional for legacy entries
|
|
2849
|
+
* and synthetic failures without an originating Error.
|
|
2850
|
+
*/
|
|
2851
|
+
name: z.string().optional(),
|
|
2770
2852
|
/** Human-readable assertion failure message shown in the UI and artifacts. */
|
|
2771
2853
|
message: z.string(),
|
|
2772
2854
|
/** Stack trace captured from the originating error when available. */
|
|
@@ -2915,6 +2997,25 @@ const defaultConfigKeySchema = z.enum([
|
|
|
2915
2997
|
]);
|
|
2916
2998
|
/** Removal config for built-in eval-level outputs and UI metadata. */
|
|
2917
2999
|
const removeDefaultConfigSchema = z.union([z.literal(true), z.array(defaultConfigKeySchema)]);
|
|
3000
|
+
const evalDeriveValueFnSchema = z.custom((value) => typeof value === "function", { message: "Expected a derive output function" });
|
|
3001
|
+
/** Schema for keyed or object-returning trace-derived output config. */
|
|
3002
|
+
const evalDeriveConfigSchema = z.union([z.custom((value) => typeof value === "function", { message: "Expected a deriveFromTracing function" }), z.record(z.string().min(1), evalDeriveValueFnSchema)]);
|
|
3003
|
+
/** Schema for UI overrides on derived or scored columns. */
|
|
3004
|
+
const evalColumnOverrideSchema = z.object({
|
|
3005
|
+
label: z.string().optional(),
|
|
3006
|
+
format: columnFormatSchema.optional(),
|
|
3007
|
+
numberFormat: numberDisplayOptionsSchema.optional(),
|
|
3008
|
+
hideInTable: z.boolean().optional(),
|
|
3009
|
+
hideIfNoValue: z.boolean().optional(),
|
|
3010
|
+
align: z.enum([
|
|
3011
|
+
"left",
|
|
3012
|
+
"center",
|
|
3013
|
+
"right"
|
|
3014
|
+
]).optional(),
|
|
3015
|
+
maxStars: z.number().int().min(2).optional()
|
|
3016
|
+
});
|
|
3017
|
+
/** Schema for column override maps keyed by output or score field name. */
|
|
3018
|
+
const evalColumnsSchema = z.record(z.string(), evalColumnOverrideSchema);
|
|
2918
3019
|
/** Render formats supported by an LLM-call metric in the UI. */
|
|
2919
3020
|
const llmCallMetricFormatSchema = z.enum([
|
|
2920
3021
|
"string",
|
|
@@ -2992,18 +3093,9 @@ const apiCallMetricSchema = z.object({
|
|
|
2992
3093
|
placements: z.array(apiCallMetricPlacementSchema).nonempty().optional()
|
|
2993
3094
|
});
|
|
2994
3095
|
/**
|
|
2995
|
-
* Schema for
|
|
2996
|
-
* from token counts.
|
|
3096
|
+
* Schema for pricing rates used to derive LLM-call costs from token counts.
|
|
2997
3097
|
*/
|
|
2998
|
-
const
|
|
2999
|
-
/** Exact model name read from the configured `attributes.model` path. */
|
|
3000
|
-
model: z.string().min(1),
|
|
3001
|
-
/**
|
|
3002
|
-
* Optional provider discriminator read from `attributes.provider`. When set,
|
|
3003
|
-
* the entry only applies to calls from that provider; provider-specific
|
|
3004
|
-
* entries take precedence over generic entries for the same model.
|
|
3005
|
-
*/
|
|
3006
|
-
provider: z.string().min(1).optional(),
|
|
3098
|
+
const llmCallPricingRateSchema = z.object({
|
|
3007
3099
|
/** USD per one million non-cached input tokens. */
|
|
3008
3100
|
inputUsdPerMillion: z.number().nonnegative().optional(),
|
|
3009
3101
|
/** USD per one million output tokens. */
|
|
@@ -3017,6 +3109,23 @@ const llmCallPricingSchema = z.object({
|
|
|
3017
3109
|
/** USD per one million reasoning tokens when reported separately. */
|
|
3018
3110
|
reasoningUsdPerMillion: z.number().nonnegative().optional()
|
|
3019
3111
|
});
|
|
3112
|
+
/**
|
|
3113
|
+
* Schema for one model's pricing config. The object key is the exact model
|
|
3114
|
+
* name. Use `providers` when a model has provider-specific rates in addition
|
|
3115
|
+
* to, or instead of, generic model rates.
|
|
3116
|
+
*/
|
|
3117
|
+
const llmCallPricingSchema = llmCallPricingRateSchema.extend({
|
|
3118
|
+
/**
|
|
3119
|
+
* Optional provider discriminator read from `attributes.provider`. When set,
|
|
3120
|
+
* the top-level entry only applies to calls from that provider.
|
|
3121
|
+
*/
|
|
3122
|
+
provider: z.string().min(1).optional(),
|
|
3123
|
+
/**
|
|
3124
|
+
* Provider-specific pricing for the model. Provider entries take precedence
|
|
3125
|
+
* over generic rates for the same model.
|
|
3126
|
+
*/
|
|
3127
|
+
providers: z.record(z.string().min(1), llmCallPricingRateSchema).optional()
|
|
3128
|
+
});
|
|
3020
3129
|
/** Schema for the global LLM calls config block in `agent-evals.config.ts`. */
|
|
3021
3130
|
const llmCallsConfigSchema = z.object({
|
|
3022
3131
|
/** Span kinds treated as LLM calls. Defaults to `['llm']`. */
|
|
@@ -3053,10 +3162,10 @@ const llmCallsConfigSchema = z.object({
|
|
|
3053
3162
|
*/
|
|
3054
3163
|
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
3055
3164
|
/**
|
|
3056
|
-
* Model
|
|
3057
|
-
*
|
|
3165
|
+
* Model-keyed pricing registry used to calculate LLM-call costs from token
|
|
3166
|
+
* counts. Built-in LLM cost fields are only derived from this registry.
|
|
3058
3167
|
*/
|
|
3059
|
-
pricing: z.
|
|
3168
|
+
pricing: z.record(z.string().min(1), llmCallPricingSchema).optional(),
|
|
3060
3169
|
/** Custom user-defined metrics surfaced on each LLM call. */
|
|
3061
3170
|
metrics: z.array(llmCallMetricSchema).optional()
|
|
3062
3171
|
});
|
|
@@ -3172,6 +3281,33 @@ function resolveApiCallMetric(metric) {
|
|
|
3172
3281
|
placements: metric.placements ? [...metric.placements] : ["body"]
|
|
3173
3282
|
};
|
|
3174
3283
|
}
|
|
3284
|
+
function hasPricingRates(pricing) {
|
|
3285
|
+
return pricing.inputUsdPerMillion !== void 0 || pricing.outputUsdPerMillion !== void 0 || pricing.cachedInputUsdPerMillion !== void 0 || pricing.cacheCreationInputUsdPerMillion !== void 0 || pricing.cacheCreationInput1hUsdPerMillion !== void 0 || pricing.reasoningUsdPerMillion !== void 0;
|
|
3286
|
+
}
|
|
3287
|
+
function copyPricingRates(pricing) {
|
|
3288
|
+
return {
|
|
3289
|
+
inputUsdPerMillion: pricing.inputUsdPerMillion,
|
|
3290
|
+
outputUsdPerMillion: pricing.outputUsdPerMillion,
|
|
3291
|
+
cachedInputUsdPerMillion: pricing.cachedInputUsdPerMillion,
|
|
3292
|
+
cacheCreationInputUsdPerMillion: pricing.cacheCreationInputUsdPerMillion,
|
|
3293
|
+
cacheCreationInput1hUsdPerMillion: pricing.cacheCreationInput1hUsdPerMillion,
|
|
3294
|
+
reasoningUsdPerMillion: pricing.reasoningUsdPerMillion
|
|
3295
|
+
};
|
|
3296
|
+
}
|
|
3297
|
+
/**
 * Flatten one model's pricing config into a list of pricing entries.
 *
 * Produces an entry for the top-level rates (tagged with `pricing.provider`)
 * only when at least one top-level rate is defined, followed by one entry per
 * key of `pricing.providers`, in that object's entry order.
 *
 * @param model Model name stamped on every returned entry.
 * @param pricing Pricing config for that model.
 * @returns Array of `{ model, provider, ...rates }` entries (possibly empty).
 */
function resolveLlmCallPricingEntries(model, pricing) {
	const topLevelEntries = hasPricingRates(pricing) ? [{
		model,
		provider: pricing.provider,
		...copyPricingRates(pricing)
	}] : [];
	const providerEntries = Object.entries(pricing.providers ?? {}).map(([provider, providerPricing]) => ({
		model,
		provider,
		...copyPricingRates(providerPricing)
	}));
	return [...topLevelEntries, ...providerEntries];
}
|
|
3175
3311
|
/**
|
|
3176
3312
|
* Resolve the user-authored LLM-calls config to a fully-defaulted shape used
|
|
3177
3313
|
* by the UI to derive the LLM calls tab.
|
|
@@ -3182,7 +3318,7 @@ function resolveApiCallMetric(metric) {
|
|
|
3182
3318
|
* - Missing `metrics[].format` defaults to `'string'`.
|
|
3183
3319
|
* - Missing `metrics[].placements` defaults to `['body']`.
|
|
3184
3320
|
* - Missing `pricing` defaults to an empty registry; built-in costs are only
|
|
3185
|
-
* derived from configured pricing and token counts.
|
|
3321
|
+
* derived from configured model-keyed pricing and token counts.
|
|
3186
3322
|
*/
|
|
3187
3323
|
function resolveLlmCallsConfig(input) {
|
|
3188
3324
|
return {
|
|
@@ -3193,16 +3329,7 @@ function resolveLlmCallsConfig(input) {
|
|
|
3193
3329
|
},
|
|
3194
3330
|
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
3195
3331
|
metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
|
|
3196
|
-
pricing: (input?.pricing ??
|
|
3197
|
-
model: p.model,
|
|
3198
|
-
provider: p.provider,
|
|
3199
|
-
inputUsdPerMillion: p.inputUsdPerMillion,
|
|
3200
|
-
outputUsdPerMillion: p.outputUsdPerMillion,
|
|
3201
|
-
cachedInputUsdPerMillion: p.cachedInputUsdPerMillion,
|
|
3202
|
-
cacheCreationInputUsdPerMillion: p.cacheCreationInputUsdPerMillion,
|
|
3203
|
-
cacheCreationInput1hUsdPerMillion: p.cacheCreationInput1hUsdPerMillion,
|
|
3204
|
-
reasoningUsdPerMillion: p.reasoningUsdPerMillion
|
|
3205
|
-
}))
|
|
3332
|
+
pricing: Object.entries(input?.pricing ?? {}).flatMap(([model, pricing]) => resolveLlmCallPricingEntries(model, pricing))
|
|
3206
3333
|
};
|
|
3207
3334
|
}
|
|
3208
3335
|
/**
|
|
@@ -3236,6 +3363,9 @@ const agentEvalsConfigSchema = z.object({
|
|
|
3236
3363
|
staleAfterDays: z.number().optional(),
|
|
3237
3364
|
allowCliRunAll: z.boolean().optional(),
|
|
3238
3365
|
traceDisplay: traceDisplayInputConfigSchema.optional(),
|
|
3366
|
+
columns: evalColumnsSchema.optional(),
|
|
3367
|
+
deriveFromTracing: evalDeriveConfigSchema.optional(),
|
|
3368
|
+
stats: evalStatsConfigSchema.optional(),
|
|
3239
3369
|
llmCalls: llmCallsConfigSchema.optional(),
|
|
3240
3370
|
removeDefaultConfig: removeDefaultConfigSchema.optional(),
|
|
3241
3371
|
apiCalls: apiCallsConfigSchema.optional(),
|
|
@@ -3888,6 +4018,11 @@ function readNumber(attributes, key) {
|
|
|
3888
4018
|
const value = attributes[key];
|
|
3889
4019
|
return typeof value === "number" && Number.isFinite(value) ? value : void 0;
|
|
3890
4020
|
}
|
|
4021
|
+
/**
 * Read a boolean attribute by key.
 *
 * @param attributes Candidate attributes container; ignored unless
 *   `isRecord$2` accepts it.
 * @param key Attribute name to look up.
 * @returns The stored value when it is strictly a boolean, otherwise `undefined`.
 */
function readBoolean(attributes, key) {
	const candidate = isRecord$2(attributes) ? attributes[key] : void 0;
	if (typeof candidate !== "boolean") return void 0;
	return candidate;
}
|
|
3891
4026
|
function readArray(attributes, key) {
|
|
3892
4027
|
if (!isRecord$2(attributes)) return [];
|
|
3893
4028
|
const value = attributes[key];
|
|
@@ -3916,12 +4051,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3916
4051
|
const namespace = readString(span.attributes, "cache.namespace");
|
|
3917
4052
|
if (key !== void 0 && namespace !== void 0) {
|
|
3918
4053
|
const isHit = status === "hit";
|
|
4054
|
+
const stored = isHit ? true : readBoolean(span.attributes, "cache.stored") !== false;
|
|
3919
4055
|
entries.push({
|
|
3920
4056
|
id: span.id,
|
|
3921
4057
|
source: "span",
|
|
3922
4058
|
origin: "span",
|
|
3923
|
-
action: isHit ? "hit" : "added",
|
|
4059
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3924
4060
|
status,
|
|
4061
|
+
stored,
|
|
3925
4062
|
name: span.name,
|
|
3926
4063
|
namespace,
|
|
3927
4064
|
key,
|
|
@@ -3938,12 +4075,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3938
4075
|
const ref = parsed.data;
|
|
3939
4076
|
if (ref.status === "bypass") continue;
|
|
3940
4077
|
const isHit = ref.status === "hit";
|
|
4078
|
+
const stored = isHit ? true : ref.stored !== false;
|
|
3941
4079
|
entries.push({
|
|
3942
4080
|
id: `${span.id}:value:${String(index)}`,
|
|
3943
4081
|
source: "value",
|
|
3944
4082
|
origin: "span",
|
|
3945
|
-
action: isHit ? "hit" : "added",
|
|
4083
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3946
4084
|
status: ref.status,
|
|
4085
|
+
stored,
|
|
3947
4086
|
name: ref.name,
|
|
3948
4087
|
namespace: ref.namespace,
|
|
3949
4088
|
key: ref.key,
|
|
@@ -3956,12 +4095,14 @@ function extractCacheEntries(spans, caseCacheRefs) {
|
|
|
3956
4095
|
for (const [index, ref] of caseCacheRefs.entries()) {
|
|
3957
4096
|
if (ref.status === "bypass") continue;
|
|
3958
4097
|
const isHit = ref.status === "hit";
|
|
4098
|
+
const stored = isHit ? true : ref.stored !== false;
|
|
3959
4099
|
entries.push({
|
|
3960
4100
|
id: `case:value:${String(index)}`,
|
|
3961
4101
|
source: "value",
|
|
3962
4102
|
origin: "caseRoot",
|
|
3963
|
-
action: isHit ? "hit" : "added",
|
|
4103
|
+
action: isHit ? "hit" : stored ? "added" : "notStored",
|
|
3964
4104
|
status: ref.status,
|
|
4105
|
+
stored,
|
|
3965
4106
|
name: ref.name,
|
|
3966
4107
|
namespace: ref.namespace,
|
|
3967
4108
|
key: ref.key,
|
|
@@ -4033,7 +4174,8 @@ const updateManualScoreRequestSchema = z.object({ value: z.number().min(0).max(1
|
|
|
4033
4174
|
//#endregion
|
|
4034
4175
|
//#region ../runner/src/cacheStore.ts
|
|
4035
4176
|
const defaultMaxEntriesPerNamespace = 100;
|
|
4036
|
-
const cacheSerializationMarker = "
|
|
4177
|
+
const cacheSerializationMarker = "__aecs";
|
|
4178
|
+
const legacyCacheSerializationMarker = "__agentEvalsCacheSerialization";
|
|
4037
4179
|
const supportedCacheSerializationVersion = "json-safe-v1";
|
|
4038
4180
|
/**
|
|
4039
4181
|
* Create a filesystem-backed cache adapter rooted at `<workspaceRoot>/<dir>`.
|
|
@@ -4118,7 +4260,6 @@ function createFsCacheStore(options) {
|
|
|
4118
4260
|
spanName: entry.spanName,
|
|
4119
4261
|
spanKind: entry.spanKind,
|
|
4120
4262
|
storedAt: entry.storedAt,
|
|
4121
|
-
codeFingerprint: entry.codeFingerprint,
|
|
4122
4263
|
sizeBytes: Buffer.byteLength(JSON.stringify(entry), "utf8")
|
|
4123
4264
|
});
|
|
4124
4265
|
}
|
|
@@ -4247,7 +4388,7 @@ async function readCacheFilePath(filePath) {
|
|
|
4247
4388
|
/**
 * Recursively check that a value carries no unsupported serialization marker.
 *
 * Arrays are checked element by element. Values rejected by `isRecordLike`
 * are accepted as-is. A record fails when it owns either the current or the
 * legacy marker key with a value other than the supported version; otherwise
 * every one of its property values is checked in turn.
 *
 * @param value Arbitrary value to inspect.
 * @returns `false` as soon as an unsupported marker is found, else `true`.
 */
function usesSupportedCacheSerialization(value) {
	if (Array.isArray(value)) return value.every((item) => usesSupportedCacheSerialization(item));
	if (!isRecordLike(value)) return true;
	const markerKeys = [cacheSerializationMarker, legacyCacheSerializationMarker];
	const hasUnsupportedMarker = markerKeys.some((marker) => Object.hasOwn(value, marker) && value[marker] !== supportedCacheSerializationVersion);
	if (hasUnsupportedMarker) return false;
	return Object.values(value).every((child) => usesSupportedCacheSerialization(child));
}
|
|
4253
4394
|
async function writeOrRemoveCacheFile(cacheDir, cacheFile) {
|
|
@@ -4291,7 +4432,6 @@ async function writeDebugKeyEntry(params) {
|
|
|
4291
4432
|
operationType: debugKey.operationType,
|
|
4292
4433
|
operationName: debugKey.operationName,
|
|
4293
4434
|
storedAt: entry.storedAt,
|
|
4294
|
-
codeFingerprint: debugKey.codeFingerprint,
|
|
4295
4435
|
rawKey: debugKey.rawKey
|
|
4296
4436
|
};
|
|
4297
4437
|
await writeDebugKeyFile(debugDir, {
|
|
@@ -4507,6 +4647,7 @@ function getScoreOverride(def) {
|
|
|
4507
4647
|
format: def.format,
|
|
4508
4648
|
numberFormat: def.numberFormat,
|
|
4509
4649
|
hideInTable: def.hideInTable,
|
|
4650
|
+
hideIfNoValue: def.hideIfNoValue,
|
|
4510
4651
|
align: def.align,
|
|
4511
4652
|
maxStars: def.maxStars
|
|
4512
4653
|
};
|
|
@@ -4519,6 +4660,7 @@ function mergeOverrides(base, override) {
|
|
|
4519
4660
|
format: override.format ?? base.format,
|
|
4520
4661
|
numberFormat: override.numberFormat ?? base.numberFormat,
|
|
4521
4662
|
hideInTable: override.hideInTable ?? base.hideInTable,
|
|
4663
|
+
hideIfNoValue: override.hideIfNoValue ?? base.hideIfNoValue,
|
|
4522
4664
|
align: override.align ?? base.align,
|
|
4523
4665
|
maxStars: override.maxStars ?? base.maxStars
|
|
4524
4666
|
};
|
|
@@ -4633,6 +4775,7 @@ function createColumnDef(params) {
|
|
|
4633
4775
|
if (override?.numberFormat !== void 0) def.numberFormat = override.numberFormat;
|
|
4634
4776
|
if (override?.maxStars !== void 0) def.maxStars = override.maxStars;
|
|
4635
4777
|
if (override?.hideInTable !== void 0) def.hideInTable = override.hideInTable;
|
|
4778
|
+
if (override?.hideIfNoValue !== void 0) def.hideIfNoValue = override.hideIfNoValue;
|
|
4636
4779
|
if (override?.align !== void 0) def.align = override.align;
|
|
4637
4780
|
if (!isScore) return def;
|
|
4638
4781
|
def.isScore = true;
|
|
@@ -4717,60 +4860,70 @@ const DEFAULT_COLUMNS = {
|
|
|
4717
4860
|
label: "API Calls",
|
|
4718
4861
|
format: "number",
|
|
4719
4862
|
numberFormat: countNumberFormat,
|
|
4720
|
-
align: "right"
|
|
4863
|
+
align: "right",
|
|
4864
|
+
hideIfNoValue: true
|
|
4721
4865
|
},
|
|
4722
4866
|
costUsd: {
|
|
4723
4867
|
label: "Cost",
|
|
4724
4868
|
format: "number",
|
|
4725
4869
|
numberFormat: costNumberFormat,
|
|
4726
|
-
align: "right"
|
|
4870
|
+
align: "right",
|
|
4871
|
+
hideIfNoValue: true
|
|
4727
4872
|
},
|
|
4728
4873
|
llmTurns: {
|
|
4729
4874
|
label: "LLM Turns",
|
|
4730
4875
|
format: "number",
|
|
4731
4876
|
numberFormat: countNumberFormat,
|
|
4732
|
-
align: "right"
|
|
4877
|
+
align: "right",
|
|
4878
|
+
hideIfNoValue: true
|
|
4733
4879
|
},
|
|
4734
4880
|
inputTokens: {
|
|
4735
4881
|
label: "Input Tokens",
|
|
4736
4882
|
format: "number",
|
|
4737
4883
|
numberFormat: tokenNumberFormat,
|
|
4738
|
-
align: "right"
|
|
4884
|
+
align: "right",
|
|
4885
|
+
hideIfNoValue: true
|
|
4739
4886
|
},
|
|
4740
4887
|
outputTokens: {
|
|
4741
4888
|
label: "Output Tokens",
|
|
4742
4889
|
format: "number",
|
|
4743
4890
|
numberFormat: tokenNumberFormat,
|
|
4744
|
-
align: "right"
|
|
4891
|
+
align: "right",
|
|
4892
|
+
hideIfNoValue: true
|
|
4745
4893
|
},
|
|
4746
4894
|
totalTokens: {
|
|
4747
4895
|
label: "Total Tokens",
|
|
4748
4896
|
format: "number",
|
|
4749
4897
|
numberFormat: tokenNumberFormat,
|
|
4750
|
-
align: "right"
|
|
4898
|
+
align: "right",
|
|
4899
|
+
hideIfNoValue: true
|
|
4751
4900
|
},
|
|
4752
4901
|
cachedInputTokens: {
|
|
4753
4902
|
label: "Cached Input Tokens",
|
|
4754
4903
|
format: "number",
|
|
4755
4904
|
numberFormat: tokenNumberFormat,
|
|
4756
|
-
align: "right"
|
|
4905
|
+
align: "right",
|
|
4906
|
+
hideIfNoValue: true
|
|
4757
4907
|
},
|
|
4758
4908
|
cacheCreationInputTokens: {
|
|
4759
4909
|
label: "Cache Write Tokens",
|
|
4760
4910
|
format: "number",
|
|
4761
4911
|
numberFormat: tokenNumberFormat,
|
|
4762
|
-
align: "right"
|
|
4912
|
+
align: "right",
|
|
4913
|
+
hideIfNoValue: true
|
|
4763
4914
|
},
|
|
4764
4915
|
reasoningTokens: {
|
|
4765
4916
|
label: "Reasoning Tokens",
|
|
4766
4917
|
format: "number",
|
|
4767
4918
|
numberFormat: tokenNumberFormat,
|
|
4768
|
-
align: "right"
|
|
4919
|
+
align: "right",
|
|
4920
|
+
hideIfNoValue: true
|
|
4769
4921
|
},
|
|
4770
4922
|
llmDurationMs: {
|
|
4771
4923
|
label: "LLM Duration",
|
|
4772
4924
|
format: "duration",
|
|
4773
|
-
align: "right"
|
|
4925
|
+
align: "right",
|
|
4926
|
+
hideIfNoValue: true
|
|
4774
4927
|
}
|
|
4775
4928
|
};
|
|
4776
4929
|
function resolveRemovedKeys(globalRemove, evalRemove) {
|
|
@@ -4783,9 +4936,16 @@ function getActiveDefaultConfigKeys(params) {
|
|
|
4783
4936
|
}
|
|
4784
4937
|
/**
 * Merge built-in default columns with global and eval-level column overrides.
 *
 * Precedence, lowest to highest: active built-in defaults, then
 * `params.globalColumns`, then `params.columns`.
 *
 * @param params Options forwarded to `getActiveDefaultConfigKeys`, plus the
 *   optional `globalColumns` and `columns` override maps.
 * @returns The merged column map. When no default key is active and the
 *   merge is empty, `undefined` is returned instead of an empty object.
 */
function mergeDefaultColumns(params) {
	const activeKeys = getActiveDefaultConfigKeys(params);
	const builtinColumns = Object.fromEntries(activeKeys.map((key) => [key, DEFAULT_COLUMNS[key]]));
	const merged = {
		...builtinColumns,
		...params.globalColumns,
		...params.columns
	};
	if (activeKeys.length > 0) return merged;
	// With no active defaults, an empty merge means "nothing configured".
	return Object.keys(merged).length > 0 ? merged : void 0;
}
|
|
@@ -4797,30 +4957,38 @@ function appendDefaultStats(params) {
|
|
|
4797
4957
|
key: "apiCalls",
|
|
4798
4958
|
label: "API Calls",
|
|
4799
4959
|
aggregate: "avg",
|
|
4800
|
-
numberFormat: countNumberFormat
|
|
4960
|
+
numberFormat: countNumberFormat,
|
|
4961
|
+
hideIfNoValue: true
|
|
4801
4962
|
});
|
|
4802
4963
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4803
4964
|
kind: "column",
|
|
4804
4965
|
key: "costUsd",
|
|
4805
4966
|
label: "LLM Cost",
|
|
4806
4967
|
aggregate: "avg",
|
|
4807
|
-
numberFormat: costNumberFormat
|
|
4968
|
+
numberFormat: costNumberFormat,
|
|
4969
|
+
hideIfNoValue: true
|
|
4808
4970
|
});
|
|
4809
4971
|
if (activeKeys.has("totalTokens")) defaults.push({
|
|
4810
4972
|
kind: "column",
|
|
4811
4973
|
key: "totalTokens",
|
|
4812
4974
|
label: "Tokens",
|
|
4813
4975
|
aggregate: "avg",
|
|
4814
|
-
numberFormat: tokenNumberFormat
|
|
4976
|
+
numberFormat: tokenNumberFormat,
|
|
4977
|
+
hideIfNoValue: true
|
|
4815
4978
|
});
|
|
4816
4979
|
if (activeKeys.has("llmTurns")) defaults.push({
|
|
4817
4980
|
kind: "column",
|
|
4818
4981
|
key: "llmTurns",
|
|
4819
4982
|
label: "LLM Turns",
|
|
4820
4983
|
aggregate: "avg",
|
|
4821
|
-
numberFormat: countNumberFormat
|
|
4984
|
+
numberFormat: countNumberFormat,
|
|
4985
|
+
hideIfNoValue: true
|
|
4822
4986
|
});
|
|
4823
|
-
const merged = [
|
|
4987
|
+
const merged = [
|
|
4988
|
+
...params.globalStats ?? [],
|
|
4989
|
+
...params.stats ?? [],
|
|
4990
|
+
...defaults
|
|
4991
|
+
];
|
|
4824
4992
|
return merged.length > 0 ? merged : void 0;
|
|
4825
4993
|
}
|
|
4826
4994
|
function appendDefaultCharts(params) {
|
|
@@ -4828,6 +4996,7 @@ function appendDefaultCharts(params) {
|
|
|
4828
4996
|
const defaults = [];
|
|
4829
4997
|
if (activeKeys.has("costUsd")) defaults.push({
|
|
4830
4998
|
heading: "LLM Cost",
|
|
4999
|
+
hideIfNoValue: true,
|
|
4831
5000
|
type: "area",
|
|
4832
5001
|
metrics: [{
|
|
4833
5002
|
source: "column",
|
|
@@ -4869,6 +5038,7 @@ function appendDefaultCharts(params) {
|
|
|
4869
5038
|
].filter((metric) => metric !== null);
|
|
4870
5039
|
if (tokenMetrics.length > 0) defaults.push({
|
|
4871
5040
|
heading: "LLM Tokens",
|
|
5041
|
+
hideIfNoValue: true,
|
|
4872
5042
|
type: "bar",
|
|
4873
5043
|
metrics: tokenMetrics,
|
|
4874
5044
|
tooltipExtras: activeKeys.has("totalTokens") ? [{
|
|
@@ -4885,11 +5055,13 @@ function resolveEvalDefaultConfig(params) {
|
|
|
4885
5055
|
const evalRemove = params.evalDef.removeDefaultConfig;
|
|
4886
5056
|
return {
|
|
4887
5057
|
columns: mergeDefaultColumns({
|
|
5058
|
+
globalColumns: params.globalColumns,
|
|
4888
5059
|
columns: params.evalDef.columns,
|
|
4889
5060
|
globalRemove: params.globalRemove,
|
|
4890
5061
|
evalRemove
|
|
4891
5062
|
}),
|
|
4892
5063
|
stats: appendDefaultStats({
|
|
5064
|
+
globalStats: params.globalStats,
|
|
4893
5065
|
stats: params.evalDef.stats,
|
|
4894
5066
|
globalRemove: params.globalRemove,
|
|
4895
5067
|
evalRemove
|
|
@@ -5227,6 +5399,65 @@ function isFile(value) {
|
|
|
5227
5399
|
return value instanceof File;
|
|
5228
5400
|
}
|
|
5229
5401
|
//#endregion
|
|
5402
|
+
//#region ../runner/src/traceDisplay.ts
|
|
5403
|
+
/**
 * Check whether a value is a non-null object.
 *
 * Arrays qualify as well, since their `typeof` is `"object"`.
 *
 * @param value Value to test.
 * @returns `true` for any non-null value whose `typeof` is `"object"`.
 */
function isRecord$1(value) {
	if (value === null) return false;
	return typeof value === "object";
}
|
|
5406
|
+
/**
 * Return a copy of `value` with `attributeValue` written at a dot-separated path.
 *
 * The root and every object along the path are shallow-copied, so the input
 * is never mutated. A segment that is missing or does not hold an object is
 * replaced with a fresh object.
 *
 * Segments are treated strictly as own data keys: they are read with
 * `Object.hasOwn` and written with `Object.defineProperty`. A segment such as
 * `__proto__` therefore becomes an ordinary own property instead of going
 * through the prototype setter, which would leave the value on the prototype
 * chain and drop it from the serialized attributes.
 *
 * @param value Existing attributes object, or `undefined` to start empty.
 * @param path Dot-separated path of the attribute to set.
 * @param attributeValue Value stored at the final path segment.
 * @returns A new attributes object containing the merged value.
 */
function mergeNestedAttribute(value, path, attributeValue) {
	const setOwn = (target, key, entry) => {
		Object.defineProperty(target, key, {
			value: entry,
			writable: true,
			enumerable: true,
			configurable: true
		});
	};
	const root = value === void 0 ? {} : { ...value };
	const parts = path.split(".");
	// `split` always yields at least one element, so a leaf key always exists.
	const leafKey = parts.pop();
	let current = root;
	for (const part of parts) {
		const nextValue = Object.hasOwn(current, part) ? current[part] : void 0;
		const nextRecord = typeof nextValue === "object" && nextValue !== null ? { ...nextValue } : {};
		setOwn(current, part, nextRecord);
		current = nextRecord;
	}
	setOwn(current, leafKey, attributeValue);
	return root;
}
|
|
5422
|
+
/**
 * Resolve the trace-display attribute config and apply attribute transforms.
 *
 * Global attributes are registered first and eval-level attributes second,
 * keyed by `key ?? path`, so an eval-level entry replaces a global entry
 * with the same identifier. For an attribute with a `transform`, the
 * transformed value is stored under `__display.<key ?? path>` on a copy of
 * each span and the resolved attribute points at that path. Input spans are
 * not mutated.
 *
 * @param spans Trace spans to present.
 * @param globalTraceDisplay Optional global trace-display config.
 * @param evalTraceDisplay Optional eval-level trace-display config.
 * @returns `{ trace, traceDisplay }` with the copied spans and resolved attributes.
 */
function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
	const attributeById = /* @__PURE__ */ new Map();
	const configuredAttributes = [...globalTraceDisplay?.attributes ?? [], ...evalTraceDisplay?.attributes ?? []];
	for (const attribute of configuredAttributes) attributeById.set(attribute.key ?? attribute.path, attribute);
	const displayAttributes = [];
	const trace = spans.map((span) => {
		const attributes = span.attributes === void 0 ? void 0 : { ...span.attributes };
		return {
			...span,
			attributes
		};
	});
	for (const attribute of attributeById.values()) {
		const hasTransform = Boolean(attribute.transform);
		const displayPath = hasTransform ? `__display.${attribute.key ?? attribute.path}` : attribute.path;
		displayAttributes.push({
			key: attribute.key,
			path: displayPath,
			label: attribute.label,
			format: attribute.format,
			numberFormat: attribute.numberFormat,
			placements: attribute.placements,
			scope: attribute.scope,
			mode: attribute.mode
		});
		if (hasTransform) for (const span of trace) {
			const sourceValue = getNestedAttribute(span.attributes, attribute.path);
			if (sourceValue !== void 0) {
				// Called as a method so the transform keeps `attribute` as its receiver.
				const transformedValue = attribute.transform({
					value: sourceValue,
					span
				});
				if (transformedValue !== void 0) span.attributes = mergeNestedAttribute(span.attributes, displayPath, transformedValue);
			}
		}
	}
	return {
		trace,
		traceDisplay: { attributes: displayAttributes }
	};
}
|
|
5460
|
+
//#endregion
|
|
5230
5461
|
//#region ../runner/src/runMaintenance.ts
|
|
5231
5462
|
async function persistRunState(runState) {
|
|
5232
5463
|
await writeFile(join(runState.runDir, "summary.json"), JSON.stringify(runState.summary, null, 2));
|
|
@@ -5551,65 +5782,6 @@ function stripTerminalControlCodes(value) {
|
|
|
5551
5782
|
return stripVTControlCharacters(value).replaceAll(orphanedAnsiSgrPattern, "");
|
|
5552
5783
|
}
|
|
5553
5784
|
//#endregion
|
|
5554
|
-
//#region ../runner/src/traceDisplay.ts
|
|
5555
|
-
function isRecord$1(value) {
|
|
5556
|
-
return typeof value === "object" && value !== null;
|
|
5557
|
-
}
|
|
5558
|
-
function mergeNestedAttribute(value, path, attributeValue) {
|
|
5559
|
-
const root = value === void 0 ? {} : { ...value };
|
|
5560
|
-
const parts = path.split(".");
|
|
5561
|
-
let current = root;
|
|
5562
|
-
for (const [index, part] of parts.entries()) {
|
|
5563
|
-
if (index === parts.length - 1) {
|
|
5564
|
-
current[part] = attributeValue;
|
|
5565
|
-
continue;
|
|
5566
|
-
}
|
|
5567
|
-
const nextValue = current[part];
|
|
5568
|
-
const nextRecord = isRecord$1(nextValue) ? { ...nextValue } : {};
|
|
5569
|
-
current[part] = nextRecord;
|
|
5570
|
-
current = nextRecord;
|
|
5571
|
-
}
|
|
5572
|
-
return root;
|
|
5573
|
-
}
|
|
5574
|
-
function resolveTracePresentation(spans, globalTraceDisplay, evalTraceDisplay) {
|
|
5575
|
-
const merged = /* @__PURE__ */ new Map();
|
|
5576
|
-
for (const attribute of globalTraceDisplay?.attributes ?? []) merged.set(attribute.key ?? attribute.path, attribute);
|
|
5577
|
-
for (const attribute of evalTraceDisplay?.attributes ?? []) merged.set(attribute.key ?? attribute.path, attribute);
|
|
5578
|
-
const resolvedAttributes = [];
|
|
5579
|
-
const transformedTrace = spans.map((span) => ({
|
|
5580
|
-
...span,
|
|
5581
|
-
attributes: span.attributes === void 0 ? void 0 : { ...span.attributes }
|
|
5582
|
-
}));
|
|
5583
|
-
for (const attribute of merged.values()) {
|
|
5584
|
-
const resolvedPath = attribute.transform ? `__display.${attribute.key ?? attribute.path}` : attribute.path;
|
|
5585
|
-
resolvedAttributes.push({
|
|
5586
|
-
key: attribute.key,
|
|
5587
|
-
path: resolvedPath,
|
|
5588
|
-
label: attribute.label,
|
|
5589
|
-
format: attribute.format,
|
|
5590
|
-
numberFormat: attribute.numberFormat,
|
|
5591
|
-
placements: attribute.placements,
|
|
5592
|
-
scope: attribute.scope,
|
|
5593
|
-
mode: attribute.mode
|
|
5594
|
-
});
|
|
5595
|
-
if (!attribute.transform) continue;
|
|
5596
|
-
for (const span of transformedTrace) {
|
|
5597
|
-
const sourceValue = getNestedAttribute(span.attributes, attribute.path);
|
|
5598
|
-
if (sourceValue === void 0) continue;
|
|
5599
|
-
const transformedValue = attribute.transform({
|
|
5600
|
-
value: sourceValue,
|
|
5601
|
-
span
|
|
5602
|
-
});
|
|
5603
|
-
if (transformedValue === void 0) continue;
|
|
5604
|
-
span.attributes = mergeNestedAttribute(span.attributes, resolvedPath, transformedValue);
|
|
5605
|
-
}
|
|
5606
|
-
}
|
|
5607
|
-
return {
|
|
5608
|
-
trace: transformedTrace,
|
|
5609
|
-
traceDisplay: { attributes: resolvedAttributes }
|
|
5610
|
-
};
|
|
5611
|
-
}
|
|
5612
|
-
//#endregion
|
|
5613
5785
|
//#region ../runner/src/runExecution.ts
|
|
5614
5786
|
function filterEvalCases(cases, caseIds) {
|
|
5615
5787
|
if (!caseIds || caseIds.length === 0) return cases;
|
|
@@ -5639,8 +5811,54 @@ function buildScopedEvalIdPrefix(params) {
|
|
|
5639
5811
|
/**
 * Invoke `fn` with the given argument list and await whatever it returns.
 *
 * @param fn Callable, invoked with `this` set to `undefined`.
 * @param args Array of arguments passed to `fn`.
 * @returns The awaited result of the call.
 */
async function callWithUnknownResult(fn, args) {
	const result = await Reflect.apply(fn, void 0, args);
	return result;
}
|
|
5814
|
+
/**
 * Invoke a value of unknown type as a function and await its result.
 *
 * @param fn Value expected to be a function; invoked with `this` set to `undefined`.
 * @param args Array of arguments passed to `fn`.
 * @returns The awaited result of the call.
 * @throws {Error} "Expected a function" when `fn` is not callable.
 */
async function callUnknownFunction(fn, args) {
	const isCallable = typeof fn === "function";
	if (!isCallable) throw new Error("Expected a function");
	const result = await Reflect.apply(fn, void 0, args);
	return result;
}
|
|
5818
|
+
/**
 * Copy derived values into the outputs object without overriding existing outputs.
 *
 * Only an output's own properties count as "already set". The previous `in`
 * check also matched inherited members, so a derived key such as `toString`
 * or `constructor` was silently discarded. Values are written with
 * `Object.defineProperty` so every key, including `__proto__`, becomes a
 * plain own data property.
 *
 * @param params.outputs Outputs object, mutated in place.
 * @param params.derived Derived key/value pairs to add where no output exists.
 */
function assignDerivedOutputs(params) {
	for (const [key, value] of Object.entries(params.derived)) {
		if (Object.hasOwn(params.outputs, key)) continue;
		Object.defineProperty(params.outputs, key, {
			value,
			writable: true,
			enumerable: true,
			configurable: true
		});
	}
}
|
|
5824
|
+
/**
 * Evaluate a `deriveFromTracing` config against a case's trace.
 *
 * The config is either a single function returning an object of derived
 * values, or an object mapping output keys to per-key compute functions.
 * Per-key functions run one after another in entry order, and results that
 * are `undefined` are left out.
 *
 * @param params.deriveFromTracing Function or key-to-function map to evaluate.
 * @param params.traceTree Trace passed to the callbacks as `trace`.
 * @param params.evalCase Case passed as `case`; its `input` is passed as `input`.
 * @returns The derived key/value object.
 * @throws {Error} When the function form returns something `isRecord` rejects,
 *   or when a map entry is not a function.
 */
async function resolveDeriveFromTracingConfig(params) {
	const { deriveFromTracing, traceTree, evalCase } = params;
	const ctx = {
		trace: traceTree,
		input: evalCase.input,
		case: evalCase
	};
	if (typeof deriveFromTracing !== "function") {
		const collected = {};
		for (const [key, compute] of Object.entries(deriveFromTracing)) {
			const value = await callUnknownFunction(compute, [ctx]);
			if (value !== void 0) collected[key] = value;
		}
		return collected;
	}
	const result = await callUnknownFunction(deriveFromTracing, [ctx]);
	if (!isRecord(result)) throw new Error("deriveFromTracing must return an object");
	return result;
}
|
|
5842
|
+
/**
 * Run one `deriveFromTracing` config inside the case scope and merge its outputs.
 *
 * Does nothing when no config is given. The config is resolved inside the
 * existing eval scope under the "derive" phase and its results are handed to
 * `assignDerivedOutputs` together with `scope.outputs`. Any error thrown
 * along the way is recorded on `scope.assertionFailures` instead of being
 * rethrown.
 *
 * @param params.deriveFromTracing Optional derive config (function or map).
 * @param params.scope Case scope providing `outputs` and `assertionFailures`.
 * @param params.traceTree Trace forwarded to the derive callbacks.
 * @param params.evalCase Case forwarded to the derive callbacks.
 */
async function runDeriveFromTracingConfig(params) {
	const { deriveFromTracing, scope, traceTree, evalCase } = params;
	if (deriveFromTracing === void 0) return;
	try {
		const derived = await runInExistingEvalScope(scope, "derive", async () => {
			const resolved = await resolveDeriveFromTracingConfig({
				deriveFromTracing,
				traceTree,
				evalCase
			});
			return resolved;
		});
		assignDerivedOutputs({
			outputs: scope.outputs,
			derived
		});
	} catch (e) {
		const thrownError = e instanceof Error ? e : void 0;
		const detail = thrownError !== void 0 ? thrownError.message : String(e);
		const message = `deriveFromTracing threw: ${detail}`;
		scope.assertionFailures.push(toAssertionFailure(message, thrownError));
	}
}
|
|
5642
5860
|
async function runCase(params) {
|
|
5643
|
-
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode,
|
|
5861
|
+
const { evalDef, evalId, evalKey = evalId, evalCase, globalTraceDisplay, globalColumns, globalDeriveFromTracing, llmCallsConfig = resolveLlmCallsConfig(void 0), apiCallsConfig = resolveApiCallsConfig(void 0), globalRemoveDefaultConfig, trial, startTime, cacheAdapter, cacheMode, moduleIsolation, evalFilePath, evalFileRelativePath = evalFilePath, workspaceRoot, artifactDir, runId } = params;
|
|
5644
5862
|
const scopedIdPrefix = buildScopedEvalIdPrefix({
|
|
5645
5863
|
evalId,
|
|
5646
5864
|
evalFilePath,
|
|
@@ -5672,7 +5890,8 @@ async function runCase(params) {
|
|
|
5672
5890
|
adapter: cacheAdapter,
|
|
5673
5891
|
mode: cacheMode,
|
|
5674
5892
|
evalId,
|
|
5675
|
-
|
|
5893
|
+
read: evalDef.cache?.read,
|
|
5894
|
+
store: evalDef.cache?.store
|
|
5676
5895
|
} : void 0,
|
|
5677
5896
|
startTime: evalDef.startTime,
|
|
5678
5897
|
freezeTime: evalDef.freezeTime
|
|
@@ -5685,22 +5904,19 @@ async function runCase(params) {
|
|
|
5685
5904
|
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
5686
5905
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5687
5906
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5688
|
-
if (!nonAssertError
|
|
5689
|
-
|
|
5690
|
-
|
|
5691
|
-
|
|
5692
|
-
|
|
5693
|
-
|
|
5694
|
-
|
|
5695
|
-
|
|
5696
|
-
|
|
5697
|
-
|
|
5698
|
-
|
|
5699
|
-
|
|
5700
|
-
}
|
|
5701
|
-
const message = `deriveFromTracing threw: ${e instanceof Error ? e.message : String(e)}`;
|
|
5702
|
-
scope.assertionFailures.push(toAssertionFailure(message, e instanceof Error ? e : void 0));
|
|
5703
|
-
}
|
|
5907
|
+
if (!nonAssertError) {
|
|
5908
|
+
await runDeriveFromTracingConfig({
|
|
5909
|
+
deriveFromTracing: globalDeriveFromTracing,
|
|
5910
|
+
scope,
|
|
5911
|
+
traceTree,
|
|
5912
|
+
evalCase
|
|
5913
|
+
});
|
|
5914
|
+
await runDeriveFromTracingConfig({
|
|
5915
|
+
deriveFromTracing: evalDef.deriveFromTracing,
|
|
5916
|
+
scope,
|
|
5917
|
+
traceTree,
|
|
5918
|
+
evalCase
|
|
5919
|
+
});
|
|
5704
5920
|
}
|
|
5705
5921
|
if (!nonAssertError) addDefaultOutputs({
|
|
5706
5922
|
outputs: scope.outputs,
|
|
@@ -5717,7 +5933,7 @@ async function runCase(params) {
|
|
|
5717
5933
|
...scope.outputs,
|
|
5718
5934
|
...parsedOutputs.data
|
|
5719
5935
|
};
|
|
5720
|
-
else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error)));
|
|
5936
|
+
else scope.assertionFailures.push(toAssertionFailure(formatOutputsSchemaError(parsedOutputs.error), void 0, "OutputsSchemaError"));
|
|
5721
5937
|
}
|
|
5722
5938
|
const scoreResults = /* @__PURE__ */ new Map();
|
|
5723
5939
|
const scoringTraces = {};
|
|
@@ -5740,7 +5956,8 @@ async function runCase(params) {
|
|
|
5740
5956
|
adapter: cacheAdapter,
|
|
5741
5957
|
mode: cacheMode,
|
|
5742
5958
|
evalId: `${evalId}__score__${key}`,
|
|
5743
|
-
|
|
5959
|
+
read: evalDef.cache?.read,
|
|
5960
|
+
store: evalDef.cache?.store
|
|
5744
5961
|
} : void 0,
|
|
5745
5962
|
startTime: scoreStartTime,
|
|
5746
5963
|
freezeTime: evalDef.freezeTime
|
|
@@ -5795,6 +6012,7 @@ async function runCase(params) {
|
|
|
5795
6012
|
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
|
|
5796
6013
|
const columns = {};
|
|
5797
6014
|
const columnOverrides = mergeDefaultColumns({
|
|
6015
|
+
globalColumns,
|
|
5798
6016
|
columns: evalDef.columns,
|
|
5799
6017
|
globalRemove: globalRemoveDefaultConfig,
|
|
5800
6018
|
evalRemove: evalDef.removeDefaultConfig
|
|
@@ -5858,14 +6076,17 @@ function formatOutputsSchemaError(error) {
|
|
|
5858
6076
|
const issueLines = error.issues.map((issue) => {
|
|
5859
6077
|
return `${issue.path.length > 0 ? issue.path.join(".") : "<root>"}: ${issue.message}`;
|
|
5860
6078
|
});
|
|
5861
|
-
if (issueLines.length === 0) return "
|
|
5862
|
-
return
|
|
6079
|
+
if (issueLines.length === 0) return "outputs did not match the configured schema";
|
|
6080
|
+
return issueLines.join("\n");
|
|
5863
6081
|
}
|
|
5864
|
-
function toAssertionFailure(message, error = void 0) {
|
|
5865
|
-
|
|
6082
|
+
/**
 * Build an assertion-failure record from a message and an optional error.
 *
 * `name` and `stack` are added only when they are available, so the record
 * never carries those keys with an `undefined` value.
 *
 * @param message Failure message stored on the record.
 * @param error Optional error supplying `name` and `stack`.
 * @param nameOverride Optional name that takes precedence over `error.name`.
 * @returns `{ name?, message, stack? }`, with the stack passed through
 *   `stripTerminalControlCodes`.
 */
function toAssertionFailure(message, error = void 0, nameOverride = void 0) {
	const name = nameOverride ?? error?.name;
	const stack = error?.stack ? stripTerminalControlCodes(error.stack) : void 0;
	const failure = {};
	// Keys are added in name, message, stack order.
	if (name !== void 0) failure.name = name;
	failure.message = message;
	if (stack !== void 0) failure.stack = stack;
	return failure;
}
|
|
5870
6091
|
//#endregion
|
|
5871
6092
|
//#region ../runner/src/runQueue.ts
|
|
@@ -6095,15 +6316,15 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6095
6316
|
const apiCallsConfig = resolveApiCallsConfig(config.apiCalls);
|
|
6096
6317
|
for (const evalMeta of targetEvals) {
|
|
6097
6318
|
const evalFilePath = evalMeta.sourceFilePath;
|
|
6098
|
-
let
|
|
6319
|
+
let sourceFingerprint = "";
|
|
6099
6320
|
try {
|
|
6100
|
-
|
|
6321
|
+
sourceFingerprint = getSourceFingerprint(await readFile(evalFilePath, "utf-8"));
|
|
6101
6322
|
} catch {
|
|
6102
|
-
|
|
6323
|
+
sourceFingerprint = "";
|
|
6103
6324
|
}
|
|
6104
|
-
if (
|
|
6105
|
-
runState.manifest.evalSourceFingerprints[evalMeta.key] =
|
|
6106
|
-
evalMeta.sourceFingerprint =
|
|
6325
|
+
if (sourceFingerprint.length > 0) {
|
|
6326
|
+
runState.manifest.evalSourceFingerprints[evalMeta.key] = sourceFingerprint;
|
|
6327
|
+
evalMeta.sourceFingerprint = sourceFingerprint;
|
|
6107
6328
|
} else {
|
|
6108
6329
|
delete runState.manifest.evalSourceFingerprints[evalMeta.key];
|
|
6109
6330
|
evalMeta.sourceFingerprint = null;
|
|
@@ -6112,7 +6333,7 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6112
6333
|
const registry = getEvalRegistry();
|
|
6113
6334
|
await runWithModuleIsolation(moduleIsolation, async () => {
|
|
6114
6335
|
await runInEvalRuntimeScope("env", async () => {
|
|
6115
|
-
await loadEvalModule(evalFilePath,
|
|
6336
|
+
await loadEvalModule(evalFilePath, sourceFingerprint);
|
|
6116
6337
|
});
|
|
6117
6338
|
});
|
|
6118
6339
|
const entry = registry.get(evalMeta.id);
|
|
@@ -6136,6 +6357,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6136
6357
|
runState.summary.totalCases += cases.length;
|
|
6137
6358
|
const defaultConfig = resolveEvalDefaultConfig({
|
|
6138
6359
|
evalDef,
|
|
6360
|
+
globalColumns: config.columns,
|
|
6361
|
+
globalStats: config.stats,
|
|
6139
6362
|
globalRemove: config.removeDefaultConfig
|
|
6140
6363
|
});
|
|
6141
6364
|
const declaredColumnDefs = buildDeclaredColumnDefs(defaultConfig.columns, evalDef.scores, evalDef.manualScores);
|
|
@@ -6181,6 +6404,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6181
6404
|
evalKey: evalMeta.key,
|
|
6182
6405
|
evalCase,
|
|
6183
6406
|
globalTraceDisplay,
|
|
6407
|
+
globalColumns: config.columns,
|
|
6408
|
+
globalDeriveFromTracing: config.deriveFromTracing,
|
|
6184
6409
|
llmCallsConfig,
|
|
6185
6410
|
apiCallsConfig,
|
|
6186
6411
|
globalRemoveDefaultConfig: config.removeDefaultConfig,
|
|
@@ -6188,7 +6413,6 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
6188
6413
|
startTime,
|
|
6189
6414
|
cacheAdapter: bufferedCacheStore ?? (cacheEnabled ? cacheStore : null),
|
|
6190
6415
|
cacheMode,
|
|
6191
|
-
codeFingerprint,
|
|
6192
6416
|
moduleIsolation,
|
|
6193
6417
|
evalFilePath,
|
|
6194
6418
|
evalFileRelativePath: evalMeta.filePath,
|
|
@@ -6343,4 +6567,4 @@ function toLastRunStatus(status) {
|
|
|
6343
6567
|
return status === "pending" ? null : status;
|
|
6344
6568
|
}
|
|
6345
6569
|
//#endregion
|
|
6346
|
-
export {
|
|
6570
|
+
export { llmCallMetricFormatSchema as $, traceAttributeDisplayPlacementSchema as $t, extractCacheHits as A, advanceEvalTime as An, evalChartBuiltinMetricSchema as At, runManifestSchema as B, mergeEvalOutput as Bn, cacheEntryWithDebugKeySchema as Bt, normalizeScoreDef as C, deserializeCacheRecording as Cn, runLogEntrySchema as Ct, updateManualScoreRequestSchema as D, repoFile as Dn, scoreTraceSchema as Dt, createRunRequestSchema as E, serializeCacheValue as En, runLogPhaseSchema as Et, getEvalTitle as F, getCurrentScope as Fn, evalChartTypeSchema as Ft, apiCallMetricFormatSchema as G, setEvalOutput as Gn, cacheRecordingOpSchema as Gt, DEFAULT_API_CALLS_CONFIG as H, runInEvalRuntimeScope as Hn, cacheListItemSchema as Ht, getEvalDisplayStatus as I, getEvalCaseInput as In, evalChartsConfigSchema as It, apiCallsConfigSchema as J, defineEval as Jn, serializedCacheSpanSchema as Jt, apiCallMetricPlacementSchema as K, setScopeCacheContext as Kn, cacheRecordingSchema as Kt, deriveScopedSummaryFromCases as L, getEvalStartTime as Ln, cacheDebugKeyEntrySchema as Lt, extractLlmCalls as M, configureEvalRunLogs as Mn, evalChartConfigSchema as Mt, applyDerivedCallAttributes as N, evalAssert as Nn, evalChartMetricSchema as Nt, sseEnvelopeSchema as O, evalExpect as On, evalChartAggregateSchema as Ot, getNestedAttribute as P, evalLog as Pn, evalChartTooltipExtraSchema as Pt, evalDeriveConfigSchema as Q, traceAttributeDisplayInputSchema as Qt, deriveStatusFromCaseRows as R, incrementEvalOutput as Rn, cacheDebugKeyFileSchema as Rt, buildDeclaredColumnDefs as S, hashCacheKeySync as Sn, evalSummarySchema as St, createFsCacheStore as T, serializeCacheRecording as Tn, runLogLocationSchema as Tt, DEFAULT_LLM_CALLS_CONFIG as U, runInEvalScope as Un, cacheModeSchema as Ut, runSummarySchema as V, nextEvalId as Vn, cacheFileSchema as Vt, agentEvalsConfigSchema as W, runInExistingEvalScope as Wn, cacheOperationTypeSchema as Wt, evalColumnOverrideSchema as X, traceCacheRefSchema as Xt, 
defaultConfigKeySchema as Y, getEvalRegistry as Yn, spanCacheOptionsSchema as Yt, evalColumnsSchema as Z, traceAttributeDisplayFormatSchema as Zt, deriveEvalFreshness as _, buildTraceTree as _n, discoveryIssueSchema as _t, getLastRunStatuses as a, traceSpanSchema as an, removeDefaultConfigSchema as at, resolveEvalDefaultConfig as b, evalTracer as bn, evalStatItemSchema as bt, loadPersistedRunSnapshots as c, columnDefSchema as cn, runLogsConfigSchema as ct, persistRunState as d, fileRefSchema as dn, buildEvalKey as dt, traceAttributeDisplaySchema as en, llmCallMetricPlacementSchema as et, recomputeEvalStatusesInRuns as f, jsonCellSchema as fn, getCaseRowCaseKey as ft, resolveArtifactPath as g, z$1 as gn, caseRowSchema as gt, resolveTracePresentation as h, runArtifactRefSchema as hn, caseDetailSchema as ht, generateRunId as i, traceSpanKindSchema as in, llmCallsConfigSchema as it, extractApiCalls as j, appendToEvalOutput as jn, evalChartColorSchema as jt, extractCacheEntries as k, EvalAssertionError as kn, evalChartAxisSchema as kt, nextShortIdFromSnapshots as l, columnFormatSchema as ln, trialSelectionModeSchema as lt, runTouchesEval as m, repoFileRefSchema as mn, assertionFailureSchema as mt, getTargetEvalKeys as n, traceDisplayInputConfigSchema as nn, llmCallPricingRateSchema as nt, getLatestRunInfos as o, traceSpanWarningSchema as on, resolveApiCallsConfig as ot, recomputePersistedCaseStatus as p, numberDisplayOptionsSchema as pn, getCaseRowEvalKey as pt, apiCallMetricSchema as q, startEvalBackgroundJob as qn, cacheStatusSchema as qt, getTargetEvals as r, traceSpanErrorSchema as rn, llmCallPricingSchema as rt, loadPersistedRunSnapshot as s, cellValueSchema as sn, resolveLlmCallsConfig as st, executeRun as t, traceDisplayConfigSchema as tn, llmCallMetricSchema as tt, persistCaseDetail as u, columnKindSchema as un, buildCaseKey as ut, loadEvalModule as v, captureEvalSpanError as vn, evalFreshnessStatusSchema as vt, validateCharts as w, deserializeCacheValue as wn, 
runLogLevelSchema as wt, loadConfig as x, hashCacheKey as xn, evalStatsConfigSchema as xt, parseEvalDiscovery as y, evalSpan as yn, evalStatAggregateSchema as yt, deriveStatusFromChildStatuses as z, isInEvalScope as zn, cacheEntrySchema as zt };
|