@ls-stack/agent-eval 0.22.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-moDHbg1O.mjs → app-DYRmucgj.mjs} +3 -3
- package/dist/apps/web/dist/assets/{index-AUDD3rNB.js → index-KbbX3NYr.js} +35 -35
- package/dist/apps/web/dist/index.html +1 -1
- package/dist/bin.mjs +1 -1
- package/dist/{cli-C0EtHhEO.mjs → cli-Be0x8CS3.mjs} +3 -3
- package/dist/index.d.mts +106 -9
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-D1edUDhp.mjs → runOrchestration-D697g6Qe.mjs} +281 -42
- package/dist/{runner-C9nP2VKL.mjs → runner-B4SosWgD.mjs} +2 -2
- package/dist/{runner-CyRhIzci.mjs → runner-jSujaSKt.mjs} +1 -1
- package/dist/src-D6cettg0.mjs +3 -0
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +21 -5
- package/dist/src-D-HuV8I-.mjs +0 -3
|
@@ -49,9 +49,46 @@ function repoFile(path, mimeType) {
|
|
|
49
49
|
//#region ../sdk/src/runtime.ts
|
|
50
50
|
const scopeStorage = new AsyncLocalStorage();
|
|
51
51
|
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
52
|
+
const evalClockStorage = new AsyncLocalStorage();
|
|
52
53
|
let activeEvalScopeCount = 0;
|
|
53
54
|
let activeEvalRuntimeScopeCount = 0;
|
|
54
55
|
let consoleCaptureEnabled = true;
|
|
56
|
+
// Epoch milliseconds for 2026-04-10T00:00:00.000Z; resolveEvalStartTimeMs returns this when no startTime is supplied.
const defaultEvalStartTimeMs = Date.parse("2026-04-10T00:00:00.000Z");
|
|
57
|
+
// Reuse a constructor already stashed on globalThis if there is one; otherwise capture the current global Date.
const realDate = globalThis.__agentEvalsRealDate ?? Date;
|
|
58
|
+
// Stash the captured constructor so a later evaluation of this module picks it up instead of whatever globalThis.Date is by then.
globalThis.__agentEvalsRealDate = realDate;
|
|
59
|
+
/**
 * Normalize the single argument of a one-argument `new Date(value)` call.
 *
 * Strings and numbers are returned as-is. Objects (Date instances, boxed
 * strings, objects exposing `valueOf`/`Symbol.toPrimitive`, ...) are also
 * returned untouched so the native Date constructor applies its own
 * ToPrimitive conversion; forcing them through `Number()` would turn an
 * object whose primitive is a date string into `NaN` (Invalid Date).
 * Remaining primitives (`undefined`, `null`, booleans, ...) are converted
 * with `Number()`.
 *
 * @param {unknown} value - Raw constructor argument.
 * @returns {unknown} Value that is safe to hand to the native Date constructor.
 */
function toDateConstructorArg(value) {
	const valueType = typeof value;
	if (valueType === "string" || valueType === "number") return value;
	// Let the native constructor resolve objects (including Date instances) itself.
	if (value !== null && (valueType === "object" || valueType === "function")) return value;
	return Number(value);
}
|
|
63
|
+
/**
 * Coerce one component of a multi-argument Date constructor call to a number.
 *
 * @param {unknown} value - Raw component (year, month, day, ...).
 * @returns {number} The component as a number; numbers pass through unchanged.
 */
function toDateNumberArg(value) {
	// Number() is the identity on values that are already numbers.
	return Number(value);
}
|
|
66
|
+
/**
 * Build a real Date from an argument list shaped like a Date constructor call.
 *
 * - no arguments: the real current time;
 * - one argument: forwarded after `toDateConstructorArg`;
 * - two or more: treated as (year, monthIndex, day, hours, minutes, seconds,
 *   ms); an `undefined` day falls back to 1 and the other `undefined`
 *   components fall back to 0.
 *
 * @param {unknown[]} args - Constructor arguments.
 * @returns {Date} A Date created with the real constructor.
 */
function constructDateFromArgs(args) {
	const argCount = args.length;
	if (argCount === 0) return new realDate();
	if (argCount === 1) return new realDate(toDateConstructorArg(args[0]));
	const year = toDateNumberArg(args[0]);
	const monthIndex = toDateNumberArg(args[1]);
	// Fallbacks for day, hours, minutes, seconds and milliseconds, in that order.
	const fallbacks = [1, 0, 0, 0, 0];
	const rest = fallbacks.map((fallback, offset) => {
		const raw = args[offset + 2];
		return raw === void 0 ? fallback : toDateNumberArg(raw);
	});
	return new realDate(year, monthIndex, rest[0], rest[1], rest[2], rest[3], rest[4]);
}
|
|
71
|
+
/**
 * Proxy around the real Date constructor that reads "now" from the eval clock.
 *
 * While `getEvalClockNowMs()` returns a value, `Date()`, `new Date()` and
 * `Date.now()` report that time; otherwise they defer to the real Date.
 * Calls with explicit arguments always build a Date from those arguments.
 */
const evalDate = new Proxy(realDate, {
	/** `Date()` called as a plain function: returns the current time as a string. */
	apply(target, thisArg) {
		const nowMs = getEvalClockNowMs();
		if (nowMs !== null) return new target(nowMs).toString();
		return target.call(thisArg);
	},
	/** `new Date(...)`, including `super(...)` calls from subclasses. */
	construct(target, argArray, newTarget) {
		const nowMs = getEvalClockNowMs();
		const instance = argArray.length === 0 && nowMs !== null ? new target(nowMs) : constructDateFromArgs(Array.from(argArray));
		// Honor `class X extends Date`: without this the subclass instance would
		// come back with Date.prototype and lose the subclass methods.
		const requestedPrototype = newTarget.prototype;
		if (requestedPrototype !== target.prototype && typeof requestedPrototype === "object" && requestedPrototype !== null) Object.setPrototypeOf(instance, requestedPrototype);
		return instance;
	},
	/** Static property access: only `now` is overridden. */
	get(target, property) {
		if (property === "now") return getEvalDateNow;
		// Forward everything else (parse, UTC, prototype, name, length, toString,
		// symbols, ...) to the real constructor instead of answering undefined.
		return Reflect.get(target, property);
	}
});
|
|
91
|
+
// Replace the global Date with the eval-clock-aware proxy defined above.
globalThis.Date = evalDate;
|
|
55
92
|
const maxLogMessageLength = 2e4;
|
|
56
93
|
const maxLogStringLength = 1e4;
|
|
57
94
|
const maxLogArrayLength = 100;
|
|
@@ -79,6 +116,82 @@ var EvalAssertionError = class extends Error {
|
|
|
79
116
|
this.name = "EvalAssertionError";
|
|
80
117
|
}
|
|
81
118
|
};
|
|
119
|
+
/**
 * Compute the current time of an eval clock state, in epoch milliseconds.
 *
 * The result is `startMs` plus the real time elapsed since `realStartMs`
 * (zero when the state is frozen) plus the accumulated `offsetMs`.
 *
 * @param {{ startMs: number, realStartMs: number, offsetMs: number, frozen: boolean }} state
 * @returns {number}
 */
function getEvalClockStateNowMs(state) {
	let elapsedMs = 0;
	// A frozen clock never consults the real clock.
	if (!state.frozen) elapsedMs = realDate.now() - state.realStartMs;
	return state.startMs + elapsedMs + state.offsetMs;
}
|
|
123
|
+
/**
 * Current time of the active eval clock, or `null` when there is no clock in
 * the async context or its `shifted` flag is not `true`.
 *
 * @returns {number | null}
 */
function getEvalClockNowMs() {
	const activeClock = evalClockStorage.getStore();
	const isShifted = activeClock !== void 0 && activeClock !== null && activeClock.shifted === true;
	return isShifted ? getEvalClockStateNowMs(activeClock) : null;
}
|
|
128
|
+
/**
 * Implementation behind the proxied `Date.now`: the eval clock time when one
 * is available, otherwise the real clock.
 *
 * @returns {number} Epoch milliseconds.
 */
function getEvalDateNow() {
	const shiftedNowMs = getEvalClockNowMs();
	if (shiftedNowMs === null || shiftedNowMs === void 0) return realDate.now();
	return shiftedNowMs;
}
|
|
131
|
+
/**
 * Read the current time from the real Date constructor rather than the eval
 * Date shim.
 *
 * @returns {number} Epoch milliseconds.
 */
function getRealDateNowMs() {
	const hostNowMs = realDate.now();
	return hostNowMs;
}
|
|
135
|
+
/**
 * Current time of a stored eval clock state, or `null` when that state is
 * not shifted.
 *
 * @param {{ shifted: boolean }} state - Stored eval clock state.
 * @returns {number | null}
 */
function getEvalClockStateTimeMs(state) {
	return state.shifted ? getEvalClockStateNowMs(state) : null;
}
|
|
140
|
+
/**
 * Return the start time recorded on the active eval clock as a real Date.
 *
 * With `startTime: 'now'` the recorded value is the real time captured when
 * the clock state was created.
 *
 * @returns {Date}
 * @throws {Error} When no eval clock is present in the current async context.
 */
function getEvalStartTime() {
	const activeClock = evalClockStorage.getStore();
	if (activeClock !== void 0) return new realDate(activeClock.startMs);
	throw new Error("getEvalStartTime() must be called inside an active eval");
}
|
|
151
|
+
/**
 * Turn a user-supplied `startTime` into epoch milliseconds.
 *
 * `undefined` yields the default start time, `"now"` the real current time,
 * a Date its timestamp, a number itself, and anything else is run through
 * `Date.parse`.
 *
 * @param {Date | number | string | undefined} startTime
 * @returns {number}
 * @throws {Error} When the resolved value is not a finite number.
 */
function resolveEvalStartTimeMs(startTime) {
	if (startTime === void 0) return defaultEvalStartTimeMs;
	if (startTime === "now") return realDate.now();
	let timestampMs;
	if (startTime instanceof realDate) timestampMs = startTime.getTime();
	else if (typeof startTime === "number") timestampMs = startTime;
	else timestampMs = realDate.parse(startTime);
	if (!Number.isFinite(timestampMs)) throw new Error(`Invalid eval startTime "${String(startTime)}". Use a Date, timestamp, ISO date string, or "now".`);
	return timestampMs;
}
|
|
158
|
+
/**
 * Create a fresh eval clock state.
 *
 * The real clock is sampled once and used both as `realStartMs` and, for
 * `startTime: "now"`, as `startMs`. The clock counts as shifted unless it
 * starts at "now" without being frozen.
 *
 * @param {Date | number | string | undefined} startTime
 * @param {boolean} freezeTime
 * @returns {{ startMs: number, realStartMs: number, offsetMs: number, frozen: boolean, shifted: boolean }}
 */
function createEvalClockState(startTime, freezeTime) {
	const realNowMs = realDate.now();
	const startsAtRealNow = startTime === "now";
	const startMs = startsAtRealNow ? realNowMs : resolveEvalStartTimeMs(startTime);
	return {
		startMs,
		realStartMs: realNowMs,
		offsetMs: 0,
		frozen: freezeTime,
		shifted: !startsAtRealNow || freezeTime
	};
}
|
|
168
|
+
/**
 * Run `fn` inside a new eval clock context built from `startTime`.
 *
 * @param {Date | number | string | undefined} startTime
 * @param {() => unknown} fn - Callback executed inside the clock context.
 * @param {{ freezeTime?: boolean }} [options] - Only `freezeTime === true` freezes the clock.
 * @returns {Promise<unknown>} Whatever `fn` resolves to.
 */
async function runWithEvalClock(startTime, fn, options = {}) {
	const shouldFreeze = options.freezeTime === true;
	const clockState = createEvalClockState(startTime, shouldFreeze);
	return await evalClockStorage.run(clockState, fn);
}
|
|
172
|
+
/**
 * Milliseconds contained in one eval time unit.
 *
 * @param {string} unit - Singular or plural: millisecond(s), second(s), minute(s), hour(s), day(s).
 * @returns {number}
 * @throws {Error} For any other unit.
 */
function getEvalTimeUnitMs(unit) {
	switch (unit) {
		case "millisecond":
		case "milliseconds":
			return 1;
		case "second":
		case "seconds":
			return 1e3;
		case "minute":
		case "minutes":
			return 6e4;
		case "hour":
		case "hours":
			return 36e5;
		case "day":
		case "days":
			return 864e5;
		default:
			throw new Error(`Unsupported eval time unit "${unit}"`);
	}
}
|
|
180
|
+
/**
 * Move the active eval clock by `amount` units and return the resulting time.
 *
 * Requires a shifted eval clock: an eval using `startTime: 'now'` without
 * `freezeTime: true` is not shifted and is rejected.
 *
 * @param {string} unit - Unit accepted by `getEvalTimeUnitMs`.
 * @param {number} amount - Finite number of units to add.
 * @returns {Date} The clock's time after the adjustment, as a real Date.
 * @throws {Error} Outside an eval, on a non-shifted clock, or for a non-finite amount.
 */
function advanceEvalTime(unit, amount) {
	const activeClock = evalClockStorage.getStore();
	if (activeClock === void 0) throw new Error("advanceEvalTime() must be called inside an active eval");
	if (!activeClock.shifted) throw new Error("advanceEvalTime() requires a shifted eval clock. Remove startTime: \"now\" or set freezeTime: true to use it.");
	if (!Number.isFinite(amount)) throw new Error("advanceEvalTime() amount must be a finite number");
	const deltaMs = getEvalTimeUnitMs(unit) * amount;
	activeClock.offsetMs += deltaMs;
	const advancedMs = getEvalClockStateNowMs(activeClock);
	return new realDate(advancedMs);
}
|
|
82
195
|
/** Return the current eval scope for the active async context, if any. */
|
|
83
196
|
function getCurrentScope() {
|
|
84
197
|
if (activeEvalScopeCount === 0) return void 0;
|
|
@@ -349,7 +462,9 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
|
|
|
349
462
|
activeEvalScopeCount++;
|
|
350
463
|
try {
|
|
351
464
|
return await scopeStorage.run(scope, async () => {
|
|
352
|
-
return await
|
|
465
|
+
return await evalClockStorage.run(scope.evalClockState, async () => {
|
|
466
|
+
return await runInEvalRuntimeScope(runtimeScope, fn);
|
|
467
|
+
});
|
|
353
468
|
});
|
|
354
469
|
} finally {
|
|
355
470
|
activeEvalScopeCount--;
|
|
@@ -362,6 +477,8 @@ async function runInExistingEvalScope(scope, runtimeScope, fn) {
|
|
|
362
477
|
async function runInEvalScope(caseId, fn, options = {}) {
|
|
363
478
|
const scope = {
|
|
364
479
|
caseId,
|
|
480
|
+
startTime: options.startTime,
|
|
481
|
+
evalClockState: createEvalClockState(options.startTime, options.freezeTime === true),
|
|
365
482
|
idPrefix: options.idPrefix,
|
|
366
483
|
nextEvalIdCounter: 0,
|
|
367
484
|
input: options.input,
|
|
@@ -1213,7 +1330,7 @@ const errorCoreFields = new Set([
|
|
|
1213
1330
|
"stack",
|
|
1214
1331
|
"capturedAt"
|
|
1215
1332
|
]);
|
|
1216
|
-
function isRecord$
|
|
1333
|
+
/**
 * Whether `value` is a non-null object that is not an array.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord$5(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
|
|
1219
1336
|
function formatUnknownErrorMessage(error) {
|
|
@@ -1241,7 +1358,7 @@ function normalizeTraceError(error, capturedAt = void 0) {
|
|
|
1241
1358
|
stack: error.stack,
|
|
1242
1359
|
capturedAt
|
|
1243
1360
|
};
|
|
1244
|
-
if (isRecord$
|
|
1361
|
+
if (isRecord$5(error)) {
|
|
1245
1362
|
const extraFields = getErrorExtraFields(error);
|
|
1246
1363
|
const name = typeof error.name === "string" ? error.name : void 0;
|
|
1247
1364
|
const stack = typeof error.stack === "string" ? error.stack : void 0;
|
|
@@ -1266,7 +1383,7 @@ function normalizeTraceWarnings(warningOrWarnings, additionalWarnings, capturedA
|
|
|
1266
1383
|
return (additionalWarnings.length > 0 ? [warningOrWarnings, ...additionalWarnings] : Array.isArray(warningOrWarnings) ? warningOrWarnings : [warningOrWarnings]).map((warning) => normalizeTraceError(warning, capturedAt));
|
|
1267
1384
|
}
|
|
1268
1385
|
function isCaptureEvalSpanErrorOptions(value) {
|
|
1269
|
-
if (!isRecord$
|
|
1386
|
+
if (!isRecord$5(value)) return false;
|
|
1270
1387
|
const keys = Object.keys(value);
|
|
1271
1388
|
if (keys.length === 0) return false;
|
|
1272
1389
|
if (!keys.every((key) => key === "level")) return false;
|
|
@@ -1493,9 +1610,12 @@ function mergeSpanAttribute(span, key, patch) {
|
|
|
1493
1610
|
...patch
|
|
1494
1611
|
} });
|
|
1495
1612
|
}
|
|
1496
|
-
function
|
|
1613
|
+
/**
 * Add a millisecond duration to a timestamp and return the result as an ISO
 * 8601 string.
 *
 * @param {string} isoTimestamp - Timestamp accepted by the Date constructor.
 * @param {number} elapsedMs - Milliseconds to add.
 * @returns {string} `toISOString()` of the shifted time.
 */
function addElapsedMsToTimestamp(isoTimestamp, elapsedMs) {
	const baseMs = new Date(isoTimestamp).getTime();
	const shifted = new Date(baseMs + elapsedMs);
	return shifted.toISOString();
}
|
|
1616
|
+
/**
 * Close a span whose body returned normally.
 *
 * The status becomes "error" when `hasSpanError(span)` is true and "ok"
 * otherwise. `endedAt` is `startedAt` plus the real time elapsed since
 * `realStartedAt`.
 *
 * @param {object} span - Span record to finalize; mutated in place.
 * @param {number} realStartedAt - Real-clock epoch milliseconds captured when the span started.
 */
function finishSpanWithoutThrownError(span, realStartedAt) {
	const failed = hasSpanError(span);
	span.status = failed ? "error" : "ok";
	const realElapsedMs = getRealDateNowMs() - realStartedAt;
	span.endedAt = addElapsedMsToTimestamp(span.startedAt, realElapsedMs);
}
|
|
1500
1620
|
function createSpanHandle(span) {
|
|
1501
1621
|
return {
|
|
@@ -1737,9 +1857,11 @@ async function traceSpanInternal(info, fn) {
|
|
|
1737
1857
|
const scope = getCurrentScope();
|
|
1738
1858
|
if (!scope) return await fn(noopActiveSpan());
|
|
1739
1859
|
const id = generateSpanId();
|
|
1860
|
+
const parentId = scope.activeSpanStack.at(-1)?.id ?? null;
|
|
1861
|
+
const realStartedAt = getRealDateNowMs();
|
|
1740
1862
|
const spanRecord = {
|
|
1741
1863
|
id,
|
|
1742
|
-
parentId
|
|
1864
|
+
parentId,
|
|
1743
1865
|
caseId: scope.caseId,
|
|
1744
1866
|
kind: info.kind,
|
|
1745
1867
|
name: info.name,
|
|
@@ -1779,7 +1901,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
1779
1901
|
const recording = deserializeCacheRecording(hit.recording);
|
|
1780
1902
|
replayRecording(scope, spanRecord, recording, { generateSpanId });
|
|
1781
1903
|
spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
|
|
1782
|
-
spanRecord.endedAt = (
|
|
1904
|
+
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
1783
1905
|
return recording.returnValue;
|
|
1784
1906
|
}
|
|
1785
1907
|
mergeSpanAttributes(spanRecord, { "cache.status": "miss" });
|
|
@@ -1798,7 +1920,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
1798
1920
|
scope.recordingStack.pop();
|
|
1799
1921
|
}
|
|
1800
1922
|
appendSubSpanOps(scope, frame);
|
|
1801
|
-
finishSpanWithoutThrownError(spanRecord);
|
|
1923
|
+
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
1802
1924
|
if (ctx.mode !== "bypass") {
|
|
1803
1925
|
const recording = {
|
|
1804
1926
|
returnValue: bodyResult,
|
|
@@ -1832,11 +1954,11 @@ async function traceSpanInternal(info, fn) {
|
|
|
1832
1954
|
return bodyResult;
|
|
1833
1955
|
}
|
|
1834
1956
|
const result = await fn(activeSpan);
|
|
1835
|
-
finishSpanWithoutThrownError(spanRecord);
|
|
1957
|
+
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
1836
1958
|
return result;
|
|
1837
1959
|
} catch (error) {
|
|
1838
1960
|
spanRecord.status = "error";
|
|
1839
|
-
spanRecord.endedAt = (
|
|
1961
|
+
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
1840
1962
|
spanRecord.error = normalizeTraceError(error);
|
|
1841
1963
|
throw error;
|
|
1842
1964
|
} finally {
|
|
@@ -2605,13 +2727,16 @@ const apiCallMetricFormatSchema = llmCallMetricFormatSchema;
|
|
|
2605
2727
|
const llmCallMetricPlacementSchema = z.enum(["header", "body"]);
|
|
2606
2728
|
/** Where an API-call metric is rendered inside the API calls tab. */
|
|
2607
2729
|
const apiCallMetricPlacementSchema = llmCallMetricPlacementSchema;
|
|
2730
|
+
const callDerivedAttributeSchema = z.custom((value) => typeof value === "function", { message: "Expected a derived attribute function" });
|
|
2608
2731
|
/**
|
|
2609
2732
|
* Schema for a single user-defined metric attached to LLM call rows.
|
|
2610
2733
|
*
|
|
2611
2734
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
2612
|
-
* with the configured `format` and `numberFormat`.
|
|
2613
|
-
*
|
|
2614
|
-
*
|
|
2735
|
+
* with the configured `format` and `numberFormat`. Use
|
|
2736
|
+
* `llmCalls.derivedAttributes` when a metric should read a value computed from
|
|
2737
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
2738
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
2739
|
+
* Defaults to `['body']` when omitted.
|
|
2615
2740
|
*/
|
|
2616
2741
|
const llmCallMetricSchema = z.object({
|
|
2617
2742
|
/** Display label for the metric row or header chip. */
|
|
@@ -2638,9 +2763,11 @@ const llmCallMetricSchema = z.object({
|
|
|
2638
2763
|
* Schema for a single user-defined metric attached to API call rows.
|
|
2639
2764
|
*
|
|
2640
2765
|
* Each metric reads `path` from the span's `attributes` and renders the value
|
|
2641
|
-
* with the configured `format` and `numberFormat`.
|
|
2642
|
-
*
|
|
2643
|
-
*
|
|
2766
|
+
* with the configured `format` and `numberFormat`. Use
|
|
2767
|
+
* `apiCalls.derivedAttributes` when a metric should read a value computed from
|
|
2768
|
+
* other attributes. `placements` controls whether the metric appears as a chip
|
|
2769
|
+
* on the collapsed row header, as a row inside the expanded body, or both.
|
|
2770
|
+
* Defaults to `['body']` when omitted.
|
|
2644
2771
|
*/
|
|
2645
2772
|
const apiCallMetricSchema = z.object({
|
|
2646
2773
|
/** Display label for the metric row or header chip. */
|
|
@@ -2717,6 +2844,13 @@ const llmCallsConfigSchema = z.object({
|
|
|
2717
2844
|
toolCalls: z.string().optional()
|
|
2718
2845
|
}).optional(),
|
|
2719
2846
|
/**
|
|
2847
|
+
* Derived attributes persisted onto every matching LLM span before
|
|
2848
|
+
* `deriveFromTracing`, default outputs, trace display, and call metrics read
|
|
2849
|
+
* the trace. Keys are dot-paths under `span.attributes`; return `undefined`
|
|
2850
|
+
* to skip writing the attribute for one span.
|
|
2851
|
+
*/
|
|
2852
|
+
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
2853
|
+
/**
|
|
2720
2854
|
* Model/provider pricing registry used to calculate LLM-call costs from
|
|
2721
2855
|
* token counts. Built-in LLM cost fields are only derived from this registry.
|
|
2722
2856
|
*/
|
|
@@ -2745,6 +2879,13 @@ const apiCallsConfigSchema = z.object({
|
|
|
2745
2879
|
durationMs: z.string().optional(),
|
|
2746
2880
|
error: z.string().optional()
|
|
2747
2881
|
}).optional(),
|
|
2882
|
+
/**
|
|
2883
|
+
* Derived attributes persisted onto every matching API span before trace
|
|
2884
|
+
* display and call metrics read the trace. Keys are dot-paths under
|
|
2885
|
+
* `span.attributes`; return `undefined` to skip writing the attribute for
|
|
2886
|
+
* one span.
|
|
2887
|
+
*/
|
|
2888
|
+
derivedAttributes: z.record(z.string().min(1), callDerivedAttributeSchema).optional(),
|
|
2748
2889
|
/** Custom user-defined metrics surfaced on each API call. */
|
|
2749
2890
|
metrics: z.array(apiCallMetricSchema).optional()
|
|
2750
2891
|
});
|
|
@@ -2776,6 +2917,7 @@ const DEFAULT_LLM_CALLS_CONFIG = {
|
|
|
2776
2917
|
reasoning: "reasoning",
|
|
2777
2918
|
toolCalls: "toolCalls"
|
|
2778
2919
|
},
|
|
2920
|
+
derivedAttributes: [],
|
|
2779
2921
|
metrics: [],
|
|
2780
2922
|
pricing: []
|
|
2781
2923
|
};
|
|
@@ -2799,8 +2941,35 @@ const DEFAULT_API_CALLS_CONFIG = {
|
|
|
2799
2941
|
durationMs: "durationMs",
|
|
2800
2942
|
error: "error"
|
|
2801
2943
|
},
|
|
2944
|
+
derivedAttributes: [],
|
|
2802
2945
|
metrics: []
|
|
2803
2946
|
};
|
|
2947
|
+
/**
 * Convert a `{ [path]: compute }` record into a list of `{ path, compute }`
 * entries, in the record's own key order. A nullish input yields an empty list.
 *
 * @param {Record<string, Function> | null | undefined} input
 * @returns {{ path: string, compute: Function }[]}
 */
function resolveDerivedAttributes(input) {
	const resolved = [];
	for (const [path, compute] of Object.entries(input ?? {})) resolved.push({
		path,
		compute
	});
	return resolved;
}
|
|
2953
|
+
/**
 * Apply defaults to one user-authored LLM-call metric.
 *
 * `format` falls back to "string"; `placements` is copied into a new array
 * and falls back to `["body"]` when missing.
 *
 * @param {object} metric - Metric as written in the config.
 * @returns {object} Metric with `format` and `placements` always set.
 */
function resolveLlmCallMetric(metric) {
	const { label, tooltip, path, format, numberFormat, placements } = metric;
	return {
		label,
		tooltip,
		path,
		format: format ?? "string",
		numberFormat,
		placements: placements ? [...placements] : ["body"]
	};
}
|
|
2963
|
+
/**
 * Apply defaults to one user-authored API-call metric.
 *
 * `format` falls back to "string"; `placements` is copied into a new array
 * and falls back to `["body"]` when missing.
 *
 * @param {object} metric - Metric as written in the config.
 * @returns {object} Metric with `format` and `placements` always set.
 */
function resolveApiCallMetric(metric) {
	const format = metric.format ?? "string";
	let placements = ["body"];
	if (metric.placements) placements = [...metric.placements];
	return {
		label: metric.label,
		tooltip: metric.tooltip,
		path: metric.path,
		format,
		numberFormat: metric.numberFormat,
		placements
	};
}
|
|
2804
2973
|
/**
|
|
2805
2974
|
* Resolve the user-authored LLM-calls config to a fully-defaulted shape used
|
|
2806
2975
|
* by the UI to derive the LLM calls tab.
|
|
@@ -2820,14 +2989,8 @@ function resolveLlmCallsConfig(input) {
|
|
|
2820
2989
|
...DEFAULT_LLM_CALLS_CONFIG.attributes,
|
|
2821
2990
|
...input?.attributes
|
|
2822
2991
|
},
|
|
2823
|
-
|
|
2824
|
-
|
|
2825
|
-
tooltip: m.tooltip,
|
|
2826
|
-
path: m.path,
|
|
2827
|
-
format: m.format ?? "string",
|
|
2828
|
-
numberFormat: m.numberFormat,
|
|
2829
|
-
placements: m.placements ? [...m.placements] : ["body"]
|
|
2830
|
-
})),
|
|
2992
|
+
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
2993
|
+
metrics: (input?.metrics ?? []).map(resolveLlmCallMetric),
|
|
2831
2994
|
pricing: (input?.pricing ?? []).map((p) => ({
|
|
2832
2995
|
model: p.model,
|
|
2833
2996
|
provider: p.provider,
|
|
@@ -2857,14 +3020,8 @@ function resolveApiCallsConfig(input) {
|
|
|
2857
3020
|
...DEFAULT_API_CALLS_CONFIG.attributes,
|
|
2858
3021
|
...input?.attributes
|
|
2859
3022
|
},
|
|
2860
|
-
|
|
2861
|
-
|
|
2862
|
-
tooltip: m.tooltip,
|
|
2863
|
-
path: m.path,
|
|
2864
|
-
format: m.format ?? "string",
|
|
2865
|
-
numberFormat: m.numberFormat,
|
|
2866
|
-
placements: m.placements ? [...m.placements] : ["body"]
|
|
2867
|
-
}))
|
|
3023
|
+
derivedAttributes: resolveDerivedAttributes(input?.derivedAttributes),
|
|
3024
|
+
metrics: (input?.metrics ?? []).map(resolveApiCallMetric)
|
|
2868
3025
|
};
|
|
2869
3026
|
}
|
|
2870
3027
|
/** Zod schema for validating `agent-evals.config.ts` input. */
|
|
@@ -3084,7 +3241,7 @@ function getEvalTitle(evalLike) {
|
|
|
3084
3241
|
}
|
|
3085
3242
|
//#endregion
|
|
3086
3243
|
//#region ../shared/src/utils/getNestedAttribute.ts
|
|
3087
|
-
function isRecord$
|
|
3244
|
+
/**
 * Whether `value` is a non-null object (arrays included).
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord$4(value) {
	if (value === null) return false;
	return typeof value === "object";
}
|
|
3090
3247
|
/**
|
|
@@ -3099,12 +3256,84 @@ function getNestedAttribute(value, path) {
|
|
|
3099
3256
|
const parts = path.split(".");
|
|
3100
3257
|
let current = value;
|
|
3101
3258
|
for (const part of parts) {
|
|
3102
|
-
if (!isRecord$
|
|
3259
|
+
if (!isRecord$4(current) || !(part in current)) return;
|
|
3103
3260
|
current = current[part];
|
|
3104
3261
|
}
|
|
3105
3262
|
return current;
|
|
3106
3263
|
}
|
|
3107
3264
|
//#endregion
|
|
3265
|
+
//#region ../shared/src/utils/deriveCallAttributes.ts
|
|
3266
|
+
/**
 * Whether `value` is a non-null object (arrays included).
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord$3(value) {
	return value !== null && typeof value === "object";
}
|
|
3269
|
+
/**
 * Return a copy of `value` with `attributeValue` written at the dot-path
 * `path`.
 *
 * The root and every object along the path are shallow-copied, so the input
 * is not mutated. A path segment whose current value is not a non-null
 * object is replaced by a fresh empty object.
 *
 * @param {object | undefined} value - Existing attributes, if any.
 * @param {string} path - Dot-separated path of the attribute to write.
 * @param {unknown} attributeValue - Value stored at the final path segment.
 * @returns {object} New root object containing the written value.
 */
function mergeNestedAttribute$1(value, path, attributeValue) {
	const root = value === void 0 ? {} : { ...value };
	const segments = path.split(".");
	// split() always yields at least one segment, so the leaf key is defined.
	const leafKey = segments.pop();
	let cursor = root;
	for (const segment of segments) {
		const existing = cursor[segment];
		const child = existing !== null && typeof existing === "object" ? { ...existing } : {};
		cursor[segment] = child;
		cursor = child;
	}
	cursor[leafKey] = attributeValue;
	return root;
}
|
|
3285
|
+
/**
 * Apply a list of derived attributes to one span.
 *
 * Each `compute` receives the attributes accumulated so far, a copy of the
 * span carrying those attributes, and a `get(path)` dot-path reader. A
 * `compute` that throws or returns `undefined` leaves the attributes
 * untouched; any other result is merged in at the entry's `path`.
 *
 * @param {{ span: object, derivedAttributes: { path: string, compute?: Function }[] }} params
 * @returns {object} The original span when nothing was written, otherwise a copy with the new attributes.
 */
function applyDerivedAttributesForKind(params) {
	let attributes = params.span.attributes;
	// Failures in user-supplied compute functions are treated as "no value".
	const computeOrSkip = (derivedAttribute) => {
		const span = {
			...params.span,
			attributes
		};
		try {
			return derivedAttribute.compute({
				attributes,
				span,
				get: (path) => getNestedAttribute(attributes, path)
			});
		} catch {
			return void 0;
		}
	};
	for (const derivedAttribute of params.derivedAttributes) {
		if (derivedAttribute.compute === void 0) continue;
		const computed = computeOrSkip(derivedAttribute);
		if (computed !== void 0) attributes = mergeNestedAttribute$1(attributes, derivedAttribute.path, computed);
	}
	return attributes === params.span.attributes ? params.span : {
		...params.span,
		attributes
	};
}
|
|
3313
|
+
/**
 * Return the spans with configured derived attributes applied.
 *
 * A span whose `kind` is listed in `llmCallsConfig.kinds` gets the LLM
 * derived attributes. A span whose `kind` is listed in `apiCallsConfig.kinds`
 * then gets the API derived attributes, on top of the LLM result if both
 * match. Spans matching neither list are returned as-is.
 *
 * @param {{ spans: object[], llmCallsConfig: object, apiCallsConfig: object }} params
 * @returns {object[]} New array with one entry per input span.
 */
function applyDerivedCallAttributes(params) {
	const { llmCallsConfig, apiCallsConfig } = params;
	const llmKinds = new Set(llmCallsConfig.kinds);
	const apiKinds = new Set(apiCallsConfig.kinds);
	return params.spans.map((span) => {
		const afterLlm = llmKinds.has(span.kind) ? applyDerivedAttributesForKind({
			span,
			derivedAttributes: llmCallsConfig.derivedAttributes
		}) : span;
		if (!apiKinds.has(span.kind)) return afterLlm;
		return applyDerivedAttributesForKind({
			span: afterLlm,
			derivedAttributes: apiCallsConfig.derivedAttributes
		});
	});
}
|
|
3336
|
+
//#endregion
|
|
3108
3337
|
//#region ../shared/src/utils/extractLlmCalls.ts
|
|
3109
3338
|
function readNumber$2(attributes, path) {
|
|
3110
3339
|
const raw = getNestedAttribute(attributes, path);
|
|
@@ -5235,9 +5464,16 @@ async function runCase(params) {
|
|
|
5235
5464
|
mode: cacheMode,
|
|
5236
5465
|
evalId,
|
|
5237
5466
|
codeFingerprint
|
|
5238
|
-
} : void 0
|
|
5467
|
+
} : void 0,
|
|
5468
|
+
startTime: evalDef.startTime,
|
|
5469
|
+
freezeTime: evalDef.freezeTime
|
|
5239
5470
|
});
|
|
5240
|
-
const
|
|
5471
|
+
const spansWithDerivedAttributes = applyDerivedCallAttributes({
|
|
5472
|
+
spans: scope.spans,
|
|
5473
|
+
llmCallsConfig,
|
|
5474
|
+
apiCallsConfig
|
|
5475
|
+
});
|
|
5476
|
+
const traceTree = buildTraceTree(spansWithDerivedAttributes, scope.checkpoints);
|
|
5241
5477
|
const nonAssertError = executeError && !(executeError instanceof EvalAssertionError) ? executeError : null;
|
|
5242
5478
|
if (executeError instanceof EvalAssertionError && scope.assertionFailures.length === 0) scope.assertionFailures.push(toAssertionFailure(executeError.message, executeError));
|
|
5243
5479
|
if (!nonAssertError && evalDef.deriveFromTracing) {
|
|
@@ -5259,7 +5495,7 @@ async function runCase(params) {
|
|
|
5259
5495
|
}
|
|
5260
5496
|
if (!nonAssertError) addDefaultOutputs({
|
|
5261
5497
|
outputs: scope.outputs,
|
|
5262
|
-
spans:
|
|
5498
|
+
spans: spansWithDerivedAttributes,
|
|
5263
5499
|
llmCallsConfig,
|
|
5264
5500
|
apiCallsConfig,
|
|
5265
5501
|
globalRemove: globalRemoveDefaultConfig,
|
|
@@ -5276,6 +5512,7 @@ async function runCase(params) {
|
|
|
5276
5512
|
}
|
|
5277
5513
|
const scoreResults = /* @__PURE__ */ new Map();
|
|
5278
5514
|
const scoringTraces = {};
|
|
5515
|
+
const scoreStartTime = getEvalClockStateTimeMs(scope.evalClockState) ?? evalDef.startTime;
|
|
5279
5516
|
if (!nonAssertError && scope.assertionFailures.length === 0 && evalDef.scores) for (const [key, def] of Object.entries(evalDef.scores)) {
|
|
5280
5517
|
const { compute, passThreshold, label } = normalizeScoreDef(def);
|
|
5281
5518
|
const scoreRun = await runInEvalScope(evalCase.id, async () => {
|
|
@@ -5295,7 +5532,9 @@ async function runCase(params) {
|
|
|
5295
5532
|
mode: cacheMode,
|
|
5296
5533
|
evalId: `${evalId}__score__${key}`,
|
|
5297
5534
|
codeFingerprint
|
|
5298
|
-
} : void 0
|
|
5535
|
+
} : void 0,
|
|
5536
|
+
startTime: scoreStartTime,
|
|
5537
|
+
freezeTime: evalDef.freezeTime
|
|
5299
5538
|
});
|
|
5300
5539
|
const { trace, traceDisplay } = resolveTracePresentation(scoreRun.scope.spans, globalTraceDisplay, evalDef.traceDisplay);
|
|
5301
5540
|
scope.logs.push(...scoreRun.scope.logs.map((entry) => ({
|
|
@@ -5344,7 +5583,7 @@ async function runCase(params) {
|
|
|
5344
5583
|
}
|
|
5345
5584
|
}
|
|
5346
5585
|
const status = nonAssertError ? "error" : passed ? "pass" : "fail";
|
|
5347
|
-
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(
|
|
5586
|
+
const { trace: displayTrace, traceDisplay } = resolveTracePresentation(spansWithDerivedAttributes, globalTraceDisplay, evalDef.traceDisplay);
|
|
5348
5587
|
const columns = {};
|
|
5349
5588
|
const columnOverrides = mergeDefaultColumns({
|
|
5350
5589
|
columns: evalDef.columns,
|
|
@@ -5608,7 +5847,7 @@ async function executeRun({ runState, request, runDir, config, evals, cacheStore
|
|
|
5608
5847
|
await runInEvalRuntimeScope("cases", async () => {
|
|
5609
5848
|
await entry.use(async (evalDef) => {
|
|
5610
5849
|
const cases = filterEvalCases(resolveRunnableEvalCases({
|
|
5611
|
-
cases: typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [],
|
|
5850
|
+
cases: await runWithEvalClock(evalDef.startTime, async () => typeof evalDef.cases === "function" ? await evalDef.cases() : evalDef.cases ?? [], { freezeTime: evalDef.freezeTime }),
|
|
5612
5851
|
evalId: evalMeta.id
|
|
5613
5852
|
}), request.target.evalIds, request.target.caseIds, evalMeta.id);
|
|
5614
5853
|
runState.summary.totalCases += cases.length;
|
|
@@ -5811,4 +6050,4 @@ function toLastRunStatus(status) {
|
|
|
5811
6050
|
return status === "pending" ? null : status;
|
|
5812
6051
|
}
|
|
5813
6052
|
//#endregion
|
|
5814
|
-
export {
|
|
6053
|
+
export { llmCallsConfigSchema as $, columnFormatSchema as $t, extractApiCalls as A, runInEvalRuntimeScope as An, cacheEntryWithDebugKeySchema as At, runSummarySchema as B, traceCacheRefSchema as Bt, validateCharts as C, getCurrentScope as Cn, evalChartMetricSchema as Ct, sseEnvelopeSchema as D, isInEvalScope as Dn, cacheDebugKeyEntrySchema as Dt, updateManualScoreRequestSchema as E, incrementEvalOutput as En, evalChartsConfigSchema as Et, getEvalDisplayStatus as F, startEvalBackgroundJob as Fn, cacheRecordingOpSchema as Ft, apiCallMetricPlacementSchema as G, traceDisplayConfigSchema as Gt, DEFAULT_LLM_CALLS_CONFIG as H, traceAttributeDisplayInputSchema as Ht, deriveScopedSummaryFromCases as I, repoFile as In, cacheRecordingSchema as It, defaultConfigKeySchema as J, traceSpanKindSchema as Jt, apiCallMetricSchema as K, traceDisplayInputConfigSchema as Kt, deriveStatusFromCaseRows as L, defineEval as Ln, cacheStatusSchema as Lt, applyDerivedCallAttributes as M, runInExistingEvalScope as Mn, cacheListItemSchema as Mt, getNestedAttribute as N, setEvalOutput as Nn, cacheModeSchema as Nt, extractCacheEntries as O, mergeEvalOutput as On, cacheDebugKeyFileSchema as Ot, getEvalTitle as P, setScopeCacheContext as Pn, cacheOperationTypeSchema as Pt, llmCallPricingSchema as Q, columnDefSchema as Qt, deriveStatusFromChildStatuses as R, getEvalRegistry as Rn, serializedCacheSpanSchema as Rt, normalizeScoreDef as S, evalLog as Sn, evalChartConfigSchema as St, createRunRequestSchema as T, getEvalStartTime as Tn, evalChartTypeSchema as Tt, agentEvalsConfigSchema as U, traceAttributeDisplayPlacementSchema as Ut, DEFAULT_API_CALLS_CONFIG as V, traceAttributeDisplayFormatSchema as Vt, apiCallMetricFormatSchema as W, traceAttributeDisplaySchema as Wt, llmCallMetricPlacementSchema as X, traceSpanWarningSchema as Xt, llmCallMetricFormatSchema as Y, traceSpanSchema as Yt, llmCallMetricSchema as Z, cellValueSchema as Zt, loadEvalModule as _, EvalAssertionError as _n, scoreTraceSchema as _t, 
loadPersistedRunSnapshot as a, runArtifactRefSchema as an, assertionFailureSchema as at, loadConfig as b, configureEvalRunLogs as bn, evalChartBuiltinMetricSchema as bt, persistCaseDetail as c, captureEvalSpanError as cn, evalFreshnessStatusSchema as ct, recomputePersistedCaseStatus as d, hashCacheKey as dn, evalStatsConfigSchema as dt, columnKindSchema as en, removeDefaultConfigSchema as et, runTouchesEval as f, hashCacheKeySync as fn, evalSummarySchema as ft, setLatestRunInfoMap as g, serializeCacheValue as gn, runLogPhaseSchema as gt, getTargetEvalIds as h, serializeCacheRecording as hn, runLogLocationSchema as ht, getLatestRunInfos as i, repoFileRefSchema as in, trialSelectionModeSchema as it, extractLlmCalls as j, runInEvalScope as jn, cacheFileSchema as jt, extractCacheHits as k, nextEvalId as kn, cacheEntrySchema as kt, persistRunState as l, evalSpan as ln, evalStatAggregateSchema as lt, buildEvalSummary as m, deserializeCacheValue as mn, runLogLevelSchema as mt, generateRunId as n, jsonCellSchema as nn, resolveLlmCallsConfig as nt, loadPersistedRunSnapshots as o, z$1 as on, caseDetailSchema as ot, resolveArtifactPath as p, deserializeCacheRecording as pn, runLogEntrySchema as pt, apiCallsConfigSchema as q, traceSpanErrorSchema as qt, getLastRunStatuses as r, numberDisplayOptionsSchema as rn, runLogsConfigSchema as rt, nextShortIdFromSnapshots as s, buildTraceTree as sn, caseRowSchema as st, executeRun as t, fileRefSchema as tn, resolveApiCallsConfig as tt, recomputeEvalStatusesInRuns as u, evalTracer as un, evalStatItemSchema as ut, parseEvalMetas as v, advanceEvalTime as vn, evalChartAggregateSchema as vt, createFsCacheStore as w, getEvalCaseInput as wn, evalChartTooltipExtraSchema as wt, buildDeclaredColumnDefs as x, evalAssert as xn, evalChartColorSchema as xt, resolveEvalDefaultConfig as y, appendToEvalOutput as yn, evalChartAxisSchema as yt, runManifestSchema as z, spanCacheOptionsSchema as zt };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-Be0x8CS3.mjs";
|
|
2
|
+
import "./src-D6cettg0.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-B4SosWgD.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.24.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -59,8 +59,8 @@
|
|
|
59
59
|
"@types/node": "^24.7.2",
|
|
60
60
|
"typescript": "^5.9.2",
|
|
61
61
|
"@agent-evals/runner": "0.0.1",
|
|
62
|
-
"@agent-evals/
|
|
63
|
-
"@agent-evals/
|
|
62
|
+
"@agent-evals/shared": "0.0.1",
|
|
63
|
+
"@agent-evals/sdk": "0.0.1"
|
|
64
64
|
},
|
|
65
65
|
"scripts": {
|
|
66
66
|
"build": "pnpm --filter @agent-evals/web build && tsdown",
|
|
@@ -156,6 +156,19 @@ for settlement; promise and span errors keep their normal behavior. Use
|
|
|
156
156
|
`waitForBackgroundJob: false` on a span, or `waitForBackgroundJobs: false` on an
|
|
157
157
|
eval definition, when background work should not delay finalization.
|
|
158
158
|
|
|
159
|
+
Eval Date APIs use a shifted wall clock by default: `new Date()` and
|
|
160
|
+
`Date.now()` start at `2026-04-10T00:00:00.000Z` during case generation,
|
|
161
|
+
execution, tracing, derived outputs, and scorers, then continue advancing with
|
|
162
|
+
real elapsed time. Set `startTime` on a specific `defineEval(...)` to use
|
|
163
|
+
another initial clock value, or set `startTime: 'now'` for that eval to use the
|
|
164
|
+
real current clock. Timers are not faked, so async waits still run normally.
|
|
165
|
+
Set `freezeTime: true` to freeze Date APIs until the clock is moved manually.
|
|
166
|
+
Use `getEvalStartTime()` to read the captured wall-clock start as a `Date`.
|
|
167
|
+
Use `advanceEvalTime(unit, amount)` inside an eval to move the shifted clock
|
|
168
|
+
forward; supported units are `millisecond(s)`, `second(s)`, `minute(s)`,
|
|
169
|
+
`hour(s)`, and `day(s)`. It throws for evals with `startTime: 'now'`, unless
|
|
170
|
+
`freezeTime: true` is also set.
|
|
171
|
+
|
|
159
172
|
For libraries or observability exporters that already emit span lifecycle
|
|
160
173
|
events, use `evalTracer.startSpan(...)`, `evalTracer.updateSpan(...)`,
|
|
161
174
|
`evalTracer.endSpan(...)`, or `evalTracer.recordSpan(...)` to translate those
|
|
@@ -261,10 +274,12 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
261
274
|
attribute paths. `latencyMs` is time to first token; duration, total tokens,
|
|
262
275
|
tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter,
|
|
263
276
|
override `attributes.<field>` for non-default primitive span shapes, configure
|
|
264
|
-
`pricing` to derive USD costs from token counts by model/provider,
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
'
|
|
277
|
+
`pricing` to derive USD costs from token counts by model/provider, add
|
|
278
|
+
`derivedAttributes` to persist computed values back onto matching LLM spans
|
|
279
|
+
before trace consumers run, and add entries to `metrics` to surface arbitrary
|
|
280
|
+
user metrics (`format: 'string' | 'number' | 'duration' | 'json' |
|
|
281
|
+
'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` keys are
|
|
282
|
+
dot-paths under `span.attributes`; return `undefined` to skip one span.
|
|
268
283
|
- Default usage config derives missing eval outputs from matching LLM/API spans
|
|
269
284
|
before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`,
|
|
270
285
|
`inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`,
|
|
@@ -285,7 +300,8 @@ cacheCreationInputTokens` so cache details are not double-counted.
|
|
|
285
300
|
and `'fetch'` spans with `method`, `url`, `statusCode`, `request`,
|
|
286
301
|
`response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and
|
|
287
302
|
`error` read from conventional attribute paths. Override `kinds` or
|
|
288
|
-
`attributes.<field>` for external tracers,
|
|
303
|
+
`attributes.<field>` for external tracers, add `derivedAttributes` for
|
|
304
|
+
computed persisted API span attributes, and add `metrics` with the same
|
|
289
305
|
formats and placements as LLM-call metrics.
|
|
290
306
|
- `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
|
|
291
307
|
`runLogs: { captureConsole: false }` to keep console output in the terminal
|
package/dist/src-D-HuV8I-.mjs
DELETED