@ls-stack/agent-eval 0.47.0 → 0.51.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BZmhhSFZ.mjs → app-CzLj4ZX0.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-C5SveD-X.css +1 -0
- package/dist/apps/web/dist/assets/index-DwgyYZgf.js +373 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-vdJYkEVk.mjs → cli-Cvs7tc2v.mjs} +3 -3
- package/dist/index.d.mts +90 -49
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-BFdxG9ws.mjs → runOrchestration-o38J7uZO.mjs} +161 -110
- package/dist/{runner-DJWn_7p0.mjs → runner-LdMiDmAN.mjs} +2 -2
- package/dist/{runner--aH0jO4Z.mjs → runner-iWtmKx9z.mjs} +1 -1
- package/dist/{src-BRqs3kSA.mjs → src-Jahivm6d.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +8 -2
- package/dist/apps/web/dist/assets/index-B5JrV3_C.css +0 -1
- package/dist/apps/web/dist/assets/index-DB61h-lP.js +0 -369
|
@@ -621,6 +621,8 @@ const hideIfNoValueShape = {
|
|
|
621
621
|
hideIfNoValue: z.boolean().optional() };
|
|
622
622
|
/**
|
|
623
623
|
* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
|
|
624
|
+
* `cacheHits` counts Agent Eval operation-level cache hits from spans and
|
|
625
|
+
* `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
|
|
624
626
|
* `column` aggregates a score or numeric output column across the latest run.
|
|
625
627
|
*/
|
|
626
628
|
const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
@@ -637,6 +639,10 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
637
639
|
kind: z.literal("duration"),
|
|
638
640
|
...hideIfNoValueShape
|
|
639
641
|
}),
|
|
642
|
+
z.object({
|
|
643
|
+
kind: z.literal("cacheHits"),
|
|
644
|
+
...hideIfNoValueShape
|
|
645
|
+
}),
|
|
640
646
|
z.object({
|
|
641
647
|
kind: z.literal("column"),
|
|
642
648
|
key: z.string(),
|
|
@@ -731,6 +737,23 @@ const caseRowSchema = z.object({
|
|
|
731
737
|
]),
|
|
732
738
|
/** Elapsed case execution duration in milliseconds, or null before completion. */
|
|
733
739
|
durationMs: z.number().nullable(),
|
|
740
|
+
/**
|
|
741
|
+
* Agent Eval operation-level cache hits recorded for this case.
|
|
742
|
+
*
|
|
743
|
+
* This counts persisted operation cache hits from spans and
|
|
744
|
+
* `evalTracer.cache(...)` refs. It does not count LLM provider prompt-cache
|
|
745
|
+
* read tokens such as `cachedInputTokens`. Older run artifacts may omit it
|
|
746
|
+
* and should be treated as zero by aggregate readers.
|
|
747
|
+
*/
|
|
748
|
+
cacheHits: z.number().optional(),
|
|
749
|
+
/**
|
|
750
|
+
* Agent Eval operation-level cache activity entries recorded for this case.
|
|
751
|
+
*
|
|
752
|
+
* This is the denominator for `cacheHits`, counting hits plus misses and
|
|
753
|
+
* refreshes that appear in the Cache tab. Older run artifacts may omit it
|
|
754
|
+
* and should be treated as zero by aggregate readers.
|
|
755
|
+
*/
|
|
756
|
+
cacheOperations: z.number().optional(),
|
|
734
757
|
costUsd: z.number().nullable().optional(),
|
|
735
758
|
columns: z.record(z.string(), cellValueSchema),
|
|
736
759
|
/** Winning trial index for the persisted case result. */
|
|
@@ -771,7 +794,13 @@ const runLogLocationSchema = z.object({
|
|
|
771
794
|
/** 1-based source line reported by the JavaScript stack frame. */
|
|
772
795
|
line: z.number(),
|
|
773
796
|
/** 1-based source column reported by the JavaScript stack frame. */
|
|
774
|
-
column: z.number()
|
|
797
|
+
column: z.number(),
|
|
798
|
+
/**
|
|
799
|
+
* Full JavaScript stack captured when the log was emitted.
|
|
800
|
+
*
|
|
801
|
+
* Older run artifacts may only include the primary file, line, and column.
|
|
802
|
+
*/
|
|
803
|
+
stack: z.string().optional()
|
|
775
804
|
});
|
|
776
805
|
/** Schema for one persisted log entry captured during a case run. */
|
|
777
806
|
const runLogEntrySchema = z.object({
|
|
@@ -1692,6 +1721,8 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1692
1721
|
let runningCases = 0;
|
|
1693
1722
|
let totalDurationMs = 0;
|
|
1694
1723
|
let hasDuration = false;
|
|
1724
|
+
let cacheHits = 0;
|
|
1725
|
+
let cacheOperations = 0;
|
|
1695
1726
|
for (const caseRow of caseRows) {
|
|
1696
1727
|
if (caseRow.status === "pass") passedCases += 1;
|
|
1697
1728
|
else if (caseRow.status === "fail") failedCases += 1;
|
|
@@ -1703,6 +1734,8 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1703
1734
|
totalDurationMs += caseRow.durationMs;
|
|
1704
1735
|
hasDuration = true;
|
|
1705
1736
|
}
|
|
1737
|
+
cacheHits += caseRow.cacheHits ?? 0;
|
|
1738
|
+
cacheOperations += caseRow.cacheOperations ?? 0;
|
|
1706
1739
|
}
|
|
1707
1740
|
return {
|
|
1708
1741
|
status: deriveStatusFromCaseRows({
|
|
@@ -1716,7 +1749,9 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1716
1749
|
cancelledCases,
|
|
1717
1750
|
pendingCases,
|
|
1718
1751
|
runningCases,
|
|
1719
|
-
totalDurationMs: hasDuration ? totalDurationMs : null
|
|
1752
|
+
totalDurationMs: hasDuration ? totalDurationMs : null,
|
|
1753
|
+
cacheHits,
|
|
1754
|
+
cacheOperations
|
|
1720
1755
|
};
|
|
1721
1756
|
}
|
|
1722
1757
|
//#endregion
|
|
@@ -2508,6 +2543,7 @@ function stripTerminalControlCodes$1(value) {
|
|
|
2508
2543
|
const scopeStorage = new AsyncLocalStorage();
|
|
2509
2544
|
const runtimeScopeStorage = new AsyncLocalStorage();
|
|
2510
2545
|
const evalClockStorage = new AsyncLocalStorage();
|
|
2546
|
+
const activeSpanStackStorage = new AsyncLocalStorage();
|
|
2511
2547
|
let activeEvalScopeCount = 0;
|
|
2512
2548
|
let activeEvalRuntimeScopeCount = 0;
|
|
2513
2549
|
let consoleCaptureEnabled = true;
|
|
@@ -2650,6 +2686,16 @@ function getCurrentScope() {
|
|
|
2650
2686
|
if (activeEvalScopeCount === 0) return void 0;
|
|
2651
2687
|
return scopeStorage.getStore();
|
|
2652
2688
|
}
|
|
2689
|
+
/** Return the span currently active in this async execution, if any. */
|
|
2690
|
+
function getCurrentActiveSpan() {
|
|
2691
|
+
if (activeEvalScopeCount === 0) return void 0;
|
|
2692
|
+
return activeSpanStackStorage.getStore()?.at(-1);
|
|
2693
|
+
}
|
|
2694
|
+
/** Execute a callback with a span added to this async execution's active stack. */
|
|
2695
|
+
async function runWithActiveSpan(span, fn) {
|
|
2696
|
+
const currentStack = activeSpanStackStorage.getStore() ?? [];
|
|
2697
|
+
return await activeSpanStackStorage.run([...currentStack, span], fn);
|
|
2698
|
+
}
|
|
2653
2699
|
/**
|
|
2654
2700
|
* Return the current eval runner phase for this async execution.
|
|
2655
2701
|
*
|
|
@@ -2787,7 +2833,8 @@ function normalizeStackFile(value) {
|
|
|
2787
2833
|
return decodeURIComponent(value.replace(fileUrlPrefixPattern, ""));
|
|
2788
2834
|
}
|
|
2789
2835
|
function isInternalLogFrame(file) {
|
|
2790
|
-
|
|
2836
|
+
const normalizedFile = file.replaceAll("\\", "/");
|
|
2837
|
+
return normalizedFile.includes("/packages/sdk/src/runtime.ts") || normalizedFile.includes("/packages/sdk/dist/") || normalizedFile.includes("/node_modules/@agent-evals/sdk/dist/") || normalizedFile.includes("/node_modules/@ls-stack/agent-eval/dist/") || normalizedFile.includes("/node:internal/") || normalizedFile.startsWith("node:internal/");
|
|
2791
2838
|
}
|
|
2792
2839
|
function parseStackFrameLocation(line) {
|
|
2793
2840
|
const match = stackFrameLocationPattern.exec(line.trim());
|
|
@@ -2808,7 +2855,10 @@ function getLogLocation() {
|
|
|
2808
2855
|
for (const line of stack.split("\n").slice(1)) {
|
|
2809
2856
|
const location = parseStackFrameLocation(line);
|
|
2810
2857
|
if (location === null || isInternalLogFrame(location.file)) continue;
|
|
2811
|
-
return
|
|
2858
|
+
return {
|
|
2859
|
+
...location,
|
|
2860
|
+
stack
|
|
2861
|
+
};
|
|
2812
2862
|
}
|
|
2813
2863
|
}
|
|
2814
2864
|
function recordEvalLog(level, args) {
|
|
@@ -2951,8 +3001,6 @@ async function runInEvalScope(caseId, fn, options = {}) {
|
|
|
2951
3001
|
logs: [],
|
|
2952
3002
|
spans: [],
|
|
2953
3003
|
checkpoints: /* @__PURE__ */ new Map(),
|
|
2954
|
-
spanStack: [],
|
|
2955
|
-
activeSpanStack: [],
|
|
2956
3004
|
recordingStack: [],
|
|
2957
3005
|
replayingDepth: 0,
|
|
2958
3006
|
cacheContext: options.cacheContext,
|
|
@@ -4185,7 +4233,7 @@ function createTraceCache(generateSpanId) {
|
|
|
4185
4233
|
namespace,
|
|
4186
4234
|
key: info.key
|
|
4187
4235
|
}, { serializeFileBytes: info.serializeFileBytes === true });
|
|
4188
|
-
const activeSpan =
|
|
4236
|
+
const activeSpan = getCurrentActiveSpan();
|
|
4189
4237
|
const canRead = cacheCtx.mode === "use" && cacheCtx.read !== false;
|
|
4190
4238
|
const canStore = cacheCtx.mode !== "bypass" && cacheCtx.store !== false;
|
|
4191
4239
|
if (canRead) {
|
|
@@ -4283,7 +4331,7 @@ function generateSpanId() {
|
|
|
4283
4331
|
return `span_${String(Date.now())}_${String(spanIdCounter)}`;
|
|
4284
4332
|
}
|
|
4285
4333
|
function updateCurrentSpan(update) {
|
|
4286
|
-
const currentSpan =
|
|
4334
|
+
const currentSpan = getCurrentActiveSpan();
|
|
4287
4335
|
if (!currentSpan) return;
|
|
4288
4336
|
update(currentSpan);
|
|
4289
4337
|
}
|
|
@@ -4454,9 +4502,9 @@ function toIsoTimestamp(value) {
|
|
|
4454
4502
|
function findSpan(scope, id) {
|
|
4455
4503
|
return scope.spans.find((span) => span.id === id);
|
|
4456
4504
|
}
|
|
4457
|
-
function resolveExternalParentId(
|
|
4505
|
+
function resolveExternalParentId(parentId) {
|
|
4458
4506
|
if (parentId !== void 0) return parentId;
|
|
4459
|
-
return
|
|
4507
|
+
return getCurrentActiveSpan()?.id ?? null;
|
|
4460
4508
|
}
|
|
4461
4509
|
function startExternalSpan(info) {
|
|
4462
4510
|
const id = info.id ?? generateSpanId();
|
|
@@ -4464,7 +4512,7 @@ function startExternalSpan(info) {
|
|
|
4464
4512
|
if (!scope) return noopExternalSpan(id);
|
|
4465
4513
|
const existing = findSpan(scope, id);
|
|
4466
4514
|
if (existing) {
|
|
4467
|
-
existing.parentId = resolveExternalParentId(
|
|
4515
|
+
existing.parentId = resolveExternalParentId(info.parentId);
|
|
4468
4516
|
existing.kind = info.kind;
|
|
4469
4517
|
existing.name = info.name;
|
|
4470
4518
|
existing.startedAt = toIsoTimestamp(info.startedAt);
|
|
@@ -4475,7 +4523,7 @@ function startExternalSpan(info) {
|
|
|
4475
4523
|
}
|
|
4476
4524
|
scope.spans.push({
|
|
4477
4525
|
id,
|
|
4478
|
-
parentId: resolveExternalParentId(
|
|
4526
|
+
parentId: resolveExternalParentId(info.parentId),
|
|
4479
4527
|
caseId: scope.caseId,
|
|
4480
4528
|
kind: info.kind,
|
|
4481
4529
|
name: info.name,
|
|
@@ -4516,7 +4564,7 @@ function recordExternalSpan(info) {
|
|
|
4516
4564
|
const existing = findSpan(scope, id);
|
|
4517
4565
|
const status = info.status ?? (info.error ? "error" : "ok");
|
|
4518
4566
|
if (existing) {
|
|
4519
|
-
existing.parentId = resolveExternalParentId(
|
|
4567
|
+
existing.parentId = resolveExternalParentId(info.parentId);
|
|
4520
4568
|
existing.kind = info.kind;
|
|
4521
4569
|
existing.name = info.name;
|
|
4522
4570
|
existing.startedAt = startedAt;
|
|
@@ -4530,7 +4578,7 @@ function recordExternalSpan(info) {
|
|
|
4530
4578
|
}
|
|
4531
4579
|
scope.spans.push({
|
|
4532
4580
|
id,
|
|
4533
|
-
parentId: resolveExternalParentId(
|
|
4581
|
+
parentId: resolveExternalParentId(info.parentId),
|
|
4534
4582
|
caseId: scope.caseId,
|
|
4535
4583
|
kind: info.kind,
|
|
4536
4584
|
name: info.name,
|
|
@@ -4613,7 +4661,7 @@ async function traceSpanInternal(info, fn) {
|
|
|
4613
4661
|
const scope = getCurrentScope();
|
|
4614
4662
|
if (!scope) return await fn(noopActiveSpan());
|
|
4615
4663
|
const id = generateSpanId();
|
|
4616
|
-
const parentId =
|
|
4664
|
+
const parentId = getCurrentActiveSpan()?.id ?? null;
|
|
4617
4665
|
const realStartedAt = getRealDateNowMs();
|
|
4618
4666
|
const spanRecord = {
|
|
4619
4667
|
id,
|
|
@@ -4627,109 +4675,106 @@ async function traceSpanInternal(info, fn) {
|
|
|
4627
4675
|
attributes: info.attributes
|
|
4628
4676
|
};
|
|
4629
4677
|
scope.spans.push(spanRecord);
|
|
4630
|
-
scope.spanStack.push(id);
|
|
4631
|
-
scope.activeSpanStack.push(spanRecord);
|
|
4632
4678
|
const activeSpan = createSpanHandle(spanRecord);
|
|
4633
|
-
|
|
4634
|
-
|
|
4635
|
-
|
|
4636
|
-
|
|
4637
|
-
|
|
4638
|
-
|
|
4639
|
-
|
|
4640
|
-
|
|
4641
|
-
|
|
4642
|
-
|
|
4643
|
-
|
|
4644
|
-
|
|
4645
|
-
|
|
4646
|
-
|
|
4647
|
-
|
|
4648
|
-
|
|
4649
|
-
|
|
4650
|
-
|
|
4651
|
-
|
|
4652
|
-
|
|
4679
|
+
return await runWithActiveSpan(spanRecord, async () => {
|
|
4680
|
+
try {
|
|
4681
|
+
const cacheOpts = info.cache;
|
|
4682
|
+
const cacheCtx = scope.cacheContext;
|
|
4683
|
+
if (cacheOpts !== void 0 && cacheCtx !== void 0 && scope.replayingDepth === 0) {
|
|
4684
|
+
const ctx = cacheCtx;
|
|
4685
|
+
const namespace = getRequiredSpanCacheNamespace(cacheOpts);
|
|
4686
|
+
const keyHash = await hashCacheKey({
|
|
4687
|
+
namespace,
|
|
4688
|
+
key: cacheOpts.key
|
|
4689
|
+
}, { serializeFileBytes: cacheOpts.serializeFileBytes === true });
|
|
4690
|
+
const canRead = ctx.mode === "use" && ctx.read !== false;
|
|
4691
|
+
const canStore = ctx.mode !== "bypass" && ctx.store !== false;
|
|
4692
|
+
mergeSpanAttributes(spanRecord, {
|
|
4693
|
+
"cache.key": keyHash,
|
|
4694
|
+
"cache.namespace": namespace
|
|
4695
|
+
});
|
|
4696
|
+
if (canRead) {
|
|
4697
|
+
const hit = await ctx.adapter.lookup(namespace, keyHash);
|
|
4698
|
+
if (hit) {
|
|
4699
|
+
const storedAt = hit.storedAt;
|
|
4700
|
+
mergeSpanAttributes(spanRecord, {
|
|
4701
|
+
"cache.status": "hit",
|
|
4702
|
+
"cache.storedAt": storedAt,
|
|
4703
|
+
"cache.age": getRealDateNowMs() - new Date(storedAt).getTime()
|
|
4704
|
+
});
|
|
4705
|
+
const recording = deserializeCacheRecording(hit.recording);
|
|
4706
|
+
replayRecording(scope, spanRecord, recording, { generateSpanId });
|
|
4707
|
+
spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
|
|
4708
|
+
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
4709
|
+
return recording.returnValue;
|
|
4710
|
+
}
|
|
4653
4711
|
mergeSpanAttributes(spanRecord, {
|
|
4654
|
-
"cache.status": "
|
|
4655
|
-
"cache.
|
|
4656
|
-
"cache.age": getRealDateNowMs() - new Date(storedAt).getTime()
|
|
4712
|
+
"cache.status": "miss",
|
|
4713
|
+
...canStore ? {} : { "cache.stored": false }
|
|
4657
4714
|
});
|
|
4658
|
-
|
|
4659
|
-
replayRecording(scope, spanRecord, recording, { generateSpanId });
|
|
4660
|
-
spanRecord.status = recording.finalStatus ?? (hasSpanError(spanRecord) ? "error" : "ok");
|
|
4661
|
-
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
4662
|
-
return recording.returnValue;
|
|
4663
|
-
}
|
|
4664
|
-
mergeSpanAttributes(spanRecord, {
|
|
4715
|
+
} else if (ctx.mode === "use" && canStore) mergeSpanAttributes(spanRecord, {
|
|
4665
4716
|
"cache.status": "miss",
|
|
4717
|
+
"cache.read": false
|
|
4718
|
+
});
|
|
4719
|
+
else if (ctx.mode === "refresh") mergeSpanAttributes(spanRecord, {
|
|
4720
|
+
"cache.status": "refresh",
|
|
4666
4721
|
...canStore ? {} : { "cache.stored": false }
|
|
4667
4722
|
});
|
|
4668
|
-
|
|
4669
|
-
|
|
4670
|
-
|
|
4671
|
-
|
|
4672
|
-
|
|
4673
|
-
"cache.status": "refresh",
|
|
4674
|
-
...canStore ? {} : { "cache.stored": false }
|
|
4675
|
-
});
|
|
4676
|
-
else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
|
|
4677
|
-
const frame = {
|
|
4678
|
-
baseSpanIndex: scope.spans.length,
|
|
4679
|
-
replayParentSpanId: id,
|
|
4680
|
-
ops: []
|
|
4681
|
-
};
|
|
4682
|
-
scope.recordingStack.push(frame);
|
|
4683
|
-
let bodyResult;
|
|
4684
|
-
try {
|
|
4685
|
-
bodyResult = await fn(activeSpan);
|
|
4686
|
-
} finally {
|
|
4687
|
-
scope.recordingStack.pop();
|
|
4688
|
-
}
|
|
4689
|
-
appendSubSpanOps(scope, frame);
|
|
4690
|
-
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
4691
|
-
if (canStore) {
|
|
4692
|
-
const recording = {
|
|
4693
|
-
returnValue: bodyResult,
|
|
4694
|
-
finalAttributes: stripCacheAttributes(spanRecord.attributes),
|
|
4695
|
-
finalStatus: spanRecord.status,
|
|
4696
|
-
finalError: spanRecord.error,
|
|
4697
|
-
finalErrors: spanRecord.errors,
|
|
4698
|
-
finalWarning: spanRecord.warning,
|
|
4699
|
-
finalWarnings: spanRecord.warnings,
|
|
4700
|
-
ops: frame.ops
|
|
4701
|
-
};
|
|
4702
|
-
const entry = {
|
|
4703
|
-
version: 1,
|
|
4704
|
-
key: keyHash,
|
|
4705
|
-
namespace,
|
|
4706
|
-
operationType: "span",
|
|
4707
|
-
operationName: info.name,
|
|
4708
|
-
spanName: info.name,
|
|
4709
|
-
spanKind: info.kind,
|
|
4710
|
-
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
4711
|
-
recording: await serializeCacheRecording(recording, { externalJsonStore: ctx.adapter.externalJsonStore })
|
|
4723
|
+
else mergeSpanAttributes(spanRecord, { "cache.status": "bypass" });
|
|
4724
|
+
const frame = {
|
|
4725
|
+
baseSpanIndex: scope.spans.length,
|
|
4726
|
+
replayParentSpanId: id,
|
|
4727
|
+
ops: []
|
|
4712
4728
|
};
|
|
4713
|
-
|
|
4714
|
-
|
|
4715
|
-
|
|
4716
|
-
|
|
4717
|
-
}
|
|
4729
|
+
scope.recordingStack.push(frame);
|
|
4730
|
+
let bodyResult;
|
|
4731
|
+
try {
|
|
4732
|
+
bodyResult = await fn(activeSpan);
|
|
4733
|
+
} finally {
|
|
4734
|
+
scope.recordingStack.pop();
|
|
4735
|
+
}
|
|
4736
|
+
appendSubSpanOps(scope, frame);
|
|
4737
|
+
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
4738
|
+
if (canStore) {
|
|
4739
|
+
const recording = {
|
|
4740
|
+
returnValue: bodyResult,
|
|
4741
|
+
finalAttributes: stripCacheAttributes(spanRecord.attributes),
|
|
4742
|
+
finalStatus: spanRecord.status,
|
|
4743
|
+
finalError: spanRecord.error,
|
|
4744
|
+
finalErrors: spanRecord.errors,
|
|
4745
|
+
finalWarning: spanRecord.warning,
|
|
4746
|
+
finalWarnings: spanRecord.warnings,
|
|
4747
|
+
ops: frame.ops
|
|
4748
|
+
};
|
|
4749
|
+
const entry = {
|
|
4750
|
+
version: 1,
|
|
4751
|
+
key: keyHash,
|
|
4752
|
+
namespace,
|
|
4753
|
+
operationType: "span",
|
|
4754
|
+
operationName: info.name,
|
|
4755
|
+
spanName: info.name,
|
|
4756
|
+
spanKind: info.kind,
|
|
4757
|
+
storedAt: new Date(getRealDateNowMs()).toISOString(),
|
|
4758
|
+
recording: await serializeCacheRecording(recording, { externalJsonStore: ctx.adapter.externalJsonStore })
|
|
4759
|
+
};
|
|
4760
|
+
await ctx.adapter.write(entry, {
|
|
4761
|
+
rawKey: cacheOpts.key,
|
|
4762
|
+
operationType: "span",
|
|
4763
|
+
operationName: info.name
|
|
4764
|
+
});
|
|
4765
|
+
}
|
|
4766
|
+
return bodyResult;
|
|
4718
4767
|
}
|
|
4719
|
-
|
|
4768
|
+
const result = await fn(activeSpan);
|
|
4769
|
+
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
4770
|
+
return result;
|
|
4771
|
+
} catch (error) {
|
|
4772
|
+
spanRecord.status = "error";
|
|
4773
|
+
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
4774
|
+
spanRecord.error = normalizeTraceError(error);
|
|
4775
|
+
throw error;
|
|
4720
4776
|
}
|
|
4721
|
-
|
|
4722
|
-
finishSpanWithoutThrownError(spanRecord, realStartedAt);
|
|
4723
|
-
return result;
|
|
4724
|
-
} catch (error) {
|
|
4725
|
-
spanRecord.status = "error";
|
|
4726
|
-
spanRecord.endedAt = addElapsedMsToTimestamp(spanRecord.startedAt, getRealDateNowMs() - realStartedAt);
|
|
4727
|
-
spanRecord.error = normalizeTraceError(error);
|
|
4728
|
-
throw error;
|
|
4729
|
-
} finally {
|
|
4730
|
-
scope.spanStack.pop();
|
|
4731
|
-
scope.activeSpanStack.pop();
|
|
4732
|
-
}
|
|
4777
|
+
});
|
|
4733
4778
|
}
|
|
4734
4779
|
function getRequiredSpanCacheNamespace(cacheOpts) {
|
|
4735
4780
|
if (!isRecordLike$1(cacheOpts)) throw new Error("Cached spans require a non-empty cache.namespace");
|
|
@@ -4785,7 +4830,7 @@ const evalTracer = {
|
|
|
4785
4830
|
if (!scope) return;
|
|
4786
4831
|
scope.checkpoints.set(name, data);
|
|
4787
4832
|
const id = generateSpanId();
|
|
4788
|
-
const parentId =
|
|
4833
|
+
const parentId = getCurrentActiveSpan()?.id ?? null;
|
|
4789
4834
|
scope.spans.push({
|
|
4790
4835
|
id,
|
|
4791
4836
|
parentId,
|
|
@@ -7169,12 +7214,16 @@ async function runCase(params) {
|
|
|
7169
7214
|
};
|
|
7170
7215
|
if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;
|
|
7171
7216
|
const elapsedMs = Date.now() - startTime;
|
|
7217
|
+
const cacheEntries = extractCacheEntries(displayTrace, scope.caseCacheRefs);
|
|
7218
|
+
const cacheHits = cacheEntries.filter((entry) => entry.status === "hit");
|
|
7172
7219
|
return {
|
|
7173
7220
|
caseDetail,
|
|
7174
7221
|
caseRowUpdate: {
|
|
7175
7222
|
tags: evalCase.tags ?? [],
|
|
7176
7223
|
status,
|
|
7177
7224
|
durationMs: elapsedMs,
|
|
7225
|
+
cacheHits: cacheHits.length,
|
|
7226
|
+
cacheOperations: cacheEntries.length,
|
|
7178
7227
|
columns
|
|
7179
7228
|
}
|
|
7180
7229
|
};
|
|
@@ -7670,6 +7719,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7670
7719
|
tags: caseDetail.tags,
|
|
7671
7720
|
status: caseRowUpdate.status ?? "pending",
|
|
7672
7721
|
durationMs: caseRowUpdate.durationMs ?? null,
|
|
7722
|
+
cacheHits: caseRowUpdate.cacheHits ?? 0,
|
|
7723
|
+
cacheOperations: caseRowUpdate.cacheOperations ?? 0,
|
|
7673
7724
|
columns: caseRowUpdate.columns ?? {},
|
|
7674
7725
|
trial
|
|
7675
7726
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-vdJYkEVk.mjs";
|
|
2
|
-
import "./src-BRqs3kSA.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-Cvs7tc2v.mjs";
|
|
2
|
+
import "./src-Jahivm6d.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-DJWn_7p0.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-LdMiDmAN.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-BFdxG9ws.mjs";
|
|
2
|
-
import "./cli-vdJYkEVk.mjs";
|
|
1
|
+
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-o38J7uZO.mjs";
|
|
2
|
+
import "./cli-Cvs7tc2v.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.47.0",
|
|
3
|
+
"version": "0.51.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -33,8 +33,8 @@
|
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
35
|
"@agent-evals/runner": "0.0.1",
|
|
36
|
-
"@agent-evals/
|
|
37
|
-
"@agent-evals/
|
|
36
|
+
"@agent-evals/shared": "0.0.1",
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -400,11 +400,17 @@ cacheCreationInputTokens` so cache details are not double-counted.
|
|
|
400
400
|
- `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
|
|
401
401
|
`runLogs: { captureConsole: false }` to keep console output in the terminal
|
|
402
402
|
without persisting console calls to case details. Manual `evalLog(...)` calls
|
|
403
|
-
are still captured.
|
|
403
|
+
are still captured. Captured log locations store the selected user-facing
|
|
404
|
+
source frame and the full JavaScript stack so agents can inspect additional
|
|
405
|
+
frames in persisted artifacts when diagnosing where a log came from.
|
|
404
406
|
|
|
405
407
|
Stats rows and history charts can be authored via `stats` / `charts` on the eval
|
|
406
408
|
definition. Global `stats` in `agent-evals.config.ts` combine with eval-level
|
|
407
|
-
stats.
|
|
409
|
+
stats. Native stat kinds include `cases`, `passRate`, `duration`, and
|
|
410
|
+
`cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total
|
|
411
|
+
cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not
|
|
412
|
+
LLM provider prompt-cache read tokens such as `cachedInputTokens`. Usage stats
|
|
413
|
+
and LLM usage charts are added by default unless removed with
|
|
408
414
|
`removeDefaultConfig`. Column stats can override `format` and `numberFormat`,
|
|
409
415
|
otherwise they inherit from the matching column. Number formats use
|
|
410
416
|
`maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing
|