@ls-stack/agent-eval 0.46.0 → 0.50.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-BZmhhSFZ.mjs → app-DR9WPMA4.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-BQY_snr3.css +1 -0
- package/dist/apps/web/dist/assets/index-BkXnL_y8.js +373 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/{cli-vdJYkEVk.mjs → cli-R7_V6YWa.mjs} +3 -3
- package/dist/index.d.mts +80 -37
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +1 -1
- package/dist/{runOrchestration-BFdxG9ws.mjs → runOrchestration-CokPQet7.mjs} +49 -4
- package/dist/{runner--aH0jO4Z.mjs → runner-B8dLVAyM.mjs} +1 -1
- package/dist/{runner-DJWn_7p0.mjs → runner-Coc9wBWz.mjs} +2 -2
- package/dist/{src-BRqs3kSA.mjs → src-B43qR0Ea.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +8 -2
- package/dist/apps/web/dist/assets/index-B5JrV3_C.css +0 -1
- package/dist/apps/web/dist/assets/index-CaU84fHq.js +0 -369
|
@@ -25,8 +25,8 @@
|
|
|
25
25
|
href="https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
|
|
26
26
|
rel="stylesheet"
|
|
27
27
|
/>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-CaU84fHq.js"></script>
|
|
29
|
-
<link rel="stylesheet" crossorigin href="/assets/index-B5JrV3_C.css">
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-BkXnL_y8.js"></script>
|
|
29
|
+
<link rel="stylesheet" crossorigin href="/assets/index-BQY_snr3.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
|
32
32
|
<div id="root"></div>
|
package/dist/bin.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-BFdxG9ws.mjs";
|
|
1
|
+
import { A as validateCharts, At as buildEvalKey, C as deriveEvalFreshness, Ct as getEvalDisplayStatus, D as loadConfig, Dt as runSummarySchema, E as resolveEvalDefaultConfig, Lt as getEvalRegistry, O as buildDeclaredColumnDefs, Ot as resolveApiCallsConfig, S as parseManualInputValues, St as getEvalTitle, T as parseEvalDiscovery, Tt as matchesTagsFilter, _ as recomputePersistedCaseStatus, a as validateTagsFilters, b as resolveArtifactPath, bt as applyDerivedCallAttributes, c as getLastRunStatuses, d as loadPersistedRunSnapshots, f as nextShortIdFromSnapshots, g as recomputeEvalStatusesInRuns, h as persistRunState, i as resolveEvalTags, j as createFsCacheStore, jt as getCaseRowCaseKey, k as normalizeScoreDef, kt as resolveLlmCallsConfig, l as getLatestRunInfos, m as deleteTemporaryRuns, n as getTargetEvalKeys, o as stripTerminalControlCodes, p as persistCaseDetail, s as generateRunId, u as loadPersistedRunSnapshot, v as runTouchesEval, w as loadEvalModule, wt as deriveScopedSummaryFromCases, x as buildManualInputDescriptor, y as resolveTracePresentation } from "./runOrchestration-CokPQet7.mjs";
|
|
2
2
|
import { copyFile, mkdir, readFile, rm, writeFile } from "node:fs/promises";
|
|
3
3
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve, sep } from "node:path";
|
|
4
4
|
import { createHash, randomUUID } from "node:crypto";
|
|
@@ -2095,8 +2095,8 @@ async function commandApp(args) {
|
|
|
2095
2095
|
const { serve } = await import("@hono/node-server");
|
|
2096
2096
|
const bundledWebDist = resolve(currentDir, "apps/web/dist");
|
|
2097
2097
|
if (existsSync(bundledWebDist)) process.env.AGENT_EVALS_WEB_DIST = bundledWebDist;
|
|
2098
|
-
const appModule = await import("./app-BZmhhSFZ.mjs");
|
|
2099
|
-
const runnerModule = await import("./runner--aH0jO4Z.mjs");
|
|
2098
|
+
const appModule = await import("./app-DR9WPMA4.mjs");
|
|
2099
|
+
const runnerModule = await import("./runner-B8dLVAyM.mjs");
|
|
2100
2100
|
if (!isHonoAppModule(appModule)) throw new Error("Server app module is invalid");
|
|
2101
2101
|
if (!isServerRunnerModule(runnerModule)) throw new Error("Server runner module is invalid");
|
|
2102
2102
|
await runnerModule.initRunner();
|
package/dist/index.d.mts
CHANGED
|
@@ -152,6 +152,9 @@ declare const evalStatsConfigSchema$1: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z
|
|
|
152
152
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
153
153
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
154
154
|
kind: z$1.ZodLiteral<"duration">;
|
|
155
|
+
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
156
|
+
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
157
|
+
kind: z$1.ZodLiteral<"cacheHits">;
|
|
155
158
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
156
159
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
157
160
|
kind: z$1.ZodLiteral<"column">;
|
|
@@ -222,6 +225,7 @@ declare const runLogEntrySchema$1: z$1.ZodObject<{
|
|
|
222
225
|
file: z$1.ZodString;
|
|
223
226
|
line: z$1.ZodNumber;
|
|
224
227
|
column: z$1.ZodNumber;
|
|
228
|
+
stack: z$1.ZodOptional<z$1.ZodString>;
|
|
225
229
|
}, z$1.core.$strip>>;
|
|
226
230
|
source: z$1.ZodOptional<z$1.ZodString>;
|
|
227
231
|
}, z$1.core.$strip>;
|
|
@@ -1012,13 +1016,15 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
|
|
|
1012
1016
|
* Opt-in: when omitted (or empty) the EvalCard renders no stats row at all.
|
|
1013
1017
|
* When provided, the stats render in order, left to right.
|
|
1014
1018
|
*
|
|
1015
|
-
* Built-in kinds (`cases`, `passRate`, `duration`, `
|
|
1016
|
-
* latest run summary. `
|
|
1017
|
-
*
|
|
1018
|
-
*
|
|
1019
|
-
*
|
|
1020
|
-
*
|
|
1021
|
-
*
|
|
1019
|
+
* Built-in kinds (`cases`, `passRate`, `duration`, `cacheHits`) read from
|
|
1020
|
+
* the latest run summary. `cacheHits` counts Agent Eval operation-level cache
|
|
1021
|
+
* hits over total cache operations, not LLM provider prompt-cache read
|
|
1022
|
+
* tokens. `kind: 'column'` aggregates a score or numeric output column across
|
|
1023
|
+
* the latest run's cases — `key` must match one of the eval's score or column
|
|
1024
|
+
* keys, and only finite numeric values participate in the reduction. When no
|
|
1025
|
+
* case has a numeric value for the key the stat renders an em dash, or hides
|
|
1026
|
+
* when `hideIfNoValue` is true. `label`, `format`, and `numberFormat` default
|
|
1027
|
+
* to the matching `ColumnDef`.
|
|
1022
1028
|
*/
|
|
1023
1029
|
stats?: EvalStatsConfig$1;
|
|
1024
1030
|
/**
|
|
@@ -1984,8 +1990,8 @@ declare const traceSpanSchema$1: z$1.ZodObject<{
|
|
|
1984
1990
|
status: z$1.ZodEnum<{
|
|
1985
1991
|
error: "error";
|
|
1986
1992
|
running: "running";
|
|
1987
|
-
cancelled: "cancelled";
|
|
1988
1993
|
ok: "ok";
|
|
1994
|
+
cancelled: "cancelled";
|
|
1989
1995
|
}>;
|
|
1990
1996
|
attributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
1991
1997
|
error: z$1.ZodOptional<z$1.ZodObject<{
|
|
@@ -2036,6 +2042,8 @@ declare const evalStatAggregateSchema: z$1.ZodEnum<{
|
|
|
2036
2042
|
type EvalStatAggregate = z$1.infer<typeof evalStatAggregateSchema>;
|
|
2037
2043
|
/**
|
|
2038
2044
|
* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
|
|
2045
|
+
* `cacheHits` counts Agent Eval operation-level cache hits from spans and
|
|
2046
|
+
* `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
|
|
2039
2047
|
* `column` aggregates a score or numeric output column across the latest run.
|
|
2040
2048
|
*/
|
|
2041
2049
|
declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
@@ -2048,6 +2056,9 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
2048
2056
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2049
2057
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2050
2058
|
kind: z$1.ZodLiteral<"duration">;
|
|
2059
|
+
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2060
|
+
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2061
|
+
kind: z$1.ZodLiteral<"cacheHits">;
|
|
2051
2062
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2052
2063
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2053
2064
|
kind: z$1.ZodLiteral<"column">;
|
|
@@ -2090,6 +2101,9 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
|
|
|
2090
2101
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2091
2102
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2092
2103
|
kind: z$1.ZodLiteral<"duration">;
|
|
2104
|
+
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2105
|
+
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2106
|
+
kind: z$1.ZodLiteral<"cacheHits">;
|
|
2093
2107
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2094
2108
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2095
2109
|
kind: z$1.ZodLiteral<"column">;
|
|
@@ -2177,10 +2191,10 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2177
2191
|
caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2178
2192
|
lastRunStatus: z$1.ZodNullable<z$1.ZodEnum<{
|
|
2179
2193
|
error: "error";
|
|
2180
|
-
pass: "pass";
|
|
2181
|
-
fail: "fail";
|
|
2182
2194
|
running: "running";
|
|
2183
2195
|
cancelled: "cancelled";
|
|
2196
|
+
pass: "pass";
|
|
2197
|
+
fail: "fail";
|
|
2184
2198
|
unscored: "unscored";
|
|
2185
2199
|
}>>;
|
|
2186
2200
|
stats: z$1.ZodOptional<z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
@@ -2193,6 +2207,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2193
2207
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2194
2208
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2195
2209
|
kind: z$1.ZodLiteral<"duration">;
|
|
2210
|
+
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2211
|
+
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2212
|
+
kind: z$1.ZodLiteral<"cacheHits">;
|
|
2196
2213
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
2197
2214
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2198
2215
|
kind: z$1.ZodLiteral<"column">;
|
|
@@ -2396,13 +2413,15 @@ declare const caseRowSchema$1: z$1.ZodObject<{
|
|
|
2396
2413
|
tags: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2397
2414
|
status: z$1.ZodEnum<{
|
|
2398
2415
|
error: "error";
|
|
2399
|
-
pass: "pass";
|
|
2400
|
-
fail: "fail";
|
|
2401
2416
|
running: "running";
|
|
2402
2417
|
cancelled: "cancelled";
|
|
2418
|
+
pass: "pass";
|
|
2419
|
+
fail: "fail";
|
|
2403
2420
|
pending: "pending";
|
|
2404
2421
|
}>;
|
|
2405
2422
|
durationMs: z$1.ZodNullable<z$1.ZodNumber>;
|
|
2423
|
+
cacheHits: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2424
|
+
cacheOperations: z$1.ZodOptional<z$1.ZodNumber>;
|
|
2406
2425
|
costUsd: z$1.ZodOptional<z$1.ZodNullable<z$1.ZodNumber>>;
|
|
2407
2426
|
columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
|
|
2408
2427
|
source: z$1.ZodLiteral<"repo">;
|
|
@@ -2449,8 +2468,9 @@ declare const runLogLocationSchema: z$1.ZodObject<{
|
|
|
2449
2468
|
file: z$1.ZodString;
|
|
2450
2469
|
line: z$1.ZodNumber;
|
|
2451
2470
|
column: z$1.ZodNumber;
|
|
2471
|
+
stack: z$1.ZodOptional<z$1.ZodString>;
|
|
2452
2472
|
}, z$1.core.$strip>;
|
|
2453
|
-
/** Best-effort source location for one
|
|
2473
|
+
/** Best-effort source location and captured stack for one case log. */
|
|
2454
2474
|
type RunLogLocation = z$1.infer<typeof runLogLocationSchema>;
|
|
2455
2475
|
/** Schema for one persisted log entry captured during a case run. */
|
|
2456
2476
|
declare const runLogEntrySchema: z$1.ZodObject<{
|
|
@@ -2474,6 +2494,7 @@ declare const runLogEntrySchema: z$1.ZodObject<{
|
|
|
2474
2494
|
file: z$1.ZodString;
|
|
2475
2495
|
line: z$1.ZodNumber;
|
|
2476
2496
|
column: z$1.ZodNumber;
|
|
2497
|
+
stack: z$1.ZodOptional<z$1.ZodString>;
|
|
2477
2498
|
}, z$1.core.$strip>>;
|
|
2478
2499
|
source: z$1.ZodOptional<z$1.ZodString>;
|
|
2479
2500
|
}, z$1.core.$strip>;
|
|
@@ -2492,8 +2513,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
|
|
|
2492
2513
|
status: z$1.ZodEnum<{
|
|
2493
2514
|
error: "error";
|
|
2494
2515
|
running: "running";
|
|
2495
|
-
cancelled: "cancelled";
|
|
2496
2516
|
ok: "ok";
|
|
2517
|
+
cancelled: "cancelled";
|
|
2497
2518
|
}>;
|
|
2498
2519
|
attributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
2499
2520
|
error: z$1.ZodOptional<z$1.ZodObject<{
|
|
@@ -2561,10 +2582,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2561
2582
|
tags: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2562
2583
|
status: z$1.ZodEnum<{
|
|
2563
2584
|
error: "error";
|
|
2564
|
-
pass: "pass";
|
|
2565
|
-
fail: "fail";
|
|
2566
2585
|
running: "running";
|
|
2567
2586
|
cancelled: "cancelled";
|
|
2587
|
+
pass: "pass";
|
|
2588
|
+
fail: "fail";
|
|
2568
2589
|
pending: "pending";
|
|
2569
2590
|
}>;
|
|
2570
2591
|
input: z$1.ZodUnknown;
|
|
@@ -2579,8 +2600,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2579
2600
|
status: z$1.ZodEnum<{
|
|
2580
2601
|
error: "error";
|
|
2581
2602
|
running: "running";
|
|
2582
|
-
cancelled: "cancelled";
|
|
2583
2603
|
ok: "ok";
|
|
2604
|
+
cancelled: "cancelled";
|
|
2584
2605
|
}>;
|
|
2585
2606
|
attributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
2586
2607
|
error: z$1.ZodOptional<z$1.ZodObject<{
|
|
@@ -2648,8 +2669,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2648
2669
|
status: z$1.ZodEnum<{
|
|
2649
2670
|
error: "error";
|
|
2650
2671
|
running: "running";
|
|
2651
|
-
cancelled: "cancelled";
|
|
2652
2672
|
ok: "ok";
|
|
2673
|
+
cancelled: "cancelled";
|
|
2653
2674
|
}>;
|
|
2654
2675
|
attributes: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
2655
2676
|
error: z$1.ZodOptional<z$1.ZodObject<{
|
|
@@ -2746,6 +2767,7 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2746
2767
|
file: z$1.ZodString;
|
|
2747
2768
|
line: z$1.ZodNumber;
|
|
2748
2769
|
column: z$1.ZodNumber;
|
|
2770
|
+
stack: z$1.ZodOptional<z$1.ZodString>;
|
|
2749
2771
|
}, z$1.core.$strip>>;
|
|
2750
2772
|
source: z$1.ZodOptional<z$1.ZodString>;
|
|
2751
2773
|
}, z$1.core.$strip>>>;
|
|
@@ -2761,10 +2783,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2761
2783
|
namespace: z$1.ZodString;
|
|
2762
2784
|
key: z$1.ZodString;
|
|
2763
2785
|
status: z$1.ZodEnum<{
|
|
2786
|
+
bypass: "bypass";
|
|
2787
|
+
refresh: "refresh";
|
|
2764
2788
|
hit: "hit";
|
|
2765
2789
|
miss: "miss";
|
|
2766
|
-
refresh: "refresh";
|
|
2767
|
-
bypass: "bypass";
|
|
2768
2790
|
}>;
|
|
2769
2791
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2770
2792
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -3137,9 +3159,9 @@ declare const runManifestSchema$1: z$1.ZodObject<{
|
|
|
3137
3159
|
median: "median";
|
|
3138
3160
|
}>>>;
|
|
3139
3161
|
cacheMode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3140
|
-
refresh: "refresh";
|
|
3141
|
-
bypass: "bypass";
|
|
3142
3162
|
use: "use";
|
|
3163
|
+
bypass: "bypass";
|
|
3164
|
+
refresh: "refresh";
|
|
3143
3165
|
}>>;
|
|
3144
3166
|
}, z$1.core.$strip>;
|
|
3145
3167
|
/** Persisted lifecycle metadata for a single eval run. */
|
|
@@ -3189,6 +3211,21 @@ type ScopedCaseSummary = {
|
|
|
3189
3211
|
pendingCases: number;
|
|
3190
3212
|
runningCases: number;
|
|
3191
3213
|
totalDurationMs: number | null;
|
|
3214
|
+
/**
|
|
3215
|
+
* Sum of Agent Eval operation-level cache hits across the scoped case rows.
|
|
3216
|
+
*
|
|
3217
|
+
* Missing values from older run artifacts count as zero. This is separate
|
|
3218
|
+
* from LLM prompt-cache token reads such as `cachedInputTokens`.
|
|
3219
|
+
*/
|
|
3220
|
+
cacheHits: number;
|
|
3221
|
+
/**
|
|
3222
|
+
* Sum of Agent Eval operation-level cache activity entries across the scoped
|
|
3223
|
+
* case rows.
|
|
3224
|
+
*
|
|
3225
|
+
* This is the denominator for `cacheHits`. Missing values from older run
|
|
3226
|
+
* artifacts count as zero.
|
|
3227
|
+
*/
|
|
3228
|
+
cacheOperations: number;
|
|
3192
3229
|
};
|
|
3193
3230
|
//#endregion
|
|
3194
3231
|
//#region src/evalStatus.d.ts
|
|
@@ -4026,9 +4063,9 @@ declare function extractApiCalls(spans: EvalTraceSpan$1[], config: ResolvedApiCa
|
|
|
4026
4063
|
* - `refresh`: never read, always write (forces re-execution and overwrites).
|
|
4027
4064
|
*/
|
|
4028
4065
|
declare const cacheModeSchema: z$1.ZodEnum<{
|
|
4029
|
-
refresh: "refresh";
|
|
4030
|
-
bypass: "bypass";
|
|
4031
4066
|
use: "use";
|
|
4067
|
+
bypass: "bypass";
|
|
4068
|
+
refresh: "refresh";
|
|
4032
4069
|
}>;
|
|
4033
4070
|
/** Mode controlling how cached spans behave during a run. */
|
|
4034
4071
|
type CacheMode = z$1.infer<typeof cacheModeSchema>;
|
|
@@ -4049,10 +4086,10 @@ declare const cacheOperationTypeSchema: z$1.ZodEnum<{
|
|
|
4049
4086
|
type CacheOperationType = z$1.infer<typeof cacheOperationTypeSchema>;
|
|
4050
4087
|
/** Status of a cache lookup recorded on a span or case scope. */
|
|
4051
4088
|
declare const cacheStatusSchema: z$1.ZodEnum<{
|
|
4089
|
+
bypass: "bypass";
|
|
4090
|
+
refresh: "refresh";
|
|
4052
4091
|
hit: "hit";
|
|
4053
4092
|
miss: "miss";
|
|
4054
|
-
refresh: "refresh";
|
|
4055
|
-
bypass: "bypass";
|
|
4056
4093
|
}>;
|
|
4057
4094
|
/** Status of a cache lookup recorded on a span or case scope. */
|
|
4058
4095
|
type CacheStatus = z$1.infer<typeof cacheStatusSchema>;
|
|
@@ -4069,10 +4106,10 @@ declare const traceCacheRefSchema: z$1.ZodObject<{
|
|
|
4069
4106
|
namespace: z$1.ZodString;
|
|
4070
4107
|
key: z$1.ZodString;
|
|
4071
4108
|
status: z$1.ZodEnum<{
|
|
4109
|
+
bypass: "bypass";
|
|
4110
|
+
refresh: "refresh";
|
|
4072
4111
|
hit: "hit";
|
|
4073
4112
|
miss: "miss";
|
|
4074
|
-
refresh: "refresh";
|
|
4075
|
-
bypass: "bypass";
|
|
4076
4113
|
}>;
|
|
4077
4114
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
4078
4115
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -4149,8 +4186,8 @@ declare const cacheRecordingSchema: z$1.ZodObject<{
|
|
|
4149
4186
|
finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4150
4187
|
error: "error";
|
|
4151
4188
|
running: "running";
|
|
4152
|
-
cancelled: "cancelled";
|
|
4153
4189
|
ok: "ok";
|
|
4190
|
+
cancelled: "cancelled";
|
|
4154
4191
|
}>>;
|
|
4155
4192
|
finalError: z$1.ZodOptional<z$1.ZodObject<{
|
|
4156
4193
|
name: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -4222,8 +4259,8 @@ declare const cacheEntrySchema: z$1.ZodObject<{
|
|
|
4222
4259
|
finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4223
4260
|
error: "error";
|
|
4224
4261
|
running: "running";
|
|
4225
|
-
cancelled: "cancelled";
|
|
4226
4262
|
ok: "ok";
|
|
4263
|
+
cancelled: "cancelled";
|
|
4227
4264
|
}>>;
|
|
4228
4265
|
finalError: z$1.ZodOptional<z$1.ZodObject<{
|
|
4229
4266
|
name: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -4312,8 +4349,8 @@ declare const cacheDebugKeyEntrySchema: z$1.ZodObject<{
|
|
|
4312
4349
|
finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4313
4350
|
error: "error";
|
|
4314
4351
|
running: "running";
|
|
4315
|
-
cancelled: "cancelled";
|
|
4316
4352
|
ok: "ok";
|
|
4353
|
+
cancelled: "cancelled";
|
|
4317
4354
|
}>>;
|
|
4318
4355
|
finalError: z$1.ZodOptional<z$1.ZodObject<{
|
|
4319
4356
|
name: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -4391,8 +4428,8 @@ declare const cacheEntryWithDebugKeySchema$1: z$1.ZodObject<{
|
|
|
4391
4428
|
finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4392
4429
|
error: "error";
|
|
4393
4430
|
running: "running";
|
|
4394
|
-
cancelled: "cancelled";
|
|
4395
4431
|
ok: "ok";
|
|
4432
|
+
cancelled: "cancelled";
|
|
4396
4433
|
}>>;
|
|
4397
4434
|
finalError: z$1.ZodOptional<z$1.ZodObject<{
|
|
4398
4435
|
name: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -4472,8 +4509,8 @@ declare const cacheEntryWithDebugKeySchema$1: z$1.ZodObject<{
|
|
|
4472
4509
|
finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4473
4510
|
error: "error";
|
|
4474
4511
|
running: "running";
|
|
4475
|
-
cancelled: "cancelled";
|
|
4476
4512
|
ok: "ok";
|
|
4513
|
+
cancelled: "cancelled";
|
|
4477
4514
|
}>>;
|
|
4478
4515
|
finalError: z$1.ZodOptional<z$1.ZodObject<{
|
|
4479
4516
|
name: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -4551,8 +4588,8 @@ declare const cacheFileSchema: z$1.ZodObject<{
|
|
|
4551
4588
|
finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4552
4589
|
error: "error";
|
|
4553
4590
|
running: "running";
|
|
4554
|
-
cancelled: "cancelled";
|
|
4555
4591
|
ok: "ok";
|
|
4592
|
+
cancelled: "cancelled";
|
|
4556
4593
|
}>>;
|
|
4557
4594
|
finalError: z$1.ZodOptional<z$1.ZodObject<{
|
|
4558
4595
|
name: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -4640,8 +4677,8 @@ declare const cacheDebugKeyFileSchema: z$1.ZodObject<{
|
|
|
4640
4677
|
finalStatus: z$1.ZodOptional<z$1.ZodEnum<{
|
|
4641
4678
|
error: "error";
|
|
4642
4679
|
running: "running";
|
|
4643
|
-
cancelled: "cancelled";
|
|
4644
4680
|
ok: "ok";
|
|
4681
|
+
cancelled: "cancelled";
|
|
4645
4682
|
}>>;
|
|
4646
4683
|
finalError: z$1.ZodOptional<z$1.ZodObject<{
|
|
4647
4684
|
name: z$1.ZodOptional<z$1.ZodString>;
|
|
@@ -4811,9 +4848,9 @@ declare const createRunRequestSchema$1: z$1.ZodObject<{
|
|
|
4811
4848
|
temporary: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
4812
4849
|
cache: z$1.ZodOptional<z$1.ZodObject<{
|
|
4813
4850
|
mode: z$1.ZodDefault<z$1.ZodEnum<{
|
|
4814
|
-
refresh: "refresh";
|
|
4815
|
-
bypass: "bypass";
|
|
4816
4851
|
use: "use";
|
|
4852
|
+
bypass: "bypass";
|
|
4853
|
+
refresh: "refresh";
|
|
4817
4854
|
}>>;
|
|
4818
4855
|
}, z$1.core.$strip>>;
|
|
4819
4856
|
manualInputs: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
@@ -5148,6 +5185,9 @@ declare const evalSummarySchema: z$1.ZodObject<{
|
|
|
5148
5185
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
5149
5186
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
5150
5187
|
kind: z$1.ZodLiteral<"duration">;
|
|
5188
|
+
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
5189
|
+
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
5190
|
+
kind: z$1.ZodLiteral<"cacheHits">;
|
|
5151
5191
|
}, z$1.core.$strip>, z$1.ZodObject<{
|
|
5152
5192
|
hideIfNoValue: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
5153
5193
|
kind: z$1.ZodLiteral<"column">;
|
|
@@ -5358,6 +5398,8 @@ declare const caseRowSchema: z$1.ZodObject<{
|
|
|
5358
5398
|
pending: "pending";
|
|
5359
5399
|
}>;
|
|
5360
5400
|
durationMs: z$1.ZodNullable<z$1.ZodNumber>;
|
|
5401
|
+
cacheHits: z$1.ZodOptional<z$1.ZodNumber>;
|
|
5402
|
+
cacheOperations: z$1.ZodOptional<z$1.ZodNumber>;
|
|
5361
5403
|
costUsd: z$1.ZodOptional<z$1.ZodNullable<z$1.ZodNumber>>;
|
|
5362
5404
|
columns: z$1.ZodRecord<z$1.ZodString, z$1.ZodUnion<readonly [z$1.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z$1.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z$1.ZodUnion<readonly [z$1.ZodObject<{
|
|
5363
5405
|
source: z$1.ZodLiteral<"repo">;
|
|
@@ -5567,6 +5609,7 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
5567
5609
|
file: z$1.ZodString;
|
|
5568
5610
|
line: z$1.ZodNumber;
|
|
5569
5611
|
column: z$1.ZodNumber;
|
|
5612
|
+
stack: z$1.ZodOptional<z$1.ZodString>;
|
|
5570
5613
|
}, z$1.core.$strip>>;
|
|
5571
5614
|
source: z$1.ZodOptional<z$1.ZodString>;
|
|
5572
5615
|
}, z$1.core.$strip>>>;
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-BFdxG9ws.mjs";
|
|
2
|
-
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-vdJYkEVk.mjs";
|
|
3
|
-
import { n as matchesEvalTags, t as defineEval } from "./src-BRqs3kSA.mjs";
|
|
1
|
+
import { $ as getCurrentScope, B as deserializeCacheValue, F as evalSpan, G as readManualInputFile, H as serializeCacheValue, I as evalTracer, J as appendToEvalOutput, K as evalExpect, L as hashCacheKey, Lt as getEvalRegistry, M as z, N as buildTraceTree, P as captureEvalSpanError, Q as evalTime, R as hashCacheKeySync, U as repoFile, V as serializeCacheRecording, W as manualInputFileValueSchema, X as evalAssert, Z as evalLog, _t as extractLlmCalls, at as nextEvalId, ct as runInExistingEvalScope, dt as startEvalBackgroundJob, et as getEvalCaseInput, gt as extractApiCalls, ht as extractCacheHits, it as mergeEvalOutput, lt as setEvalOutput, mt as extractCacheEntries, nt as isInEvalScope, ot as runInEvalRuntimeScope, q as EvalAssertionError, st as runInEvalScope, tt as incrementEvalOutput, ut as setScopeCacheContext, vt as simulateLlmCallCost, xt as getNestedAttribute, yt as simulateTokenAllocation, z as deserializeCacheRecording } from "./runOrchestration-CokPQet7.mjs";
|
|
2
|
+
import { a as materializeManualInputFiles, i as isManualInputFileValue, n as createRunner, o as stageManualInputFile, r as cleanupStagedManualInputFiles, s as stageManualInputFileFromPath, t as runCli } from "./cli-R7_V6YWa.mjs";
|
|
3
|
+
import { n as matchesEvalTags, t as defineEval } from "./src-B43qR0Ea.mjs";
|
|
4
4
|
export { EvalAssertionError, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
package/dist/runChild.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-BFdxG9ws.mjs";
|
|
1
|
+
import { At as buildEvalKey, D as loadConfig, Dt as runSummarySchema, Et as runManifestSchema, Ft as columnDefSchema, Mt as evalStatsConfigSchema, Nt as manualInputDescriptorSchema, Pt as evalChartsConfigSchema, T as parseEvalDiscovery, Y as configureEvalRunLogs, ft as createRunRequestSchema, h as persistRunState, j as createFsCacheStore, r as getTargetEvals$1, t as executeRun } from "./runOrchestration-CokPQet7.mjs";
|
|
2
2
|
import { z } from "zod/v4";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { relative } from "node:path";
|
|
@@ -621,6 +621,8 @@ const hideIfNoValueShape = {
|
|
|
621
621
|
hideIfNoValue: z.boolean().optional() };
|
|
622
622
|
/**
|
|
623
623
|
* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
|
|
624
|
+
* `cacheHits` counts Agent Eval operation-level cache hits from spans and
|
|
625
|
+
* `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens.
|
|
624
626
|
* `column` aggregates a score or numeric output column across the latest run.
|
|
625
627
|
*/
|
|
626
628
|
const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
@@ -637,6 +639,10 @@ const evalStatItemSchema = z.discriminatedUnion("kind", [
|
|
|
637
639
|
kind: z.literal("duration"),
|
|
638
640
|
...hideIfNoValueShape
|
|
639
641
|
}),
|
|
642
|
+
z.object({
|
|
643
|
+
kind: z.literal("cacheHits"),
|
|
644
|
+
...hideIfNoValueShape
|
|
645
|
+
}),
|
|
640
646
|
z.object({
|
|
641
647
|
kind: z.literal("column"),
|
|
642
648
|
key: z.string(),
|
|
@@ -731,6 +737,23 @@ const caseRowSchema = z.object({
|
|
|
731
737
|
]),
|
|
732
738
|
/** Elapsed case execution duration in milliseconds, or null before completion. */
|
|
733
739
|
durationMs: z.number().nullable(),
|
|
740
|
+
/**
|
|
741
|
+
* Agent Eval operation-level cache hits recorded for this case.
|
|
742
|
+
*
|
|
743
|
+
* This counts persisted operation cache hits from spans and
|
|
744
|
+
* `evalTracer.cache(...)` refs. It does not count LLM provider prompt-cache
|
|
745
|
+
* read tokens such as `cachedInputTokens`. Older run artifacts may omit it
|
|
746
|
+
* and should be treated as zero by aggregate readers.
|
|
747
|
+
*/
|
|
748
|
+
cacheHits: z.number().optional(),
|
|
749
|
+
/**
|
|
750
|
+
* Agent Eval operation-level cache activity entries recorded for this case.
|
|
751
|
+
*
|
|
752
|
+
* This is the denominator for `cacheHits`, counting hits plus misses and
|
|
753
|
+
* refreshes that appear in the Cache tab. Older run artifacts may omit it
|
|
754
|
+
* and should be treated as zero by aggregate readers.
|
|
755
|
+
*/
|
|
756
|
+
cacheOperations: z.number().optional(),
|
|
734
757
|
costUsd: z.number().nullable().optional(),
|
|
735
758
|
columns: z.record(z.string(), cellValueSchema),
|
|
736
759
|
/** Winning trial index for the persisted case result. */
|
|
@@ -771,7 +794,13 @@ const runLogLocationSchema = z.object({
|
|
|
771
794
|
/** 1-based source line reported by the JavaScript stack frame. */
|
|
772
795
|
line: z.number(),
|
|
773
796
|
/** 1-based source column reported by the JavaScript stack frame. */
|
|
774
|
-
column: z.number()
|
|
797
|
+
column: z.number(),
|
|
798
|
+
/**
|
|
799
|
+
* Full JavaScript stack captured when the log was emitted.
|
|
800
|
+
*
|
|
801
|
+
* Older run artifacts may only include the primary file, line, and column.
|
|
802
|
+
*/
|
|
803
|
+
stack: z.string().optional()
|
|
775
804
|
});
|
|
776
805
|
/** Schema for one persisted log entry captured during a case run. */
|
|
777
806
|
const runLogEntrySchema = z.object({
|
|
@@ -1692,6 +1721,8 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1692
1721
|
let runningCases = 0;
|
|
1693
1722
|
let totalDurationMs = 0;
|
|
1694
1723
|
let hasDuration = false;
|
|
1724
|
+
let cacheHits = 0;
|
|
1725
|
+
let cacheOperations = 0;
|
|
1695
1726
|
for (const caseRow of caseRows) {
|
|
1696
1727
|
if (caseRow.status === "pass") passedCases += 1;
|
|
1697
1728
|
else if (caseRow.status === "fail") failedCases += 1;
|
|
@@ -1703,6 +1734,8 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1703
1734
|
totalDurationMs += caseRow.durationMs;
|
|
1704
1735
|
hasDuration = true;
|
|
1705
1736
|
}
|
|
1737
|
+
cacheHits += caseRow.cacheHits ?? 0;
|
|
1738
|
+
cacheOperations += caseRow.cacheOperations ?? 0;
|
|
1706
1739
|
}
|
|
1707
1740
|
return {
|
|
1708
1741
|
status: deriveStatusFromCaseRows({
|
|
@@ -1716,7 +1749,9 @@ function deriveScopedSummaryFromCases(params) {
|
|
|
1716
1749
|
cancelledCases,
|
|
1717
1750
|
pendingCases,
|
|
1718
1751
|
runningCases,
|
|
1719
|
-
totalDurationMs: hasDuration ? totalDurationMs : null
|
|
1752
|
+
totalDurationMs: hasDuration ? totalDurationMs : null,
|
|
1753
|
+
cacheHits,
|
|
1754
|
+
cacheOperations
|
|
1720
1755
|
};
|
|
1721
1756
|
}
|
|
1722
1757
|
//#endregion
|
|
@@ -2787,7 +2822,8 @@ function normalizeStackFile(value) {
|
|
|
2787
2822
|
return decodeURIComponent(value.replace(fileUrlPrefixPattern, ""));
|
|
2788
2823
|
}
|
|
2789
2824
|
function isInternalLogFrame(file) {
|
|
2790
|
-
|
|
2825
|
+
const normalizedFile = file.replaceAll("\\", "/");
|
|
2826
|
+
return normalizedFile.includes("/packages/sdk/src/runtime.ts") || normalizedFile.includes("/packages/sdk/dist/") || normalizedFile.includes("/node_modules/@agent-evals/sdk/dist/") || normalizedFile.includes("/node_modules/@ls-stack/agent-eval/dist/") || normalizedFile.includes("/node:internal/") || normalizedFile.startsWith("node:internal/");
|
|
2791
2827
|
}
|
|
2792
2828
|
function parseStackFrameLocation(line) {
|
|
2793
2829
|
const match = stackFrameLocationPattern.exec(line.trim());
|
|
@@ -2808,7 +2844,10 @@ function getLogLocation() {
|
|
|
2808
2844
|
for (const line of stack.split("\n").slice(1)) {
|
|
2809
2845
|
const location = parseStackFrameLocation(line);
|
|
2810
2846
|
if (location === null || isInternalLogFrame(location.file)) continue;
|
|
2811
|
-
return
|
|
2847
|
+
return {
|
|
2848
|
+
...location,
|
|
2849
|
+
stack
|
|
2850
|
+
};
|
|
2812
2851
|
}
|
|
2813
2852
|
}
|
|
2814
2853
|
function recordEvalLog(level, args) {
|
|
@@ -7169,12 +7208,16 @@ async function runCase(params) {
|
|
|
7169
7208
|
};
|
|
7170
7209
|
if (Object.keys(scoringTraces).length > 0) caseDetail.scoringTraces = scoringTraces;
|
|
7171
7210
|
const elapsedMs = Date.now() - startTime;
|
|
7211
|
+
const cacheEntries = extractCacheEntries(displayTrace, scope.caseCacheRefs);
|
|
7212
|
+
const cacheHits = cacheEntries.filter((entry) => entry.status === "hit");
|
|
7172
7213
|
return {
|
|
7173
7214
|
caseDetail,
|
|
7174
7215
|
caseRowUpdate: {
|
|
7175
7216
|
tags: evalCase.tags ?? [],
|
|
7176
7217
|
status,
|
|
7177
7218
|
durationMs: elapsedMs,
|
|
7219
|
+
cacheHits: cacheHits.length,
|
|
7220
|
+
cacheOperations: cacheEntries.length,
|
|
7178
7221
|
columns
|
|
7179
7222
|
}
|
|
7180
7223
|
};
|
|
@@ -7670,6 +7713,8 @@ async function executeRun({ runState, request, runDir, config, cacheStore, lastR
|
|
|
7670
7713
|
tags: caseDetail.tags,
|
|
7671
7714
|
status: caseRowUpdate.status ?? "pending",
|
|
7672
7715
|
durationMs: caseRowUpdate.durationMs ?? null,
|
|
7716
|
+
cacheHits: caseRowUpdate.cacheHits ?? 0,
|
|
7717
|
+
cacheOperations: caseRowUpdate.cacheOperations ?? 0,
|
|
7673
7718
|
columns: caseRowUpdate.columns ?? {},
|
|
7674
7719
|
trial
|
|
7675
7720
|
}
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-Coc9wBWz.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-
|
|
2
|
-
import "./src-
|
|
1
|
+
import { n as createRunner } from "./cli-R7_V6YWa.mjs";
|
|
2
|
+
import "./src-B43qR0Ea.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-
|
|
2
|
-
import "./cli-
|
|
1
|
+
import { It as defineEval$1, rt as matchesEvalTags$1 } from "./runOrchestration-CokPQet7.mjs";
|
|
2
|
+
import "./cli-R7_V6YWa.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -400,11 +400,17 @@ cacheCreationInputTokens` so cache details are not double-counted.
|
|
|
400
400
|
- `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use
|
|
401
401
|
`runLogs: { captureConsole: false }` to keep console output in the terminal
|
|
402
402
|
without persisting console calls to case details. Manual `evalLog(...)` calls
|
|
403
|
-
are still captured.
|
|
403
|
+
are still captured. Captured log locations store the selected user-facing
|
|
404
|
+
source frame and the full JavaScript stack so agents can inspect additional
|
|
405
|
+
frames in persisted artifacts when diagnosing where a log came from.
|
|
404
406
|
|
|
405
407
|
Stats rows and history charts can be authored via `stats` / `charts` on the eval
|
|
406
408
|
definition. Global `stats` in `agent-evals.config.ts` combine with eval-level
|
|
407
|
-
stats.
|
|
409
|
+
stats. Native stat kinds include `cases`, `passRate`, `duration`, and
|
|
410
|
+
`cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total
|
|
411
|
+
cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not
|
|
412
|
+
LLM provider prompt-cache read tokens such as `cachedInputTokens`. Usage stats
|
|
413
|
+
and LLM usage charts are added by default unless removed with
|
|
408
414
|
`removeDefaultConfig`. Column stats can override `format` and `numberFormat`,
|
|
409
415
|
otherwise they inherit from the matching column. Number formats use
|
|
410
416
|
`maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing
|