@ls-stack/agent-eval 0.61.1 → 0.61.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/apps/web/dist/assets/{index-CwSehYad.js → index-DXQ_LDQw.js} +70 -70
- package/dist/apps/web/dist/assets/{index-CM_zUhl_.css → index-zWPuRQmP.css} +1 -1
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/index.d.mts +55 -55
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +2 -2
package/dist/index.d.mts
CHANGED
|
@@ -2061,9 +2061,9 @@ declare const traceAttributeDisplaySchema: z.ZodObject<{
|
|
|
2061
2061
|
subtree: "subtree";
|
|
2062
2062
|
}>>;
|
|
2063
2063
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
2064
|
-
sum: "sum";
|
|
2065
2064
|
all: "all";
|
|
2066
2065
|
last: "last";
|
|
2066
|
+
sum: "sum";
|
|
2067
2067
|
}>>;
|
|
2068
2068
|
}, z.core.$strip>;
|
|
2069
2069
|
/**
|
|
@@ -2097,9 +2097,9 @@ declare const traceDisplayConfigSchema: z.ZodObject<{
|
|
|
2097
2097
|
subtree: "subtree";
|
|
2098
2098
|
}>>;
|
|
2099
2099
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
2100
|
-
sum: "sum";
|
|
2101
2100
|
all: "all";
|
|
2102
2101
|
last: "last";
|
|
2102
|
+
sum: "sum";
|
|
2103
2103
|
}>>;
|
|
2104
2104
|
}, z.core.$strip>>>;
|
|
2105
2105
|
}, z.core.$strip>;
|
|
@@ -2137,9 +2137,9 @@ declare const traceAttributeDisplayInputSchema: z.ZodObject<{
|
|
|
2137
2137
|
subtree: "subtree";
|
|
2138
2138
|
}>>;
|
|
2139
2139
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
2140
|
-
sum: "sum";
|
|
2141
2140
|
all: "all";
|
|
2142
2141
|
last: "last";
|
|
2142
|
+
sum: "sum";
|
|
2143
2143
|
}>>;
|
|
2144
2144
|
transform: z.ZodOptional<z.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
|
|
2145
2145
|
}, z.core.$strip>;
|
|
@@ -2175,9 +2175,9 @@ declare const traceDisplayInputConfigSchema: z.ZodObject<{
|
|
|
2175
2175
|
subtree: "subtree";
|
|
2176
2176
|
}>>;
|
|
2177
2177
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
2178
|
-
sum: "sum";
|
|
2179
2178
|
all: "all";
|
|
2180
2179
|
last: "last";
|
|
2180
|
+
sum: "sum";
|
|
2181
2181
|
}>>;
|
|
2182
2182
|
transform: z.ZodOptional<z.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
|
|
2183
2183
|
}, z.core.$strip>>>;
|
|
@@ -2214,8 +2214,8 @@ declare const traceSpanSchema$1: z.ZodObject<{
|
|
|
2214
2214
|
status: z.ZodEnum<{
|
|
2215
2215
|
error: "error";
|
|
2216
2216
|
running: "running";
|
|
2217
|
-
cancelled: "cancelled";
|
|
2218
2217
|
ok: "ok";
|
|
2218
|
+
cancelled: "cancelled";
|
|
2219
2219
|
}>;
|
|
2220
2220
|
attributes: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2221
2221
|
error: z.ZodOptional<z.ZodObject<{
|
|
@@ -2260,10 +2260,10 @@ type EvalFreshnessStatus = z.infer<typeof evalFreshnessStatusSchema>;
|
|
|
2260
2260
|
* `best` selects the highest finite value and `worst` selects the lowest.
|
|
2261
2261
|
*/
|
|
2262
2262
|
declare const evalStatAggregateSchema: z.ZodEnum<{
|
|
2263
|
+
sum: "sum";
|
|
2263
2264
|
avg: "avg";
|
|
2264
2265
|
min: "min";
|
|
2265
2266
|
max: "max";
|
|
2266
|
-
sum: "sum";
|
|
2267
2267
|
best: "best";
|
|
2268
2268
|
worst: "worst";
|
|
2269
2269
|
}>;
|
|
@@ -2292,10 +2292,10 @@ declare const evalStatItemSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
2292
2292
|
hideIfNoValue: z.ZodOptional<z.ZodBoolean>;
|
|
2293
2293
|
kind: z.ZodLiteral<"duration">;
|
|
2294
2294
|
aggregate: z.ZodOptional<z.ZodEnum<{
|
|
2295
|
+
sum: "sum";
|
|
2295
2296
|
avg: "avg";
|
|
2296
2297
|
min: "min";
|
|
2297
2298
|
max: "max";
|
|
2298
|
-
sum: "sum";
|
|
2299
2299
|
best: "best";
|
|
2300
2300
|
worst: "worst";
|
|
2301
2301
|
}>>;
|
|
@@ -2303,10 +2303,10 @@ declare const evalStatItemSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
2303
2303
|
hideIfNoValue: z.ZodOptional<z.ZodBoolean>;
|
|
2304
2304
|
kind: z.ZodLiteral<"cacheHits">;
|
|
2305
2305
|
aggregate: z.ZodOptional<z.ZodEnum<{
|
|
2306
|
+
sum: "sum";
|
|
2306
2307
|
avg: "avg";
|
|
2307
2308
|
min: "min";
|
|
2308
2309
|
max: "max";
|
|
2309
|
-
sum: "sum";
|
|
2310
2310
|
best: "best";
|
|
2311
2311
|
worst: "worst";
|
|
2312
2312
|
}>>;
|
|
@@ -2316,10 +2316,10 @@ declare const evalStatItemSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
2316
2316
|
key: z.ZodString;
|
|
2317
2317
|
label: z.ZodOptional<z.ZodString>;
|
|
2318
2318
|
aggregate: z.ZodEnum<{
|
|
2319
|
+
sum: "sum";
|
|
2319
2320
|
avg: "avg";
|
|
2320
2321
|
min: "min";
|
|
2321
2322
|
max: "max";
|
|
2322
|
-
sum: "sum";
|
|
2323
2323
|
best: "best";
|
|
2324
2324
|
worst: "worst";
|
|
2325
2325
|
}>;
|
|
@@ -2356,10 +2356,10 @@ declare const evalStatsConfigSchema: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodOb
|
|
|
2356
2356
|
hideIfNoValue: z.ZodOptional<z.ZodBoolean>;
|
|
2357
2357
|
kind: z.ZodLiteral<"duration">;
|
|
2358
2358
|
aggregate: z.ZodOptional<z.ZodEnum<{
|
|
2359
|
+
sum: "sum";
|
|
2359
2360
|
avg: "avg";
|
|
2360
2361
|
min: "min";
|
|
2361
2362
|
max: "max";
|
|
2362
|
-
sum: "sum";
|
|
2363
2363
|
best: "best";
|
|
2364
2364
|
worst: "worst";
|
|
2365
2365
|
}>>;
|
|
@@ -2367,10 +2367,10 @@ declare const evalStatsConfigSchema: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodOb
|
|
|
2367
2367
|
hideIfNoValue: z.ZodOptional<z.ZodBoolean>;
|
|
2368
2368
|
kind: z.ZodLiteral<"cacheHits">;
|
|
2369
2369
|
aggregate: z.ZodOptional<z.ZodEnum<{
|
|
2370
|
+
sum: "sum";
|
|
2370
2371
|
avg: "avg";
|
|
2371
2372
|
min: "min";
|
|
2372
2373
|
max: "max";
|
|
2373
|
-
sum: "sum";
|
|
2374
2374
|
best: "best";
|
|
2375
2375
|
worst: "worst";
|
|
2376
2376
|
}>>;
|
|
@@ -2380,10 +2380,10 @@ declare const evalStatsConfigSchema: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodOb
|
|
|
2380
2380
|
key: z.ZodString;
|
|
2381
2381
|
label: z.ZodOptional<z.ZodString>;
|
|
2382
2382
|
aggregate: z.ZodEnum<{
|
|
2383
|
+
sum: "sum";
|
|
2383
2384
|
avg: "avg";
|
|
2384
2385
|
min: "min";
|
|
2385
2386
|
max: "max";
|
|
2386
|
-
sum: "sum";
|
|
2387
2387
|
best: "best";
|
|
2388
2388
|
worst: "worst";
|
|
2389
2389
|
}>;
|
|
@@ -2466,10 +2466,10 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2466
2466
|
caseIds: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
2467
2467
|
lastRunStatus: z.ZodNullable<z.ZodEnum<{
|
|
2468
2468
|
error: "error";
|
|
2469
|
-
pass: "pass";
|
|
2470
|
-
fail: "fail";
|
|
2471
2469
|
running: "running";
|
|
2472
2470
|
cancelled: "cancelled";
|
|
2471
|
+
pass: "pass";
|
|
2472
|
+
fail: "fail";
|
|
2473
2473
|
unscored: "unscored";
|
|
2474
2474
|
}>>;
|
|
2475
2475
|
stats: z.ZodOptional<z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
@@ -2483,10 +2483,10 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2483
2483
|
hideIfNoValue: z.ZodOptional<z.ZodBoolean>;
|
|
2484
2484
|
kind: z.ZodLiteral<"duration">;
|
|
2485
2485
|
aggregate: z.ZodOptional<z.ZodEnum<{
|
|
2486
|
+
sum: "sum";
|
|
2486
2487
|
avg: "avg";
|
|
2487
2488
|
min: "min";
|
|
2488
2489
|
max: "max";
|
|
2489
|
-
sum: "sum";
|
|
2490
2490
|
best: "best";
|
|
2491
2491
|
worst: "worst";
|
|
2492
2492
|
}>>;
|
|
@@ -2494,10 +2494,10 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2494
2494
|
hideIfNoValue: z.ZodOptional<z.ZodBoolean>;
|
|
2495
2495
|
kind: z.ZodLiteral<"cacheHits">;
|
|
2496
2496
|
aggregate: z.ZodOptional<z.ZodEnum<{
|
|
2497
|
+
sum: "sum";
|
|
2497
2498
|
avg: "avg";
|
|
2498
2499
|
min: "min";
|
|
2499
2500
|
max: "max";
|
|
2500
|
-
sum: "sum";
|
|
2501
2501
|
best: "best";
|
|
2502
2502
|
worst: "worst";
|
|
2503
2503
|
}>>;
|
|
@@ -2507,10 +2507,10 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2507
2507
|
key: z.ZodString;
|
|
2508
2508
|
label: z.ZodOptional<z.ZodString>;
|
|
2509
2509
|
aggregate: z.ZodEnum<{
|
|
2510
|
+
sum: "sum";
|
|
2510
2511
|
avg: "avg";
|
|
2511
2512
|
min: "min";
|
|
2512
2513
|
max: "max";
|
|
2513
|
-
sum: "sum";
|
|
2514
2514
|
best: "best";
|
|
2515
2515
|
worst: "worst";
|
|
2516
2516
|
}>;
|
|
@@ -2534,10 +2534,10 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2534
2534
|
accent: z.ZodOptional<z.ZodBoolean>;
|
|
2535
2535
|
}, z.core.$strip>], "kind">>>;
|
|
2536
2536
|
defaultStatAggregate: z.ZodOptional<z.ZodEnum<{
|
|
2537
|
+
sum: "sum";
|
|
2537
2538
|
avg: "avg";
|
|
2538
2539
|
min: "min";
|
|
2539
2540
|
max: "max";
|
|
2540
|
-
sum: "sum";
|
|
2541
2541
|
best: "best";
|
|
2542
2542
|
worst: "worst";
|
|
2543
2543
|
}>>;
|
|
@@ -2560,9 +2560,9 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2560
2560
|
color: z.ZodOptional<z.ZodEnum<{
|
|
2561
2561
|
success: "success";
|
|
2562
2562
|
error: "error";
|
|
2563
|
+
warning: "warning";
|
|
2563
2564
|
accent: "accent";
|
|
2564
2565
|
accentDim: "accentDim";
|
|
2565
|
-
warning: "warning";
|
|
2566
2566
|
textMuted: "textMuted";
|
|
2567
2567
|
}>>;
|
|
2568
2568
|
axis: z.ZodOptional<z.ZodEnum<{
|
|
@@ -2573,10 +2573,10 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2573
2573
|
source: z.ZodLiteral<"column">;
|
|
2574
2574
|
key: z.ZodString;
|
|
2575
2575
|
aggregate: z.ZodEnum<{
|
|
2576
|
+
sum: "sum";
|
|
2576
2577
|
avg: "avg";
|
|
2577
2578
|
min: "min";
|
|
2578
2579
|
max: "max";
|
|
2579
|
-
sum: "sum";
|
|
2580
2580
|
latest: "latest";
|
|
2581
2581
|
passThresholdRate: "passThresholdRate";
|
|
2582
2582
|
}>;
|
|
@@ -2584,9 +2584,9 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2584
2584
|
color: z.ZodOptional<z.ZodEnum<{
|
|
2585
2585
|
success: "success";
|
|
2586
2586
|
error: "error";
|
|
2587
|
+
warning: "warning";
|
|
2587
2588
|
accent: "accent";
|
|
2588
2589
|
accentDim: "accentDim";
|
|
2589
|
-
warning: "warning";
|
|
2590
2590
|
textMuted: "textMuted";
|
|
2591
2591
|
}>>;
|
|
2592
2592
|
axis: z.ZodOptional<z.ZodEnum<{
|
|
@@ -2615,10 +2615,10 @@ declare const evalSummarySchema$1: z.ZodObject<{
|
|
|
2615
2615
|
source: z.ZodLiteral<"column">;
|
|
2616
2616
|
key: z.ZodString;
|
|
2617
2617
|
aggregate: z.ZodEnum<{
|
|
2618
|
+
sum: "sum";
|
|
2618
2619
|
avg: "avg";
|
|
2619
2620
|
min: "min";
|
|
2620
2621
|
max: "max";
|
|
2621
|
-
sum: "sum";
|
|
2622
2622
|
latest: "latest";
|
|
2623
2623
|
passThresholdRate: "passThresholdRate";
|
|
2624
2624
|
}>;
|
|
@@ -2715,10 +2715,10 @@ declare const caseRowSchema$1: z.ZodObject<{
|
|
|
2715
2715
|
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
2716
2716
|
status: z.ZodEnum<{
|
|
2717
2717
|
error: "error";
|
|
2718
|
-
pass: "pass";
|
|
2719
|
-
fail: "fail";
|
|
2720
2718
|
running: "running";
|
|
2721
2719
|
cancelled: "cancelled";
|
|
2720
|
+
pass: "pass";
|
|
2721
|
+
fail: "fail";
|
|
2722
2722
|
pending: "pending";
|
|
2723
2723
|
}>;
|
|
2724
2724
|
durationMs: z.ZodNullable<z.ZodNumber>;
|
|
@@ -2857,8 +2857,8 @@ declare const scoreTraceSchema: z.ZodObject<{
|
|
|
2857
2857
|
status: z.ZodEnum<{
|
|
2858
2858
|
error: "error";
|
|
2859
2859
|
running: "running";
|
|
2860
|
-
cancelled: "cancelled";
|
|
2861
2860
|
ok: "ok";
|
|
2861
|
+
cancelled: "cancelled";
|
|
2862
2862
|
}>;
|
|
2863
2863
|
attributes: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2864
2864
|
error: z.ZodOptional<z.ZodObject<{
|
|
@@ -2908,9 +2908,9 @@ declare const scoreTraceSchema: z.ZodObject<{
|
|
|
2908
2908
|
subtree: "subtree";
|
|
2909
2909
|
}>>;
|
|
2910
2910
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
2911
|
-
sum: "sum";
|
|
2912
2911
|
all: "all";
|
|
2913
2912
|
last: "last";
|
|
2913
|
+
sum: "sum";
|
|
2914
2914
|
}>>;
|
|
2915
2915
|
}, z.core.$strip>>>;
|
|
2916
2916
|
}, z.core.$strip>;
|
|
@@ -2942,10 +2942,10 @@ declare const caseDetailSchema$1: z.ZodObject<{
|
|
|
2942
2942
|
tags: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
2943
2943
|
status: z.ZodEnum<{
|
|
2944
2944
|
error: "error";
|
|
2945
|
-
pass: "pass";
|
|
2946
|
-
fail: "fail";
|
|
2947
2945
|
running: "running";
|
|
2948
2946
|
cancelled: "cancelled";
|
|
2947
|
+
pass: "pass";
|
|
2948
|
+
fail: "fail";
|
|
2949
2949
|
pending: "pending";
|
|
2950
2950
|
}>;
|
|
2951
2951
|
input: z.ZodUnknown;
|
|
@@ -2960,8 +2960,8 @@ declare const caseDetailSchema$1: z.ZodObject<{
|
|
|
2960
2960
|
status: z.ZodEnum<{
|
|
2961
2961
|
error: "error";
|
|
2962
2962
|
running: "running";
|
|
2963
|
-
cancelled: "cancelled";
|
|
2964
2963
|
ok: "ok";
|
|
2964
|
+
cancelled: "cancelled";
|
|
2965
2965
|
}>;
|
|
2966
2966
|
attributes: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
2967
2967
|
error: z.ZodOptional<z.ZodObject<{
|
|
@@ -3011,9 +3011,9 @@ declare const caseDetailSchema$1: z.ZodObject<{
|
|
|
3011
3011
|
subtree: "subtree";
|
|
3012
3012
|
}>>;
|
|
3013
3013
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
3014
|
-
sum: "sum";
|
|
3015
3014
|
all: "all";
|
|
3016
3015
|
last: "last";
|
|
3016
|
+
sum: "sum";
|
|
3017
3017
|
}>>;
|
|
3018
3018
|
}, z.core.$strip>>>;
|
|
3019
3019
|
}, z.core.$strip>;
|
|
@@ -3029,8 +3029,8 @@ declare const caseDetailSchema$1: z.ZodObject<{
|
|
|
3029
3029
|
status: z.ZodEnum<{
|
|
3030
3030
|
error: "error";
|
|
3031
3031
|
running: "running";
|
|
3032
|
-
cancelled: "cancelled";
|
|
3033
3032
|
ok: "ok";
|
|
3033
|
+
cancelled: "cancelled";
|
|
3034
3034
|
}>;
|
|
3035
3035
|
attributes: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
3036
3036
|
error: z.ZodOptional<z.ZodObject<{
|
|
@@ -3080,9 +3080,9 @@ declare const caseDetailSchema$1: z.ZodObject<{
|
|
|
3080
3080
|
subtree: "subtree";
|
|
3081
3081
|
}>>;
|
|
3082
3082
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
3083
|
-
sum: "sum";
|
|
3084
3083
|
all: "all";
|
|
3085
3084
|
last: "last";
|
|
3085
|
+
sum: "sum";
|
|
3086
3086
|
}>>;
|
|
3087
3087
|
}, z.core.$strip>>>;
|
|
3088
3088
|
}, z.core.$strip>;
|
|
@@ -3269,10 +3269,10 @@ declare const evalChartBuiltinMetricSchema: z.ZodEnum<{
|
|
|
3269
3269
|
type EvalChartBuiltinMetric = z.infer<typeof evalChartBuiltinMetricSchema>;
|
|
3270
3270
|
/** Reducer applied to a numeric column across all cases of a single run. */
|
|
3271
3271
|
declare const evalChartAggregateSchema: z.ZodEnum<{
|
|
3272
|
+
sum: "sum";
|
|
3272
3273
|
avg: "avg";
|
|
3273
3274
|
min: "min";
|
|
3274
3275
|
max: "max";
|
|
3275
|
-
sum: "sum";
|
|
3276
3276
|
latest: "latest";
|
|
3277
3277
|
passThresholdRate: "passThresholdRate";
|
|
3278
3278
|
}>;
|
|
@@ -3285,9 +3285,9 @@ type EvalChartAggregate = z.infer<typeof evalChartAggregateSchema>;
|
|
|
3285
3285
|
declare const evalChartColorSchema: z.ZodEnum<{
|
|
3286
3286
|
success: "success";
|
|
3287
3287
|
error: "error";
|
|
3288
|
+
warning: "warning";
|
|
3288
3289
|
accent: "accent";
|
|
3289
3290
|
accentDim: "accentDim";
|
|
3290
|
-
warning: "warning";
|
|
3291
3291
|
textMuted: "textMuted";
|
|
3292
3292
|
}>;
|
|
3293
3293
|
/** Semantic color token resolved to a theme color by the web UI. */
|
|
@@ -3314,9 +3314,9 @@ declare const evalChartMetricSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
3314
3314
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3315
3315
|
success: "success";
|
|
3316
3316
|
error: "error";
|
|
3317
|
+
warning: "warning";
|
|
3317
3318
|
accent: "accent";
|
|
3318
3319
|
accentDim: "accentDim";
|
|
3319
|
-
warning: "warning";
|
|
3320
3320
|
textMuted: "textMuted";
|
|
3321
3321
|
}>>;
|
|
3322
3322
|
axis: z.ZodOptional<z.ZodEnum<{
|
|
@@ -3327,10 +3327,10 @@ declare const evalChartMetricSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
3327
3327
|
source: z.ZodLiteral<"column">;
|
|
3328
3328
|
key: z.ZodString;
|
|
3329
3329
|
aggregate: z.ZodEnum<{
|
|
3330
|
+
sum: "sum";
|
|
3330
3331
|
avg: "avg";
|
|
3331
3332
|
min: "min";
|
|
3332
3333
|
max: "max";
|
|
3333
|
-
sum: "sum";
|
|
3334
3334
|
latest: "latest";
|
|
3335
3335
|
passThresholdRate: "passThresholdRate";
|
|
3336
3336
|
}>;
|
|
@@ -3338,9 +3338,9 @@ declare const evalChartMetricSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
|
3338
3338
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3339
3339
|
success: "success";
|
|
3340
3340
|
error: "error";
|
|
3341
|
+
warning: "warning";
|
|
3341
3342
|
accent: "accent";
|
|
3342
3343
|
accentDim: "accentDim";
|
|
3343
|
-
warning: "warning";
|
|
3344
3344
|
textMuted: "textMuted";
|
|
3345
3345
|
}>>;
|
|
3346
3346
|
axis: z.ZodOptional<z.ZodEnum<{
|
|
@@ -3362,10 +3362,10 @@ declare const evalChartTooltipExtraSchema: z.ZodDiscriminatedUnion<[z.ZodObject<
|
|
|
3362
3362
|
source: z.ZodLiteral<"column">;
|
|
3363
3363
|
key: z.ZodString;
|
|
3364
3364
|
aggregate: z.ZodEnum<{
|
|
3365
|
+
sum: "sum";
|
|
3365
3366
|
avg: "avg";
|
|
3366
3367
|
min: "min";
|
|
3367
3368
|
max: "max";
|
|
3368
|
-
sum: "sum";
|
|
3369
3369
|
latest: "latest";
|
|
3370
3370
|
passThresholdRate: "passThresholdRate";
|
|
3371
3371
|
}>;
|
|
@@ -3397,9 +3397,9 @@ declare const evalChartConfigSchema: z.ZodObject<{
|
|
|
3397
3397
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3398
3398
|
success: "success";
|
|
3399
3399
|
error: "error";
|
|
3400
|
+
warning: "warning";
|
|
3400
3401
|
accent: "accent";
|
|
3401
3402
|
accentDim: "accentDim";
|
|
3402
|
-
warning: "warning";
|
|
3403
3403
|
textMuted: "textMuted";
|
|
3404
3404
|
}>>;
|
|
3405
3405
|
axis: z.ZodOptional<z.ZodEnum<{
|
|
@@ -3410,10 +3410,10 @@ declare const evalChartConfigSchema: z.ZodObject<{
|
|
|
3410
3410
|
source: z.ZodLiteral<"column">;
|
|
3411
3411
|
key: z.ZodString;
|
|
3412
3412
|
aggregate: z.ZodEnum<{
|
|
3413
|
+
sum: "sum";
|
|
3413
3414
|
avg: "avg";
|
|
3414
3415
|
min: "min";
|
|
3415
3416
|
max: "max";
|
|
3416
|
-
sum: "sum";
|
|
3417
3417
|
latest: "latest";
|
|
3418
3418
|
passThresholdRate: "passThresholdRate";
|
|
3419
3419
|
}>;
|
|
@@ -3421,9 +3421,9 @@ declare const evalChartConfigSchema: z.ZodObject<{
|
|
|
3421
3421
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3422
3422
|
success: "success";
|
|
3423
3423
|
error: "error";
|
|
3424
|
+
warning: "warning";
|
|
3424
3425
|
accent: "accent";
|
|
3425
3426
|
accentDim: "accentDim";
|
|
3426
|
-
warning: "warning";
|
|
3427
3427
|
textMuted: "textMuted";
|
|
3428
3428
|
}>>;
|
|
3429
3429
|
axis: z.ZodOptional<z.ZodEnum<{
|
|
@@ -3452,10 +3452,10 @@ declare const evalChartConfigSchema: z.ZodObject<{
|
|
|
3452
3452
|
source: z.ZodLiteral<"column">;
|
|
3453
3453
|
key: z.ZodString;
|
|
3454
3454
|
aggregate: z.ZodEnum<{
|
|
3455
|
+
sum: "sum";
|
|
3455
3456
|
avg: "avg";
|
|
3456
3457
|
min: "min";
|
|
3457
3458
|
max: "max";
|
|
3458
|
-
sum: "sum";
|
|
3459
3459
|
latest: "latest";
|
|
3460
3460
|
passThresholdRate: "passThresholdRate";
|
|
3461
3461
|
}>;
|
|
@@ -3487,9 +3487,9 @@ declare const evalChartsConfigSchema: z.ZodArray<z.ZodObject<{
|
|
|
3487
3487
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3488
3488
|
success: "success";
|
|
3489
3489
|
error: "error";
|
|
3490
|
+
warning: "warning";
|
|
3490
3491
|
accent: "accent";
|
|
3491
3492
|
accentDim: "accentDim";
|
|
3492
|
-
warning: "warning";
|
|
3493
3493
|
textMuted: "textMuted";
|
|
3494
3494
|
}>>;
|
|
3495
3495
|
axis: z.ZodOptional<z.ZodEnum<{
|
|
@@ -3500,10 +3500,10 @@ declare const evalChartsConfigSchema: z.ZodArray<z.ZodObject<{
|
|
|
3500
3500
|
source: z.ZodLiteral<"column">;
|
|
3501
3501
|
key: z.ZodString;
|
|
3502
3502
|
aggregate: z.ZodEnum<{
|
|
3503
|
+
sum: "sum";
|
|
3503
3504
|
avg: "avg";
|
|
3504
3505
|
min: "min";
|
|
3505
3506
|
max: "max";
|
|
3506
|
-
sum: "sum";
|
|
3507
3507
|
latest: "latest";
|
|
3508
3508
|
passThresholdRate: "passThresholdRate";
|
|
3509
3509
|
}>;
|
|
@@ -3511,9 +3511,9 @@ declare const evalChartsConfigSchema: z.ZodArray<z.ZodObject<{
|
|
|
3511
3511
|
color: z.ZodOptional<z.ZodEnum<{
|
|
3512
3512
|
success: "success";
|
|
3513
3513
|
error: "error";
|
|
3514
|
+
warning: "warning";
|
|
3514
3515
|
accent: "accent";
|
|
3515
3516
|
accentDim: "accentDim";
|
|
3516
|
-
warning: "warning";
|
|
3517
3517
|
textMuted: "textMuted";
|
|
3518
3518
|
}>>;
|
|
3519
3519
|
axis: z.ZodOptional<z.ZodEnum<{
|
|
@@ -3542,10 +3542,10 @@ declare const evalChartsConfigSchema: z.ZodArray<z.ZodObject<{
|
|
|
3542
3542
|
source: z.ZodLiteral<"column">;
|
|
3543
3543
|
key: z.ZodString;
|
|
3544
3544
|
aggregate: z.ZodEnum<{
|
|
3545
|
+
sum: "sum";
|
|
3545
3546
|
avg: "avg";
|
|
3546
3547
|
min: "min";
|
|
3547
3548
|
max: "max";
|
|
3548
|
-
sum: "sum";
|
|
3549
3549
|
latest: "latest";
|
|
3550
3550
|
passThresholdRate: "passThresholdRate";
|
|
3551
3551
|
}>;
|
|
@@ -3573,8 +3573,8 @@ declare const runManifestSchema$1: z.ZodObject<{
|
|
|
3573
3573
|
evalSourceFingerprints: z.ZodDefault<z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>>;
|
|
3574
3574
|
target: z.ZodObject<{
|
|
3575
3575
|
mode: z.ZodEnum<{
|
|
3576
|
-
caseIds: "caseIds";
|
|
3577
3576
|
all: "all";
|
|
3577
|
+
caseIds: "caseIds";
|
|
3578
3578
|
evalIds: "evalIds";
|
|
3579
3579
|
}>;
|
|
3580
3580
|
evalKeys: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
@@ -4729,8 +4729,8 @@ declare const cacheRecordingSchema: z.ZodObject<{
|
|
|
4729
4729
|
finalStatus: z.ZodOptional<z.ZodEnum<{
|
|
4730
4730
|
error: "error";
|
|
4731
4731
|
running: "running";
|
|
4732
|
-
cancelled: "cancelled";
|
|
4733
4732
|
ok: "ok";
|
|
4733
|
+
cancelled: "cancelled";
|
|
4734
4734
|
}>>;
|
|
4735
4735
|
finalError: z.ZodOptional<z.ZodObject<{
|
|
4736
4736
|
name: z.ZodOptional<z.ZodString>;
|
|
@@ -4830,8 +4830,8 @@ declare const cacheEntrySchema: z.ZodObject<{
|
|
|
4830
4830
|
finalStatus: z.ZodOptional<z.ZodEnum<{
|
|
4831
4831
|
error: "error";
|
|
4832
4832
|
running: "running";
|
|
4833
|
-
cancelled: "cancelled";
|
|
4834
4833
|
ok: "ok";
|
|
4834
|
+
cancelled: "cancelled";
|
|
4835
4835
|
}>>;
|
|
4836
4836
|
finalError: z.ZodOptional<z.ZodObject<{
|
|
4837
4837
|
name: z.ZodOptional<z.ZodString>;
|
|
@@ -4948,8 +4948,8 @@ declare const cacheDebugKeyEntrySchema: z.ZodObject<{
|
|
|
4948
4948
|
finalStatus: z.ZodOptional<z.ZodEnum<{
|
|
4949
4949
|
error: "error";
|
|
4950
4950
|
running: "running";
|
|
4951
|
-
cancelled: "cancelled";
|
|
4952
4951
|
ok: "ok";
|
|
4952
|
+
cancelled: "cancelled";
|
|
4953
4953
|
}>>;
|
|
4954
4954
|
finalError: z.ZodOptional<z.ZodObject<{
|
|
4955
4955
|
name: z.ZodOptional<z.ZodString>;
|
|
@@ -5055,8 +5055,8 @@ declare const cacheEntryWithDebugKeySchema$1: z.ZodObject<{
|
|
|
5055
5055
|
finalStatus: z.ZodOptional<z.ZodEnum<{
|
|
5056
5056
|
error: "error";
|
|
5057
5057
|
running: "running";
|
|
5058
|
-
cancelled: "cancelled";
|
|
5059
5058
|
ok: "ok";
|
|
5059
|
+
cancelled: "cancelled";
|
|
5060
5060
|
}>>;
|
|
5061
5061
|
finalError: z.ZodOptional<z.ZodObject<{
|
|
5062
5062
|
name: z.ZodOptional<z.ZodString>;
|
|
@@ -5164,8 +5164,8 @@ declare const cacheEntryWithDebugKeySchema$1: z.ZodObject<{
|
|
|
5164
5164
|
finalStatus: z.ZodOptional<z.ZodEnum<{
|
|
5165
5165
|
error: "error";
|
|
5166
5166
|
running: "running";
|
|
5167
|
-
cancelled: "cancelled";
|
|
5168
5167
|
ok: "ok";
|
|
5168
|
+
cancelled: "cancelled";
|
|
5169
5169
|
}>>;
|
|
5170
5170
|
finalError: z.ZodOptional<z.ZodObject<{
|
|
5171
5171
|
name: z.ZodOptional<z.ZodString>;
|
|
@@ -5271,8 +5271,8 @@ declare const cacheFileSchema: z.ZodObject<{
|
|
|
5271
5271
|
finalStatus: z.ZodOptional<z.ZodEnum<{
|
|
5272
5272
|
error: "error";
|
|
5273
5273
|
running: "running";
|
|
5274
|
-
cancelled: "cancelled";
|
|
5275
5274
|
ok: "ok";
|
|
5275
|
+
cancelled: "cancelled";
|
|
5276
5276
|
}>>;
|
|
5277
5277
|
finalError: z.ZodOptional<z.ZodObject<{
|
|
5278
5278
|
name: z.ZodOptional<z.ZodString>;
|
|
@@ -5388,8 +5388,8 @@ declare const cacheDebugKeyFileSchema: z.ZodObject<{
|
|
|
5388
5388
|
finalStatus: z.ZodOptional<z.ZodEnum<{
|
|
5389
5389
|
error: "error";
|
|
5390
5390
|
running: "running";
|
|
5391
|
-
cancelled: "cancelled";
|
|
5392
5391
|
ok: "ok";
|
|
5392
|
+
cancelled: "cancelled";
|
|
5393
5393
|
}>>;
|
|
5394
5394
|
finalError: z.ZodOptional<z.ZodObject<{
|
|
5395
5395
|
name: z.ZodOptional<z.ZodString>;
|
|
@@ -5573,8 +5573,8 @@ type ConfigReloadState = z.infer<typeof configReloadStateSchema$1>;
|
|
|
5573
5573
|
declare const createRunRequestSchema$1: z.ZodObject<{
|
|
5574
5574
|
target: z.ZodObject<{
|
|
5575
5575
|
mode: z.ZodEnum<{
|
|
5576
|
-
caseIds: "caseIds";
|
|
5577
5576
|
all: "all";
|
|
5577
|
+
caseIds: "caseIds";
|
|
5578
5578
|
evalIds: "evalIds";
|
|
5579
5579
|
}>;
|
|
5580
5580
|
evalKeys: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
package/package.json
CHANGED
package/skills/agent-eval/SKILL.md
CHANGED
|
@@ -212,11 +212,11 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape (forma
|
|
|
212
212
|
- `tracingAssertions` is a single function that can be authored globally or locally on one eval when a finished-trace invariant should pass or fail the case without creating a fake score column. It receives the same `{ trace, input, case }` context as `deriveFromTracing`; call `evalAssert(...)` or `evalExpect(...)` inside it. Useful trace helpers include `trace.findSpan(name)`, `trace.findSpans(name)`, `trace.hasSpan(name)`, `trace.findSpansByKind(kind)`, `trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`, `trace.hasToolCallSpan(name)`, `trace.getToolCallSpans(name)`, `trace.getToolCallSpanCount(toolName)`, `trace.hasToolCallSpanCount(toolName, expectedCalls)`, `trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and `trace.flattenDfs()`. The tool-call helpers include both `kind: 'tool'` spans and imported execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts match the span `name` as well as GenAI/Mastra identity attributes such as `genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer those tool identity attributes when present. `getToolCallSpans(name)` returns one normalized object per matching call, including parsed `arguments`, parsed `result`, `description`, `toolType`, `attributes`, and the original `span`.
|
|
213
213
|
- `traceDisplay` promotes selected span attributes into the trace tree and detail pane; it supports aggregation across subtrees (`scope`, `mode`) and user-defined `transform(...)` for derived views (e.g. currency conversion). See the `TraceDisplayInputConfig` type.
|
|
214
214
|
- `llmCalls` (in `agent-evals.config.ts`) configures how LLM-call spans are summarized for review. Defaults to `kind: 'llm'` spans with `model`, `usage.*`, `latencyMs`, `input`, `output`, etc. read from conventional attribute paths. The default `steps` path reads an array from `span.attributes.steps`; if it is missing, direct child `model_step` spans are shown as that call's steps. Tool calls are aggregated from the configured `toolCalls` path plus step-level `toolCalls` on authored step arrays or direct `model_step` child spans, including Mastra's serialized `mastra.model_step.output` format, and child `tool_call` execution spans under each model step. `latencyMs` is time to first token; duration, total tokens, output tokens/sec, and USD costs are derived. Override `kinds` to broaden the filter, override `attributes.<field>` for non-default primitive span shapes, configure model-keyed `pricing` to derive USD costs from token counts, with nested `providers` entries for provider-specific rates, add `costCurrencies` to show converted cost columns in the expanded breakdown table only, add `derivedAttributes` to persist computed values back onto matching LLM spans before trace consumers run, and add entries to `metrics` to surface arbitrary user metrics (`format: 'string' | 'number' | 'duration' | 'json' | 'boolean'`, `placements: ['header' | 'body']`). `derivedAttributes` can be a keyed map for one-off fields or one callback that returns multiple path/value pairs. Derived keys are dot-paths under `span.attributes`; return `undefined` to skip one span or one returned key.
|
|
215
|
-
- Default usage config derives missing eval outputs from matching LLM/API spans before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`, `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`, `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored outputs and column overrides win. Default usage columns, stats, and charts use `hideIfNoValue: true`. Default LLM usage charts configure cost, input tokens, and output tokens separately and use `dedupeConsecutiveValues: true` to skip repeated adjacent chart values. `totalTokens` is input + output only; cache read/write tokens stay separate and affect `costUsd` at their own rates. `llmTurns` is the maximum per-call turn count in the case run, using configured steps when available and otherwise one turn per matched LLM call span. Derived base input cost uses `inputTokens - cachedInputTokens - cacheCreationInputTokens` so cache details are not double-counted. `cacheCreationInputTokens` is the total cache-write count; optional `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched LLM span durations; it is not time-to-first-token latency. Remove defaults globally or per eval with `removeDefaultConfig: true` or a key list such as `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
|
|
215
|
+
- Default usage config derives missing eval outputs from matching LLM/API spans before `outputsSchema` and scores run: `apiCalls`, `costUsd`, `llmTurns`, `inputTokens`, `outputTokens`, `totalTokens`, `cachedInputTokens`, `cacheCreationInputTokens`, `reasoningTokens`, and `llmDurationMs`. Authored outputs and column overrides win. The web UI fills in baseline run-health stats (`cases`, `passRate`, `duration`) and a pass-rate/duration history chart when an eval has not already authored equivalent run-health UI. If discovery metadata is missing but saved runs contain runtime columns such as `costUsd`, `inputTokens`, or `apiCalls`, the single-eval page can infer the standard usage stats and charts from those saved run values. Default usage columns, stats, and charts use `hideIfNoValue: true`. Default LLM usage charts configure cost, input tokens, and output tokens separately and use `dedupeConsecutiveValues: true` to skip repeated adjacent chart values. `totalTokens` is input + output only; cache read/write tokens stay separate and affect `costUsd` at their own rates. `llmTurns` is the maximum per-call turn count in the case run, using configured steps when available and otherwise one turn per matched LLM call span. Derived base input cost uses `inputTokens - cachedInputTokens - cacheCreationInputTokens` so cache details are not double-counted. `cacheCreationInputTokens` is the total cache-write count; optional `cacheCreationInput1hTokens` only splits that total for 1-hour write pricing via `cacheCreationInput1hUsdPerMillion`. `llmDurationMs` sums elapsed matched LLM span durations; it is not time-to-first-token latency. Remove defaults globally or per eval with `removeDefaultConfig: true` or a key list such as `removeDefaultConfig: ['apiCalls', 'reasoningTokens']`.
|
|
216
216
|
- `apiCalls` (in `agent-evals.config.ts`) configures how API-call spans are summarized for review. Defaults to `kind: 'api'`, `'http'`, `'http.client'`, and `'fetch'` spans with `method`, `url`, `statusCode`, `request`, `routeAlias`, `response`, `requestBody`, `responseBody`, `headers`, `durationMs`, and `error` read from conventional attribute paths. Override `kinds` or `attributes.<field>` for external tracers. Set a per-span `routeAlias` attribute such as `/v3/tabs/:id` to group dynamic URL paths in API-call route labels and endpoint charts while preserving original URLs in row details. Add `derivedAttributes` as a keyed map or object-returning callback for computed persisted API span attributes, and add `metrics` with the same formats and placements as LLM-call metrics.
|
|
217
217
|
- `runLogs` (in `agent-evals.config.ts`) controls case log capture. Use `runLogs: { captureConsole: false }` to keep console output in the terminal without persisting console calls to case details. Manual `evalLog(...)` calls are still captured. Captured log locations store the selected user-facing source frame and the full JavaScript stack so agents can inspect additional frames in persisted artifacts when diagnosing where a log came from.
|
|
218
218
|
|
|
219
|
-
Stats rows and history charts can be authored via `stats` / `charts` on the eval definition. Global `stats` in `agent-evals.config.ts` combine with eval-level stats. Native stat kinds include `cases`, `passRate`, `duration`, and `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit stats use a separate aggregate control and default to `sum`; `avg` is average per-case hit rate, and min/max/best/worst select cases by hit rate. `duration` aggregates per-case durations using the same modes as column stats. Usage stats and LLM usage charts are added by default unless removed with `removeDefaultConfig`. Column stats can override `format` and `numberFormat`, otherwise they inherit from the matching column. Duration and column stat aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value), and `worst` (lowest finite value). Use `defaultStatAggregate` in `agent-evals.config.ts` to set the workspace-wide initial duration/column stat mode, or on an eval definition to override it for that eval. Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats and charts support `hideIfNoValue: true`. Charts support `dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics and tooltip extras match the previous kept point. Their shapes live in the types; no need to memorize the option set.
|
|
219
|
+
Stats rows and history charts can be authored via `stats` / `charts` on the eval definition. Global `stats` in `agent-evals.config.ts` combine with eval-level stats. The web UI automatically supplies missing `cases`, `passRate`, and `duration` stats plus a pass-rate/duration history chart, including for a single completed run. Native stat kinds include `cases`, `passRate`, `duration`, and `cacheHits`; `cacheHits` shows Agent Eval operation-level cache hits over total cache operations (`hits/total`) from spans and `evalTracer.cache(...)` refs, not LLM provider prompt-cache read tokens such as `cachedInputTokens`. Cache-hit stats use a separate aggregate control and default to `sum`; `avg` is average per-case hit rate, and min/max/best/worst select cases by hit rate. `duration` aggregates per-case durations using the same modes as column stats. Usage stats and LLM usage charts are added by default unless removed with `removeDefaultConfig`. Column stats can override `format` and `numberFormat`, otherwise they inherit from the matching column. Duration and column stat aggregates support `avg`, `min`, `max`, `sum`, `best` (highest finite value), and `worst` (lowest finite value). Use `defaultStatAggregate` in `agent-evals.config.ts` to set the workspace-wide initial duration/column stat mode, or on an eval definition to override it for that eval. Number formats use `maxDecimalPlaces` to cap decimals and `minDecimalPlaces` to pad trailing zeroes. Without `maxDecimalPlaces`, the default cap is 3 decimal places. Stats and charts support `hideIfNoValue: true`. Charts support `dedupeConsecutiveValues: true` to omit consecutive points whose plotted metrics and tooltip extras match the previous kept point. Rendered charts with no plottable values show an unavailable state instead of a blank frame. Their shapes live in the types; no need to memorize the option set.
|
|
220
220
|
|
|
221
221
|
## Cached operations
|
|
222
222
|
|