@ls-stack/agent-eval 0.58.0 → 0.58.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-L9GdY28I.mjs → app-BxD6aHbp.mjs} +52 -7
- package/dist/apps/web/dist/assets/index-BMWBZw_u.js +377 -0
- package/dist/apps/web/dist/assets/index-CHH7m5Cv.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +2 -1
- package/dist/{cli-Cf37PZKi.mjs → cli-HBwXIJsg.mjs} +31 -5
- package/dist/index.d.mts +136 -80
- package/dist/index.mjs +4 -4
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-C4kAOhC1.mjs → runExecution-pHJ0_TzH.mjs} +188 -89
- package/dist/{runOrchestration-5xEiQxiS.mjs → runOrchestration-ngVXShH4.mjs} +73 -6
- package/dist/{runner-JIykMlve.mjs → runner-BnZMGBla.mjs} +1 -1
- package/dist/{runner-bjd_UB9i.mjs → runner-D_pz2NON.mjs} +2 -2
- package/dist/{src-303BocMW.mjs → src-AeXGBJ26.mjs} +2 -2
- package/package.json +1 -1
- package/skills/agent-eval/SKILL.md +18 -3
- package/dist/apps/web/dist/assets/index-Cz9p4l-t.js +0 -377
- package/dist/apps/web/dist/assets/index-DtARRwsS.css +0 -1
package/dist/index.d.mts
CHANGED
|
@@ -268,6 +268,7 @@ declare const runLogEntrySchema$1: z$1.ZodObject<{
|
|
|
268
268
|
phase: z$1.ZodEnum<{
|
|
269
269
|
eval: "eval";
|
|
270
270
|
derive: "derive";
|
|
271
|
+
tracingAssertions: "tracingAssertions";
|
|
271
272
|
outputsSchema: "outputsSchema";
|
|
272
273
|
scorer: "scorer";
|
|
273
274
|
}>;
|
|
@@ -399,10 +400,17 @@ type EvalCase$1$1<TInput = unknown> = {
|
|
|
399
400
|
};
|
|
400
401
|
/** Query helpers built from the flattened trace recorded for one eval case. */
|
|
401
402
|
type EvalTraceTree = {
|
|
402
|
-
spans: EvalTraceSpan$2[];
|
|
403
|
-
rootSpans: EvalTraceSpan$2[];
|
|
404
|
-
findSpan: (name: string) => EvalTraceSpan$2 | undefined;
|
|
405
|
-
|
|
403
|
+
/** Flat span list in creation order. */spans: EvalTraceSpan$2[]; /** Top-level spans whose `parentId` is `null`. */
|
|
404
|
+
rootSpans: EvalTraceSpan$2[]; /** Return the first span whose name exactly matches `name`. */
|
|
405
|
+
findSpan: (name: string) => EvalTraceSpan$2 | undefined; /** Return every span whose name exactly matches `name`. */
|
|
406
|
+
findSpans: (name: string) => EvalTraceSpan$2[]; /** Return whether any span name exactly matches `name`. */
|
|
407
|
+
hasSpan: (name: string) => boolean; /** Return every span whose kind exactly matches `kind`. */
|
|
408
|
+
findSpansByKind: (kind: string) => EvalTraceSpan$2[]; /** Return every span with `kind: 'tool'`. */
|
|
409
|
+
findToolCallSpans: () => EvalTraceSpan$2[]; /** Return the names of every span with `kind: 'tool'`. */
|
|
410
|
+
listToolCallSpanNames: () => string[]; /** Return whether a `kind: 'tool'` span has a name exactly matching `name`. */
|
|
411
|
+
hasToolCallSpan: (name: string) => boolean; /** Return span names in creation order, optionally filtered by kind. */
|
|
412
|
+
listSpanNames: (kind?: string) => string[]; /** Return span names in depth-first tree order, optionally filtered by kind. */
|
|
413
|
+
listSpanNamesDfs: (kind?: string) => string[]; /** Return all spans in depth-first tree order. */
|
|
406
414
|
flattenDfs: () => EvalTraceSpan$2[];
|
|
407
415
|
checkpoints: Map<string, unknown>;
|
|
408
416
|
};
|
|
@@ -421,6 +429,12 @@ type EvalDeriveMap<TInput = unknown> = Record<string, EvalDeriveValueFn<TInput>>
|
|
|
421
429
|
type EvalDeriveFn<TInput = unknown> = (ctx: EvalDeriveContext<TInput>) => Record<string, unknown> | Promise<Record<string, unknown>>;
|
|
422
430
|
/** Trace-derived output config accepted globally and on eval definitions. */
|
|
423
431
|
type EvalDeriveConfig<TInput = unknown> = EvalDeriveMap<TInput> | EvalDeriveFn<TInput>;
|
|
432
|
+
/** Function that records trace-derived assertions for one case. */
|
|
433
|
+
type EvalTracingAssertionsFn<TInput = unknown> = (ctx: EvalDeriveContext<TInput>) => MaybePromise$1<void>;
|
|
434
|
+
/** Keyed trace-derived assertion config for grouping related checks. */
|
|
435
|
+
type EvalTracingAssertionsMap<TInput = unknown> = Record<string, EvalTracingAssertionsFn<TInput>>;
|
|
436
|
+
/** Trace-derived assertion config accepted globally and on eval definitions. */
|
|
437
|
+
type EvalTracingAssertionsConfig<TInput = unknown> = EvalTracingAssertionsMap<TInput> | EvalTracingAssertionsFn<TInput>;
|
|
424
438
|
/** UI overrides for a derived or scored column emitted by an eval. */
|
|
425
439
|
type EvalColumnOverride = {
|
|
426
440
|
/** Display label shown for the column in tables and detail views. */label?: string;
|
|
@@ -1142,9 +1156,18 @@ type EvalDefinitionBase<TInput = unknown, TOutputs extends EvalOutputs = EvalOut
|
|
|
1142
1156
|
*
|
|
1143
1157
|
* Prefer the keyed map form when each key has one derivation. The
|
|
1144
1158
|
* object-returning callback form is also supported. Derived values only fill
|
|
1145
|
-
* keys not already recorded during execution.
|
|
1159
|
+
* keys not already recorded during execution. Assertion helpers are not
|
|
1160
|
+
* allowed here; use `tracingAssertions` for trace-derived pass/fail checks.
|
|
1146
1161
|
*/
|
|
1147
1162
|
deriveFromTracing?: EvalDeriveConfig<TInput>;
|
|
1163
|
+
/**
|
|
1164
|
+
* Record assertions from the finished execution trace.
|
|
1165
|
+
*
|
|
1166
|
+
* Runs after `deriveFromTracing` and before output schema validation and
|
|
1167
|
+
* scores. Use `evalAssert(...)` or `evalExpect(...)` inside the callback to
|
|
1168
|
+
* write normal assertion results without creating score columns.
|
|
1169
|
+
*/
|
|
1170
|
+
tracingAssertions?: EvalTracingAssertionsConfig<TInput>;
|
|
1148
1171
|
/**
|
|
1149
1172
|
* Computed score columns for each case.
|
|
1150
1173
|
*
|
|
@@ -1455,7 +1478,9 @@ type CacheScopeContext = {
|
|
|
1455
1478
|
/** Active recording frame captured while a cached operation body executes. */
|
|
1456
1479
|
type CacheRecordingFrame = {
|
|
1457
1480
|
/** Length of `scope.spans` immediately before the cached body started. */baseSpanIndex: number; /** Parent id used when recording and replaying direct child spans. */
|
|
1458
|
-
replayParentSpanId: string | null; /**
|
|
1481
|
+
replayParentSpanId: string | null; /** Spans created by this cache body's async execution branch. */
|
|
1482
|
+
spanIds: Set<string>; /** Non-cache attributes written to the replay parent by this async branch. */
|
|
1483
|
+
finalAttributes: Record<string, unknown>; /** Ordered observable effects recorded during the cached body. */
|
|
1459
1484
|
ops: CacheRecordingOp$1[];
|
|
1460
1485
|
};
|
|
1461
1486
|
/** Mutable per-case runtime state stored in async local storage. */
|
|
@@ -1480,11 +1505,6 @@ type EvalCaseScope = {
|
|
|
1480
1505
|
logs: RunLogEntry$1[];
|
|
1481
1506
|
spans: EvalTraceSpan$2[];
|
|
1482
1507
|
checkpoints: Map<string, unknown>;
|
|
1483
|
-
/**
|
|
1484
|
-
* Stack of active cache recorders. Ops are written to the top-most frame
|
|
1485
|
-
* when it exists and `replayingDepth === 0`.
|
|
1486
|
-
*/
|
|
1487
|
-
recordingStack: CacheRecordingFrame[];
|
|
1488
1508
|
/**
|
|
1489
1509
|
* Incremented while replaying a cached operation, so nested SDK calls do not
|
|
1490
1510
|
* accidentally double-record ops into outer recorders.
|
|
@@ -1506,12 +1526,16 @@ type EvalCaseScope = {
|
|
|
1506
1526
|
* covers run-time module/environment loading, including top-level code in
|
|
1507
1527
|
* modules imported while a run is being prepared.
|
|
1508
1528
|
*/
|
|
1509
|
-
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'outputsSchema' | 'scorer';
|
|
1529
|
+
type EvalRuntimeScope = 'env' | 'cases' | 'eval' | 'derive' | 'tracingAssertions' | 'outputsSchema' | 'scorer';
|
|
1510
1530
|
type EvalLogLevelInput = RunLogLevel$1 | 'warning';
|
|
1511
1531
|
/** Error thrown when an eval assertion fails during case execution. */
|
|
1512
1532
|
declare class EvalAssertionError extends Error {
|
|
1513
1533
|
constructor(message: string);
|
|
1514
1534
|
}
|
|
1535
|
+
/** Error thrown when an SDK helper is used in an unsupported runner phase. */
|
|
1536
|
+
declare class EvalRuntimeUsageError extends Error {
|
|
1537
|
+
constructor(message: string);
|
|
1538
|
+
}
|
|
1515
1539
|
/** Return the host process clock, bypassing the eval Date shim. */
|
|
1516
1540
|
/**
|
|
1517
1541
|
* Eval time helpers for reading and moving the active eval clock.
|
|
@@ -1542,8 +1566,10 @@ declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
|
1542
1566
|
*
|
|
1543
1567
|
* Returns `null` outside eval-owned work, `env` while the runner is loading
|
|
1544
1568
|
* eval modules for a run, `cases` while generating cases, `eval` while running
|
|
1545
|
-
* case `execute`, `derive` while deriving outputs from traces,
|
|
1546
|
-
*
|
|
1569
|
+
* case `execute`, `derive` while deriving outputs from traces,
|
|
1570
|
+
* `tracingAssertions` while checking trace-derived assertions,
|
|
1571
|
+
* `outputsSchema` while validating outputs, and `scorer` while computing
|
|
1572
|
+
* scores.
|
|
1547
1573
|
*/
|
|
1548
1574
|
declare function isInEvalScope(): EvalRuntimeScope | null;
|
|
1549
1575
|
/**
|
|
@@ -1659,7 +1685,8 @@ declare function incrementEvalOutput(key: string, delta: number): void;
|
|
|
1659
1685
|
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
1660
1686
|
* can safely reuse `evalAssert(...)` when it also runs outside an eval. The
|
|
1661
1687
|
* TypeScript assertion signature still narrows the checked value after the
|
|
1662
|
-
* call.
|
|
1688
|
+
* call. Calls inside `deriveFromTracing` throw because derivations must only
|
|
1689
|
+
* write outputs; use `tracingAssertions` for trace-derived pass/fail checks.
|
|
1663
1690
|
*/
|
|
1664
1691
|
declare function evalAssert(condition: unknown, message: string): asserts condition; //#endregion
|
|
1665
1692
|
//#region src/valueCache.d.ts
|
|
@@ -2017,8 +2044,8 @@ declare const traceAttributeDisplaySchema: z$1.ZodObject<{
|
|
|
2017
2044
|
subtree: "subtree";
|
|
2018
2045
|
}>>;
|
|
2019
2046
|
mode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2020
|
-
all: "all";
|
|
2021
2047
|
sum: "sum";
|
|
2048
|
+
all: "all";
|
|
2022
2049
|
last: "last";
|
|
2023
2050
|
}>>;
|
|
2024
2051
|
}, z$1.core.$strip>;
|
|
@@ -2053,8 +2080,8 @@ declare const traceDisplayConfigSchema: z$1.ZodObject<{
|
|
|
2053
2080
|
subtree: "subtree";
|
|
2054
2081
|
}>>;
|
|
2055
2082
|
mode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2056
|
-
all: "all";
|
|
2057
2083
|
sum: "sum";
|
|
2084
|
+
all: "all";
|
|
2058
2085
|
last: "last";
|
|
2059
2086
|
}>>;
|
|
2060
2087
|
}, z$1.core.$strip>>>;
|
|
@@ -2093,8 +2120,8 @@ declare const traceAttributeDisplayInputSchema: z$1.ZodObject<{
|
|
|
2093
2120
|
subtree: "subtree";
|
|
2094
2121
|
}>>;
|
|
2095
2122
|
mode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2096
|
-
all: "all";
|
|
2097
2123
|
sum: "sum";
|
|
2124
|
+
all: "all";
|
|
2098
2125
|
last: "last";
|
|
2099
2126
|
}>>;
|
|
2100
2127
|
transform: z$1.ZodOptional<z$1.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
|
|
@@ -2131,8 +2158,8 @@ declare const traceDisplayInputConfigSchema: z$1.ZodObject<{
|
|
|
2131
2158
|
subtree: "subtree";
|
|
2132
2159
|
}>>;
|
|
2133
2160
|
mode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2134
|
-
all: "all";
|
|
2135
2161
|
sum: "sum";
|
|
2162
|
+
all: "all";
|
|
2136
2163
|
last: "last";
|
|
2137
2164
|
}>>;
|
|
2138
2165
|
transform: z$1.ZodOptional<z$1.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
|
|
@@ -2217,9 +2244,9 @@ type EvalFreshnessStatus = z$1.infer<typeof evalFreshnessStatusSchema>;
|
|
|
2217
2244
|
*/
|
|
2218
2245
|
declare const evalStatAggregateSchema: z$1.ZodEnum<{
|
|
2219
2246
|
avg: "avg";
|
|
2220
|
-
sum: "sum";
|
|
2221
2247
|
min: "min";
|
|
2222
2248
|
max: "max";
|
|
2249
|
+
sum: "sum";
|
|
2223
2250
|
best: "best";
|
|
2224
2251
|
worst: "worst";
|
|
2225
2252
|
}>;
|
|
@@ -2249,9 +2276,9 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
2249
2276
|
kind: z$1.ZodLiteral<"duration">;
|
|
2250
2277
|
aggregate: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2251
2278
|
avg: "avg";
|
|
2252
|
-
sum: "sum";
|
|
2253
2279
|
min: "min";
|
|
2254
2280
|
max: "max";
|
|
2281
|
+
sum: "sum";
|
|
2255
2282
|
best: "best";
|
|
2256
2283
|
worst: "worst";
|
|
2257
2284
|
}>>;
|
|
@@ -2260,9 +2287,9 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
2260
2287
|
kind: z$1.ZodLiteral<"cacheHits">;
|
|
2261
2288
|
aggregate: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2262
2289
|
avg: "avg";
|
|
2263
|
-
sum: "sum";
|
|
2264
2290
|
min: "min";
|
|
2265
2291
|
max: "max";
|
|
2292
|
+
sum: "sum";
|
|
2266
2293
|
best: "best";
|
|
2267
2294
|
worst: "worst";
|
|
2268
2295
|
}>>;
|
|
@@ -2273,9 +2300,9 @@ declare const evalStatItemSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
2273
2300
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
2274
2301
|
aggregate: z$1.ZodEnum<{
|
|
2275
2302
|
avg: "avg";
|
|
2276
|
-
sum: "sum";
|
|
2277
2303
|
min: "min";
|
|
2278
2304
|
max: "max";
|
|
2305
|
+
sum: "sum";
|
|
2279
2306
|
best: "best";
|
|
2280
2307
|
worst: "worst";
|
|
2281
2308
|
}>;
|
|
@@ -2313,9 +2340,9 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
|
|
|
2313
2340
|
kind: z$1.ZodLiteral<"duration">;
|
|
2314
2341
|
aggregate: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2315
2342
|
avg: "avg";
|
|
2316
|
-
sum: "sum";
|
|
2317
2343
|
min: "min";
|
|
2318
2344
|
max: "max";
|
|
2345
|
+
sum: "sum";
|
|
2319
2346
|
best: "best";
|
|
2320
2347
|
worst: "worst";
|
|
2321
2348
|
}>>;
|
|
@@ -2324,9 +2351,9 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
|
|
|
2324
2351
|
kind: z$1.ZodLiteral<"cacheHits">;
|
|
2325
2352
|
aggregate: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2326
2353
|
avg: "avg";
|
|
2327
|
-
sum: "sum";
|
|
2328
2354
|
min: "min";
|
|
2329
2355
|
max: "max";
|
|
2356
|
+
sum: "sum";
|
|
2330
2357
|
best: "best";
|
|
2331
2358
|
worst: "worst";
|
|
2332
2359
|
}>>;
|
|
@@ -2337,9 +2364,9 @@ declare const evalStatsConfigSchema: z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1
|
|
|
2337
2364
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
2338
2365
|
aggregate: z$1.ZodEnum<{
|
|
2339
2366
|
avg: "avg";
|
|
2340
|
-
sum: "sum";
|
|
2341
2367
|
min: "min";
|
|
2342
2368
|
max: "max";
|
|
2369
|
+
sum: "sum";
|
|
2343
2370
|
best: "best";
|
|
2344
2371
|
worst: "worst";
|
|
2345
2372
|
}>;
|
|
@@ -2422,10 +2449,10 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2422
2449
|
caseIds: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2423
2450
|
lastRunStatus: z$1.ZodNullable<z$1.ZodEnum<{
|
|
2424
2451
|
error: "error";
|
|
2425
|
-
running: "running";
|
|
2426
|
-
cancelled: "cancelled";
|
|
2427
2452
|
pass: "pass";
|
|
2428
2453
|
fail: "fail";
|
|
2454
|
+
running: "running";
|
|
2455
|
+
cancelled: "cancelled";
|
|
2429
2456
|
unscored: "unscored";
|
|
2430
2457
|
}>>;
|
|
2431
2458
|
stats: z$1.ZodOptional<z$1.ZodArray<z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
@@ -2440,9 +2467,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2440
2467
|
kind: z$1.ZodLiteral<"duration">;
|
|
2441
2468
|
aggregate: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2442
2469
|
avg: "avg";
|
|
2443
|
-
sum: "sum";
|
|
2444
2470
|
min: "min";
|
|
2445
2471
|
max: "max";
|
|
2472
|
+
sum: "sum";
|
|
2446
2473
|
best: "best";
|
|
2447
2474
|
worst: "worst";
|
|
2448
2475
|
}>>;
|
|
@@ -2451,9 +2478,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2451
2478
|
kind: z$1.ZodLiteral<"cacheHits">;
|
|
2452
2479
|
aggregate: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2453
2480
|
avg: "avg";
|
|
2454
|
-
sum: "sum";
|
|
2455
2481
|
min: "min";
|
|
2456
2482
|
max: "max";
|
|
2483
|
+
sum: "sum";
|
|
2457
2484
|
best: "best";
|
|
2458
2485
|
worst: "worst";
|
|
2459
2486
|
}>>;
|
|
@@ -2464,9 +2491,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2464
2491
|
label: z$1.ZodOptional<z$1.ZodString>;
|
|
2465
2492
|
aggregate: z$1.ZodEnum<{
|
|
2466
2493
|
avg: "avg";
|
|
2467
|
-
sum: "sum";
|
|
2468
2494
|
min: "min";
|
|
2469
2495
|
max: "max";
|
|
2496
|
+
sum: "sum";
|
|
2470
2497
|
best: "best";
|
|
2471
2498
|
worst: "worst";
|
|
2472
2499
|
}>;
|
|
@@ -2491,9 +2518,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2491
2518
|
}, z$1.core.$strip>], "kind">>>;
|
|
2492
2519
|
defaultStatAggregate: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2493
2520
|
avg: "avg";
|
|
2494
|
-
sum: "sum";
|
|
2495
2521
|
min: "min";
|
|
2496
2522
|
max: "max";
|
|
2523
|
+
sum: "sum";
|
|
2497
2524
|
best: "best";
|
|
2498
2525
|
worst: "worst";
|
|
2499
2526
|
}>>;
|
|
@@ -2530,9 +2557,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2530
2557
|
key: z$1.ZodString;
|
|
2531
2558
|
aggregate: z$1.ZodEnum<{
|
|
2532
2559
|
avg: "avg";
|
|
2533
|
-
sum: "sum";
|
|
2534
2560
|
min: "min";
|
|
2535
2561
|
max: "max";
|
|
2562
|
+
sum: "sum";
|
|
2536
2563
|
latest: "latest";
|
|
2537
2564
|
passThresholdRate: "passThresholdRate";
|
|
2538
2565
|
}>;
|
|
@@ -2572,9 +2599,9 @@ declare const evalSummarySchema$1: z$1.ZodObject<{
|
|
|
2572
2599
|
key: z$1.ZodString;
|
|
2573
2600
|
aggregate: z$1.ZodEnum<{
|
|
2574
2601
|
avg: "avg";
|
|
2575
|
-
sum: "sum";
|
|
2576
2602
|
min: "min";
|
|
2577
2603
|
max: "max";
|
|
2604
|
+
sum: "sum";
|
|
2578
2605
|
latest: "latest";
|
|
2579
2606
|
passThresholdRate: "passThresholdRate";
|
|
2580
2607
|
}>;
|
|
@@ -2671,11 +2698,11 @@ declare const caseRowSchema$1: z$1.ZodObject<{
|
|
|
2671
2698
|
tags: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2672
2699
|
status: z$1.ZodEnum<{
|
|
2673
2700
|
error: "error";
|
|
2674
|
-
pending: "pending";
|
|
2675
|
-
running: "running";
|
|
2676
|
-
cancelled: "cancelled";
|
|
2677
2701
|
pass: "pass";
|
|
2678
2702
|
fail: "fail";
|
|
2703
|
+
running: "running";
|
|
2704
|
+
cancelled: "cancelled";
|
|
2705
|
+
pending: "pending";
|
|
2679
2706
|
}>;
|
|
2680
2707
|
durationMs: z$1.ZodNullable<z$1.ZodNumber>;
|
|
2681
2708
|
cacheHits: z$1.ZodOptional<z$1.ZodNumber>;
|
|
@@ -2756,6 +2783,7 @@ type RunLogLevel = z$1.infer<typeof runLogLevelSchema>;
|
|
|
2756
2783
|
declare const runLogPhaseSchema: z$1.ZodEnum<{
|
|
2757
2784
|
eval: "eval";
|
|
2758
2785
|
derive: "derive";
|
|
2786
|
+
tracingAssertions: "tracingAssertions";
|
|
2759
2787
|
outputsSchema: "outputsSchema";
|
|
2760
2788
|
scorer: "scorer";
|
|
2761
2789
|
}>;
|
|
@@ -2782,6 +2810,7 @@ declare const runLogEntrySchema: z$1.ZodObject<{
|
|
|
2782
2810
|
phase: z$1.ZodEnum<{
|
|
2783
2811
|
eval: "eval";
|
|
2784
2812
|
derive: "derive";
|
|
2813
|
+
tracingAssertions: "tracingAssertions";
|
|
2785
2814
|
outputsSchema: "outputsSchema";
|
|
2786
2815
|
scorer: "scorer";
|
|
2787
2816
|
}>;
|
|
@@ -2862,8 +2891,8 @@ declare const scoreTraceSchema: z$1.ZodObject<{
|
|
|
2862
2891
|
subtree: "subtree";
|
|
2863
2892
|
}>>;
|
|
2864
2893
|
mode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2865
|
-
all: "all";
|
|
2866
2894
|
sum: "sum";
|
|
2895
|
+
all: "all";
|
|
2867
2896
|
last: "last";
|
|
2868
2897
|
}>>;
|
|
2869
2898
|
}, z$1.core.$strip>>>;
|
|
@@ -2874,10 +2903,10 @@ declare const scoreTraceSchema: z$1.ZodObject<{
|
|
|
2874
2903
|
namespace: z$1.ZodString;
|
|
2875
2904
|
key: z$1.ZodString;
|
|
2876
2905
|
status: z$1.ZodEnum<{
|
|
2877
|
-
bypass: "bypass";
|
|
2878
|
-
refresh: "refresh";
|
|
2879
2906
|
hit: "hit";
|
|
2880
2907
|
miss: "miss";
|
|
2908
|
+
refresh: "refresh";
|
|
2909
|
+
bypass: "bypass";
|
|
2881
2910
|
}>;
|
|
2882
2911
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
2883
2912
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -2896,11 +2925,11 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2896
2925
|
tags: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
2897
2926
|
status: z$1.ZodEnum<{
|
|
2898
2927
|
error: "error";
|
|
2899
|
-
pending: "pending";
|
|
2900
|
-
running: "running";
|
|
2901
|
-
cancelled: "cancelled";
|
|
2902
2928
|
pass: "pass";
|
|
2903
2929
|
fail: "fail";
|
|
2930
|
+
running: "running";
|
|
2931
|
+
cancelled: "cancelled";
|
|
2932
|
+
pending: "pending";
|
|
2904
2933
|
}>;
|
|
2905
2934
|
input: z$1.ZodUnknown;
|
|
2906
2935
|
trace: z$1.ZodArray<z$1.ZodObject<{
|
|
@@ -2965,8 +2994,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
2965
2994
|
subtree: "subtree";
|
|
2966
2995
|
}>>;
|
|
2967
2996
|
mode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
2968
|
-
all: "all";
|
|
2969
2997
|
sum: "sum";
|
|
2998
|
+
all: "all";
|
|
2970
2999
|
last: "last";
|
|
2971
3000
|
}>>;
|
|
2972
3001
|
}, z$1.core.$strip>>>;
|
|
@@ -3034,8 +3063,8 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
3034
3063
|
subtree: "subtree";
|
|
3035
3064
|
}>>;
|
|
3036
3065
|
mode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3037
|
-
all: "all";
|
|
3038
3066
|
sum: "sum";
|
|
3067
|
+
all: "all";
|
|
3039
3068
|
last: "last";
|
|
3040
3069
|
}>>;
|
|
3041
3070
|
}, z$1.core.$strip>>>;
|
|
@@ -3046,10 +3075,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
3046
3075
|
namespace: z$1.ZodString;
|
|
3047
3076
|
key: z$1.ZodString;
|
|
3048
3077
|
status: z$1.ZodEnum<{
|
|
3049
|
-
bypass: "bypass";
|
|
3050
|
-
refresh: "refresh";
|
|
3051
3078
|
hit: "hit";
|
|
3052
3079
|
miss: "miss";
|
|
3080
|
+
refresh: "refresh";
|
|
3081
|
+
bypass: "bypass";
|
|
3053
3082
|
}>;
|
|
3054
3083
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
3055
3084
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -3140,6 +3169,7 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
3140
3169
|
phase: z$1.ZodEnum<{
|
|
3141
3170
|
eval: "eval";
|
|
3142
3171
|
derive: "derive";
|
|
3172
|
+
tracingAssertions: "tracingAssertions";
|
|
3143
3173
|
outputsSchema: "outputsSchema";
|
|
3144
3174
|
scorer: "scorer";
|
|
3145
3175
|
}>;
|
|
@@ -3166,10 +3196,10 @@ declare const caseDetailSchema$1: z$1.ZodObject<{
|
|
|
3166
3196
|
namespace: z$1.ZodString;
|
|
3167
3197
|
key: z$1.ZodString;
|
|
3168
3198
|
status: z$1.ZodEnum<{
|
|
3169
|
-
bypass: "bypass";
|
|
3170
|
-
refresh: "refresh";
|
|
3171
3199
|
hit: "hit";
|
|
3172
3200
|
miss: "miss";
|
|
3201
|
+
refresh: "refresh";
|
|
3202
|
+
bypass: "bypass";
|
|
3173
3203
|
}>;
|
|
3174
3204
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
3175
3205
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -3223,9 +3253,9 @@ type EvalChartBuiltinMetric = z$1.infer<typeof evalChartBuiltinMetricSchema>;
|
|
|
3223
3253
|
/** Reducer applied to a numeric column across all cases of a single run. */
|
|
3224
3254
|
declare const evalChartAggregateSchema: z$1.ZodEnum<{
|
|
3225
3255
|
avg: "avg";
|
|
3226
|
-
sum: "sum";
|
|
3227
3256
|
min: "min";
|
|
3228
3257
|
max: "max";
|
|
3258
|
+
sum: "sum";
|
|
3229
3259
|
latest: "latest";
|
|
3230
3260
|
passThresholdRate: "passThresholdRate";
|
|
3231
3261
|
}>;
|
|
@@ -3281,9 +3311,9 @@ declare const evalChartMetricSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObject<{
|
|
|
3281
3311
|
key: z$1.ZodString;
|
|
3282
3312
|
aggregate: z$1.ZodEnum<{
|
|
3283
3313
|
avg: "avg";
|
|
3284
|
-
sum: "sum";
|
|
3285
3314
|
min: "min";
|
|
3286
3315
|
max: "max";
|
|
3316
|
+
sum: "sum";
|
|
3287
3317
|
latest: "latest";
|
|
3288
3318
|
passThresholdRate: "passThresholdRate";
|
|
3289
3319
|
}>;
|
|
@@ -3316,9 +3346,9 @@ declare const evalChartTooltipExtraSchema: z$1.ZodDiscriminatedUnion<[z$1.ZodObj
|
|
|
3316
3346
|
key: z$1.ZodString;
|
|
3317
3347
|
aggregate: z$1.ZodEnum<{
|
|
3318
3348
|
avg: "avg";
|
|
3319
|
-
sum: "sum";
|
|
3320
3349
|
min: "min";
|
|
3321
3350
|
max: "max";
|
|
3351
|
+
sum: "sum";
|
|
3322
3352
|
latest: "latest";
|
|
3323
3353
|
passThresholdRate: "passThresholdRate";
|
|
3324
3354
|
}>;
|
|
@@ -3364,9 +3394,9 @@ declare const evalChartConfigSchema: z$1.ZodObject<{
|
|
|
3364
3394
|
key: z$1.ZodString;
|
|
3365
3395
|
aggregate: z$1.ZodEnum<{
|
|
3366
3396
|
avg: "avg";
|
|
3367
|
-
sum: "sum";
|
|
3368
3397
|
min: "min";
|
|
3369
3398
|
max: "max";
|
|
3399
|
+
sum: "sum";
|
|
3370
3400
|
latest: "latest";
|
|
3371
3401
|
passThresholdRate: "passThresholdRate";
|
|
3372
3402
|
}>;
|
|
@@ -3406,9 +3436,9 @@ declare const evalChartConfigSchema: z$1.ZodObject<{
|
|
|
3406
3436
|
key: z$1.ZodString;
|
|
3407
3437
|
aggregate: z$1.ZodEnum<{
|
|
3408
3438
|
avg: "avg";
|
|
3409
|
-
sum: "sum";
|
|
3410
3439
|
min: "min";
|
|
3411
3440
|
max: "max";
|
|
3441
|
+
sum: "sum";
|
|
3412
3442
|
latest: "latest";
|
|
3413
3443
|
passThresholdRate: "passThresholdRate";
|
|
3414
3444
|
}>;
|
|
@@ -3454,9 +3484,9 @@ declare const evalChartsConfigSchema: z$1.ZodArray<z$1.ZodObject<{
|
|
|
3454
3484
|
key: z$1.ZodString;
|
|
3455
3485
|
aggregate: z$1.ZodEnum<{
|
|
3456
3486
|
avg: "avg";
|
|
3457
|
-
sum: "sum";
|
|
3458
3487
|
min: "min";
|
|
3459
3488
|
max: "max";
|
|
3489
|
+
sum: "sum";
|
|
3460
3490
|
latest: "latest";
|
|
3461
3491
|
passThresholdRate: "passThresholdRate";
|
|
3462
3492
|
}>;
|
|
@@ -3496,9 +3526,9 @@ declare const evalChartsConfigSchema: z$1.ZodArray<z$1.ZodObject<{
|
|
|
3496
3526
|
key: z$1.ZodString;
|
|
3497
3527
|
aggregate: z$1.ZodEnum<{
|
|
3498
3528
|
avg: "avg";
|
|
3499
|
-
sum: "sum";
|
|
3500
3529
|
min: "min";
|
|
3501
3530
|
max: "max";
|
|
3531
|
+
sum: "sum";
|
|
3502
3532
|
latest: "latest";
|
|
3503
3533
|
passThresholdRate: "passThresholdRate";
|
|
3504
3534
|
}>;
|
|
@@ -3514,10 +3544,10 @@ declare const runManifestSchema$1: z$1.ZodObject<{
|
|
|
3514
3544
|
shortId: z$1.ZodString;
|
|
3515
3545
|
status: z$1.ZodEnum<{
|
|
3516
3546
|
error: "error";
|
|
3517
|
-
pending: "pending";
|
|
3518
3547
|
running: "running";
|
|
3519
|
-
completed: "completed";
|
|
3520
3548
|
cancelled: "cancelled";
|
|
3549
|
+
pending: "pending";
|
|
3550
|
+
completed: "completed";
|
|
3521
3551
|
}>;
|
|
3522
3552
|
temporary: z$1.ZodDefault<z$1.ZodOptional<z$1.ZodBoolean>>;
|
|
3523
3553
|
startedAt: z$1.ZodString;
|
|
@@ -3526,9 +3556,9 @@ declare const runManifestSchema$1: z$1.ZodObject<{
|
|
|
3526
3556
|
evalSourceFingerprints: z$1.ZodDefault<z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodString>>>;
|
|
3527
3557
|
target: z$1.ZodObject<{
|
|
3528
3558
|
mode: z$1.ZodEnum<{
|
|
3559
|
+
caseIds: "caseIds";
|
|
3529
3560
|
all: "all";
|
|
3530
3561
|
evalIds: "evalIds";
|
|
3531
|
-
caseIds: "caseIds";
|
|
3532
3562
|
}>;
|
|
3533
3563
|
evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
3534
3564
|
files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
@@ -3542,9 +3572,9 @@ declare const runManifestSchema$1: z$1.ZodObject<{
|
|
|
3542
3572
|
median: "median";
|
|
3543
3573
|
}>>>;
|
|
3544
3574
|
cacheMode: z$1.ZodOptional<z$1.ZodEnum<{
|
|
3545
|
-
use: "use";
|
|
3546
|
-
bypass: "bypass";
|
|
3547
3575
|
refresh: "refresh";
|
|
3576
|
+
bypass: "bypass";
|
|
3577
|
+
use: "use";
|
|
3548
3578
|
}>>;
|
|
3549
3579
|
}, z$1.core.$strip>;
|
|
3550
3580
|
/** Persisted lifecycle metadata for a single eval run. */
|
|
@@ -3554,10 +3584,10 @@ declare const runSummarySchema$1: z$1.ZodObject<{
|
|
|
3554
3584
|
runId: z$1.ZodString;
|
|
3555
3585
|
status: z$1.ZodEnum<{
|
|
3556
3586
|
error: "error";
|
|
3557
|
-
pending: "pending";
|
|
3558
3587
|
running: "running";
|
|
3559
|
-
completed: "completed";
|
|
3560
3588
|
cancelled: "cancelled";
|
|
3589
|
+
pending: "pending";
|
|
3590
|
+
completed: "completed";
|
|
3561
3591
|
}>;
|
|
3562
3592
|
totalCases: z$1.ZodNumber;
|
|
3563
3593
|
passedCases: z$1.ZodNumber;
|
|
@@ -3613,7 +3643,7 @@ type ScopedCaseSummary = {
|
|
|
3613
3643
|
//#endregion
|
|
3614
3644
|
//#region src/evalStatus.d.ts
|
|
3615
3645
|
/** Display status used for eval, file, and folder UI surfaces. */
|
|
3616
|
-
type EvalDisplayStatus = DerivedStatus | 'stale' | 'outdated' | 'unscored';
|
|
3646
|
+
type EvalDisplayStatus = DerivedStatus | 'enqueued' | 'stale' | 'outdated' | 'unscored';
|
|
3617
3647
|
/**
|
|
3618
3648
|
* Derive the user-facing eval status from the raw latest run result plus
|
|
3619
3649
|
* freshness state.
|
|
@@ -3661,10 +3691,17 @@ type EvalCase$1<TInput = unknown> = {
|
|
|
3661
3691
|
};
|
|
3662
3692
|
/** Query helpers built from the flattened trace recorded for one eval case. */
|
|
3663
3693
|
type EvalTraceTree$1 = {
|
|
3664
|
-
spans: EvalTraceSpan$1[];
|
|
3665
|
-
rootSpans: EvalTraceSpan$1[];
|
|
3666
|
-
findSpan: (name: string) => EvalTraceSpan$1 | undefined;
|
|
3667
|
-
|
|
3694
|
+
/** Flat span list in creation order. */spans: EvalTraceSpan$1[]; /** Top-level spans whose `parentId` is `null`. */
|
|
3695
|
+
rootSpans: EvalTraceSpan$1[]; /** Return the first span whose name exactly matches `name`. */
|
|
3696
|
+
findSpan: (name: string) => EvalTraceSpan$1 | undefined; /** Return every span whose name exactly matches `name`. */
|
|
3697
|
+
findSpans: (name: string) => EvalTraceSpan$1[]; /** Return whether any span name exactly matches `name`. */
|
|
3698
|
+
hasSpan: (name: string) => boolean; /** Return every span whose kind exactly matches `kind`. */
|
|
3699
|
+
findSpansByKind: (kind: string) => EvalTraceSpan$1[]; /** Return every span with `kind: 'tool'`. */
|
|
3700
|
+
findToolCallSpans: () => EvalTraceSpan$1[]; /** Return the names of every span with `kind: 'tool'`. */
|
|
3701
|
+
listToolCallSpanNames: () => string[]; /** Return whether a `kind: 'tool'` span has a name exactly matching `name`. */
|
|
3702
|
+
hasToolCallSpan: (name: string) => boolean; /** Return span names in creation order, optionally filtered by kind. */
|
|
3703
|
+
listSpanNames: (kind?: string) => string[]; /** Return span names in depth-first tree order, optionally filtered by kind. */
|
|
3704
|
+
listSpanNamesDfs: (kind?: string) => string[]; /** Return all spans in depth-first tree order. */
|
|
3668
3705
|
flattenDfs: () => EvalTraceSpan$1[];
|
|
3669
3706
|
checkpoints: Map<string, unknown>;
|
|
3670
3707
|
};
|
|
@@ -3684,6 +3721,13 @@ type EvalDeriveFn$1<TInput = unknown> = (ctx: EvalDeriveContext$1<TInput>) => Re
|
|
|
3684
3721
|
/** Trace-derived output config accepted globally and on eval definitions. */
|
|
3685
3722
|
type EvalDeriveConfig$1<TInput = unknown> = EvalDeriveMap$1<TInput> | EvalDeriveFn$1<TInput>;
|
|
3686
3723
|
/** Schema for keyed or object-returning trace-derived output config. */
|
|
3724
|
+
/** Function that records trace-derived assertions for one case. */
|
|
3725
|
+
type EvalTracingAssertionsFn$1<TInput = unknown> = (ctx: EvalDeriveContext$1<TInput>) => MaybePromise<void>;
|
|
3726
|
+
/** Keyed trace-derived assertion config for grouping related checks. */
|
|
3727
|
+
type EvalTracingAssertionsMap$1<TInput = unknown> = Record<string, EvalTracingAssertionsFn$1<TInput>>;
|
|
3728
|
+
/** Trace-derived assertion config accepted globally and on eval definitions. */
|
|
3729
|
+
type EvalTracingAssertionsConfig$1<TInput = unknown> = EvalTracingAssertionsMap$1<TInput> | EvalTracingAssertionsFn$1<TInput>;
|
|
3730
|
+
/** Schema for function or keyed trace-derived assertion config. */
|
|
3687
3731
|
/** UI overrides for a derived or scored column emitted by an eval. */
|
|
3688
3732
|
type EvalColumnOverride$1 = {
|
|
3689
3733
|
/** Display label shown for the column in tables and detail views. */label?: string;
|
|
@@ -4136,9 +4180,19 @@ type AgentEvalsConfig$1 = {
|
|
|
4136
4180
|
* Prefer the keyed map form for shared metrics:
|
|
4137
4181
|
* `{ toolCalls: ({ trace }) => trace.findSpansByKind('tool').length }`.
|
|
4138
4182
|
* The object-returning function form is also supported. Derived outputs
|
|
4139
|
-
* only fill keys that were not already recorded by eval execution.
|
|
4183
|
+
* only fill keys that were not already recorded by eval execution. Do not
|
|
4184
|
+
* call assertion helpers here; use `tracingAssertions` for trace-derived
|
|
4185
|
+
* pass/fail checks.
|
|
4140
4186
|
*/
|
|
4141
4187
|
deriveFromTracing?: EvalDeriveConfig$1;
|
|
4188
|
+
/**
|
|
4189
|
+
* Workspace-wide assertions derived from the finished execution trace.
|
|
4190
|
+
*
|
|
4191
|
+
* These run after `deriveFromTracing` and before output schema validation and
|
|
4192
|
+
* scores. Use `evalAssert(...)` or `evalExpect(...)` inside the callback to
|
|
4193
|
+
* record normal assertion results without creating fake score columns.
|
|
4194
|
+
*/
|
|
4195
|
+
tracingAssertions?: EvalTracingAssertionsConfig$1;
|
|
4142
4196
|
/**
|
|
4143
4197
|
* Workspace-wide stats prepended to every eval's stats row.
|
|
4144
4198
|
*
|
|
@@ -4469,9 +4523,9 @@ declare function extractApiCalls(spans: EvalTraceSpan$1[], config: ResolvedApiCa
|
|
|
4469
4523
|
* - `refresh`: never read, always write (forces re-execution and overwrites).
|
|
4470
4524
|
*/
|
|
4471
4525
|
declare const cacheModeSchema: z$1.ZodEnum<{
|
|
4472
|
-
use: "use";
|
|
4473
|
-
bypass: "bypass";
|
|
4474
4526
|
refresh: "refresh";
|
|
4527
|
+
bypass: "bypass";
|
|
4528
|
+
use: "use";
|
|
4475
4529
|
}>;
|
|
4476
4530
|
/** Mode controlling how cached spans behave during a run. */
|
|
4477
4531
|
type CacheMode = z$1.infer<typeof cacheModeSchema>;
|
|
@@ -4492,10 +4546,10 @@ declare const cacheOperationTypeSchema: z$1.ZodEnum<{
|
|
|
4492
4546
|
type CacheOperationType = z$1.infer<typeof cacheOperationTypeSchema>;
|
|
4493
4547
|
/** Status of a cache lookup recorded on a span or case scope. */
|
|
4494
4548
|
declare const cacheStatusSchema: z$1.ZodEnum<{
|
|
4495
|
-
bypass: "bypass";
|
|
4496
|
-
refresh: "refresh";
|
|
4497
4549
|
hit: "hit";
|
|
4498
4550
|
miss: "miss";
|
|
4551
|
+
refresh: "refresh";
|
|
4552
|
+
bypass: "bypass";
|
|
4499
4553
|
}>;
|
|
4500
4554
|
/** Status of a cache lookup recorded on a span or case scope. */
|
|
4501
4555
|
type CacheStatus = z$1.infer<typeof cacheStatusSchema>;
|
|
@@ -4512,10 +4566,10 @@ declare const traceCacheRefSchema: z$1.ZodObject<{
|
|
|
4512
4566
|
namespace: z$1.ZodString;
|
|
4513
4567
|
key: z$1.ZodString;
|
|
4514
4568
|
status: z$1.ZodEnum<{
|
|
4515
|
-
bypass: "bypass";
|
|
4516
|
-
refresh: "refresh";
|
|
4517
4569
|
hit: "hit";
|
|
4518
4570
|
miss: "miss";
|
|
4571
|
+
refresh: "refresh";
|
|
4572
|
+
bypass: "bypass";
|
|
4519
4573
|
}>;
|
|
4520
4574
|
read: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
4521
4575
|
stored: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
@@ -5467,9 +5521,9 @@ type ConfigReloadState = z$1.infer<typeof configReloadStateSchema$1>;
|
|
|
5467
5521
|
declare const createRunRequestSchema$1: z$1.ZodObject<{
|
|
5468
5522
|
target: z$1.ZodObject<{
|
|
5469
5523
|
mode: z$1.ZodEnum<{
|
|
5524
|
+
caseIds: "caseIds";
|
|
5470
5525
|
all: "all";
|
|
5471
5526
|
evalIds: "evalIds";
|
|
5472
|
-
caseIds: "caseIds";
|
|
5473
5527
|
}>;
|
|
5474
5528
|
evalKeys: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
5475
5529
|
files: z$1.ZodOptional<z$1.ZodArray<z$1.ZodString>>;
|
|
@@ -5481,9 +5535,9 @@ declare const createRunRequestSchema$1: z$1.ZodObject<{
|
|
|
5481
5535
|
temporary: z$1.ZodOptional<z$1.ZodBoolean>;
|
|
5482
5536
|
cache: z$1.ZodOptional<z$1.ZodObject<{
|
|
5483
5537
|
mode: z$1.ZodDefault<z$1.ZodEnum<{
|
|
5484
|
-
use: "use";
|
|
5485
|
-
bypass: "bypass";
|
|
5486
5538
|
refresh: "refresh";
|
|
5539
|
+
bypass: "bypass";
|
|
5540
|
+
use: "use";
|
|
5487
5541
|
}>>;
|
|
5488
5542
|
}, z$1.core.$strip>>;
|
|
5489
5543
|
manualInputs: z$1.ZodOptional<z$1.ZodRecord<z$1.ZodString, z$1.ZodUnknown>>;
|
|
@@ -6369,6 +6423,7 @@ declare const caseDetailSchema: z$1.ZodObject<{
|
|
|
6369
6423
|
phase: z$1.ZodEnum<{
|
|
6370
6424
|
eval: "eval";
|
|
6371
6425
|
derive: "derive";
|
|
6426
|
+
tracingAssertions: "tracingAssertions";
|
|
6372
6427
|
outputsSchema: "outputsSchema";
|
|
6373
6428
|
scorer: "scorer";
|
|
6374
6429
|
}>;
|
|
@@ -6995,7 +7050,8 @@ type EvalRunner = {
|
|
|
6995
7050
|
getEvals(): EvalSummary$1[]; /** Look up one discovered eval by id. */
|
|
6996
7051
|
getEval(id: string): EvalSummary$1 | undefined; /** Return discovery errors that should be shown before running evals. */
|
|
6997
7052
|
getDiscoveryIssues(): DiscoveryIssue$1[]; /** Return current config-reload state for the long-running app server. */
|
|
6998
|
-
getConfigReloadState(): ConfigReloadState$1; /**
|
|
7053
|
+
getConfigReloadState(): ConfigReloadState$1; /** Return the effective per-run case concurrency after applying defaults. */
|
|
7054
|
+
getConfiguredConcurrency(): number; /** Re-scan configured eval files and emit a discovery update to listeners. */
|
|
6999
7055
|
refreshDiscovery(): Promise<void>;
|
|
7000
7056
|
startRun(request: CreateRunRequest$1): Promise<{
|
|
7001
7057
|
manifest: RunManifest$1;
|
|
@@ -7230,4 +7286,4 @@ declare function defineEval<TInput = unknown, TOutputs extends EvalOutputs = Eva
|
|
|
7230
7286
|
/** Return whether the active eval case has tags matching the typed input. */
|
|
7231
7287
|
declare function matchesEvalTags(input: EvalTagMatchInput): boolean;
|
|
7232
7288
|
//#endregion
|
|
7233
|
-
export { AgentEvalTagRegistry, AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheRepairSummary, type CacheScopeContext, type CacheSerializationOptions, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CallDerivedAttributesConfig, type CallDerivedAttributesFn, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type ConfigReloadState, type ConfigReloadStatus, type CreateRunRequest, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCacheConfig, EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, EvalDefinition, type EvalDeriveConfig, type EvalDeriveContext, type EvalDeriveFn, type EvalDeriveMap, type EvalDeriveValueFn, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualInputConfig, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, EvalTag, EvalTagMatchInput, type EvalTraceTree, type JsonCell, type LlmCallCostBreakdown, type LlmCallCostCurrency, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallPricingRate, type LlmCallPricingRegistry, type LlmCallSimulatedTokens, type LlmCallsConfigInput, type LlmCostScenario, type ManualInputDescriptor, type ManualInputFieldDescriptor, type ManualInputFieldKind, type ManualInputFieldOverride, type ManualInputFieldsConfig, type ManualInputFileValue, type ManualInputSelectOption, type MaterializeManualInputFilesResult, type NumberDisplayOptions, type ReadManualInputFileResult, type RemoveDefaultConfig, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallCostCurrency, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|
|
7289
|
+
export { AgentEvalTagRegistry, AgentEvalsConfig, type ApiCallEntry, type ApiCallMetric, type ApiCallMetricFormat, type ApiCallMetricPlacement, type ApiCallMetricValue, type ApiCallsConfigInput, type AssertionFailure, type CacheActivityEntry, type CacheAdapter, type CacheDebugKeyEntry, type CacheDebugKeyFile, type CacheDebugKeyWrite, type CacheEntry, type CacheEntryWithDebugKey, type CacheFile, type CacheHitEntry, type CacheKeyHashInput, type CacheKeyHashOptions, type CacheListItem, type CacheMode, type CacheOperationType, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheRepairSummary, type CacheScopeContext, type CacheSerializationOptions, type CacheStatus, type CallDerivedAttribute, type CallDerivedAttributeContext, type CallDerivedAttributesConfig, type CallDerivedAttributesFn, type CaptureEvalSpanErrorLevel, type CaptureEvalSpanErrorOptions, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type ConfigReloadState, type ConfigReloadStatus, type CreateRunRequest, type DefaultConfigKey, type DerivedStatus, type DiscoveryIssue, EvalAssertionError, type EvalCacheConfig, EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, EvalDefinition, type EvalDeriveConfig, type EvalDeriveContext, type EvalDeriveFn, type EvalDeriveMap, type EvalDeriveValueFn, type EvalDisplayStatus, type EvalExecuteContext, type EvalExpectation, type EvalFreshnessStatus, type EvalManualInputConfig, type EvalManualScoreDef, type EvalOutputs, type EvalOutputsSchema, type EvalRunner, type EvalRuntimeScope, EvalRuntimeUsageError, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalSetOutput, type EvalStartTime, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, EvalTag, EvalTagMatchInput, type EvalTraceTree, type EvalTracingAssertionsConfig, type EvalTracingAssertionsFn, type EvalTracingAssertionsMap, type JsonCell, type LlmCallCostBreakdown, type LlmCallCostCurrency, type LlmCallEntry, type LlmCallMetric, type LlmCallMetricFormat, type LlmCallMetricPlacement, type LlmCallMetricValue, type LlmCallPricing, type LlmCallPricingRate, type LlmCallPricingRegistry, type LlmCallSimulatedTokens, type LlmCallsConfigInput, type LlmCostScenario, type ManualInputDescriptor, type ManualInputFieldDescriptor, type ManualInputFieldKind, type ManualInputFieldOverride, type ManualInputFieldsConfig, type ManualInputFileValue, type ManualInputSelectOption, type MaterializeManualInputFilesResult, type NumberDisplayOptions, type ReadManualInputFileResult, type RemoveDefaultConfig, type ResolvedApiCallMetric, type ResolvedApiCallsConfig, type ResolvedCallDerivedAttribute, type ResolvedLlmCallCostCurrency, type ResolvedLlmCallMetric, type ResolvedLlmCallPricing, type ResolvedLlmCallsConfig, type RunInEvalScopeOptions, type RunLogEntry, type RunLogLevel, type RunLogLocation, type RunLogPhase, type RunLogsConfigInput, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SerializedCacheValue, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceCacheInfo, type TraceCacheRef, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TrialSelectionMode, type UpdateManualScoreRequest, appendToEvalOutput, buildTraceTree, captureEvalSpanError, cleanupStagedManualInputFiles, createRunner, defineEval, deserializeCacheRecording, deserializeCacheValue, evalAssert, evalExpect, evalLog, evalSpan, evalTime, evalTracer, extractApiCalls, extractCacheEntries, extractCacheHits, extractLlmCalls, getCurrentScope, getEvalCaseInput, getEvalRegistry, getNestedAttribute, hashCacheKey, hashCacheKeySync, incrementEvalOutput, isInEvalScope, isManualInputFileValue, manualInputFileValueSchema, matchesEvalTags, materializeManualInputFiles, mergeEvalOutput, nextEvalId, readManualInputFile, repoFile, runCli, runInEvalRuntimeScope, runInEvalScope, runInExistingEvalScope, serializeCacheRecording, serializeCacheValue, setEvalOutput, setScopeCacheContext, simulateLlmCallCost, simulateTokenAllocation, stageManualInputFile, stageManualInputFileFromPath, startEvalBackgroundJob, z };
|