@ls-stack/agent-eval 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/app-CKa9TjXw.mjs +244 -0
- package/dist/apps/web/dist/assets/index-BUz24J7O.css +1 -0
- package/dist/apps/web/dist/assets/index-Dm50Ynbs.js +109 -0
- package/dist/apps/web/dist/favicon.svg +20 -0
- package/dist/apps/web/dist/index.html +34 -0
- package/dist/bin.d.mts +1 -0
- package/dist/bin.mjs +41 -0
- package/dist/cli-CwEFLP0w.mjs +3422 -0
- package/dist/index.d.mts +2043 -0
- package/dist/index.mjs +3 -0
- package/dist/runner-CD5aDJ0C.mjs +15 -0
- package/dist/runner-Ck4X0H3p.mjs +2 -0
- package/dist/src-BDRmaWFu.mjs +2 -0
- package/package.json +71 -0
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,2043 @@
|
|
|
1
|
+
import { z } from "zod/v4";
|
|
2
|
+
|
|
3
|
+
//#region ../shared/src/schemas/display.d.ts
|
|
4
|
+
/** Schema for primitive table cell values: a string, number, boolean, or null. */
declare const scalarCellSchema: z.ZodUnion<readonly [z.ZodString, z.ZodNumber, z.ZodBoolean, z.ZodNull]>;
|
|
5
|
+
/** Primitive table cell value supported by the eval UI. */
|
|
6
|
+
type ScalarCell = z.infer<typeof scalarCellSchema>;
|
|
7
|
+
/** Schema for JSON-safe cell values: a scalar, null, a string-keyed record, or an array. */
declare const jsonCellSchema: z.ZodType<string | number | boolean | null | Record<string, unknown> | unknown[]>;
|
|
8
|
+
/** JSON-safe value supported by `format: 'json'` columns. */
|
|
9
|
+
type JsonCell = z.infer<typeof jsonCellSchema>;
|
|
10
|
+
/** Schema for a `source: "repo"` file reference: a required `path` and an optional `mimeType`. */
declare const repoFileRefSchema: z.ZodObject<{
|
|
11
|
+
source: z.ZodLiteral<"repo">;
|
|
12
|
+
path: z.ZodString;
|
|
13
|
+
mimeType: z.ZodOptional<z.ZodString>;
|
|
14
|
+
}, z.core.$strip>;
|
|
15
|
+
/** Reference to a file that lives in the authored workspace. */
|
|
16
|
+
type RepoFileRef = z.infer<typeof repoFileRefSchema>;
|
|
17
|
+
/** Schema for a `source: "run"` artifact reference: required `artifactId` and `mimeType`, optional `fileName`. */
declare const runArtifactRefSchema: z.ZodObject<{
|
|
18
|
+
source: z.ZodLiteral<"run">;
|
|
19
|
+
artifactId: z.ZodString;
|
|
20
|
+
mimeType: z.ZodString;
|
|
21
|
+
fileName: z.ZodOptional<z.ZodString>;
|
|
22
|
+
}, z.core.$strip>;
|
|
23
|
+
/** Reference to a generated artifact stored under a specific run. */
|
|
24
|
+
type RunArtifactRef = z.infer<typeof runArtifactRefSchema>;
|
|
25
|
+
/** Schema for a file reference: a union of the `source: "repo"` and `source: "run"` object shapes. */
declare const fileRefSchema: z.ZodUnion<readonly [z.ZodObject<{
|
|
26
|
+
source: z.ZodLiteral<"repo">;
|
|
27
|
+
path: z.ZodString;
|
|
28
|
+
mimeType: z.ZodOptional<z.ZodString>;
|
|
29
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
30
|
+
source: z.ZodLiteral<"run">;
|
|
31
|
+
artifactId: z.ZodString;
|
|
32
|
+
mimeType: z.ZodString;
|
|
33
|
+
fileName: z.ZodOptional<z.ZodString>;
|
|
34
|
+
}, z.core.$strip>]>;
|
|
35
|
+
/** File reference supported by media and file columns. */
|
|
36
|
+
type FileRef = z.infer<typeof fileRefSchema>;
|
|
37
|
+
/** Numeric presentation options for values rendered with `format: 'number'`. */
|
|
38
|
+
type NumberDisplayOptions = {
|
|
39
|
+
/** Number notation used when rendering the value. */
notation?: 'standard' | 'compact';
|
|
40
|
+
/** Compact style used when `notation: 'compact'` is enabled. */
compactDisplay?: 'short' | 'long';
|
|
41
|
+
/** String prepended to the rendered number, such as `$`. */
prefix?: string;
|
|
42
|
+
/** String appended to the rendered number, such as ` ms`. */
suffix?: string;
|
|
43
|
+
/** Fixed number of decimal places to render. */
decimalPlaces?: number;
|
|
44
|
+
};
|
|
45
|
+
/** Schema for numeric presentation options used by number-formatted values. */
|
|
46
|
+
declare const numberDisplayOptionsSchema: z.ZodType<NumberDisplayOptions>;
|
|
47
|
+
/** Schema for the supported column rendering kinds in list views. */
|
|
48
|
+
declare const columnKindSchema: z.ZodEnum<{
|
|
49
|
+
string: "string";
|
|
50
|
+
number: "number";
|
|
51
|
+
boolean: "boolean";
|
|
52
|
+
}>;
|
|
53
|
+
/** Display kind used by a column definition in the UI. */
|
|
54
|
+
type ColumnKind = z.infer<typeof columnKindSchema>;
|
|
55
|
+
/** Schema for the built-in column formatting presets. */
|
|
56
|
+
declare const columnFormatSchema: z.ZodEnum<{
|
|
57
|
+
number: "number";
|
|
58
|
+
boolean: "boolean";
|
|
59
|
+
file: "file";
|
|
60
|
+
markdown: "markdown";
|
|
61
|
+
json: "json";
|
|
62
|
+
image: "image";
|
|
63
|
+
audio: "audio";
|
|
64
|
+
video: "video";
|
|
65
|
+
duration: "duration";
|
|
66
|
+
percent: "percent";
|
|
67
|
+
passFail: "passFail";
|
|
68
|
+
stars: "stars";
|
|
69
|
+
}>;
|
|
70
|
+
/** Formatting preset applied to a column value in the UI. */
|
|
71
|
+
type ColumnFormat = z.infer<typeof columnFormatSchema>;
|
|
72
|
+
/** Schema describing a rendered column in the eval results table. */
|
|
73
|
+
declare const columnDefSchema: z.ZodObject<{
|
|
74
|
+
key: z.ZodString;
|
|
75
|
+
label: z.ZodString;
|
|
76
|
+
kind: z.ZodEnum<{
|
|
77
|
+
string: "string";
|
|
78
|
+
number: "number";
|
|
79
|
+
boolean: "boolean";
|
|
80
|
+
}>;
|
|
81
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
82
|
+
number: "number";
|
|
83
|
+
boolean: "boolean";
|
|
84
|
+
file: "file";
|
|
85
|
+
markdown: "markdown";
|
|
86
|
+
json: "json";
|
|
87
|
+
image: "image";
|
|
88
|
+
audio: "audio";
|
|
89
|
+
video: "video";
|
|
90
|
+
duration: "duration";
|
|
91
|
+
percent: "percent";
|
|
92
|
+
passFail: "passFail";
|
|
93
|
+
stars: "stars";
|
|
94
|
+
}>>;
|
|
95
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
96
|
+
isScore: z.ZodOptional<z.ZodBoolean>;
|
|
97
|
+
isManualScore: z.ZodOptional<z.ZodBoolean>;
|
|
98
|
+
passThreshold: z.ZodOptional<z.ZodNumber>;
|
|
99
|
+
maxStars: z.ZodOptional<z.ZodNumber>;
|
|
100
|
+
hideInTable: z.ZodOptional<z.ZodBoolean>;
|
|
101
|
+
sortable: z.ZodOptional<z.ZodBoolean>;
|
|
102
|
+
align: z.ZodOptional<z.ZodEnum<{
|
|
103
|
+
left: "left";
|
|
104
|
+
center: "center";
|
|
105
|
+
right: "right";
|
|
106
|
+
}>>;
|
|
107
|
+
}, z.core.$strip>;
|
|
108
|
+
/** Column definition exposed to the UI for eval and case tables. */
|
|
109
|
+
type ColumnDef = z.infer<typeof columnDefSchema>;
|
|
110
|
+
/** Schema for any supported value that can populate a table cell. */
|
|
111
|
+
declare const cellValueSchema: z.ZodUnion<readonly [z.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z.ZodUnion<readonly [z.ZodObject<{
|
|
112
|
+
source: z.ZodLiteral<"repo">;
|
|
113
|
+
path: z.ZodString;
|
|
114
|
+
mimeType: z.ZodOptional<z.ZodString>;
|
|
115
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
116
|
+
source: z.ZodLiteral<"run">;
|
|
117
|
+
artifactId: z.ZodString;
|
|
118
|
+
mimeType: z.ZodString;
|
|
119
|
+
fileName: z.ZodOptional<z.ZodString>;
|
|
120
|
+
}, z.core.$strip>]>]>;
|
|
121
|
+
/** Value stored in a rendered eval result table cell. */
|
|
122
|
+
type CellValue = z.infer<typeof cellValueSchema>;
|
|
123
|
+
//#endregion
|
|
124
|
+
//#region ../shared/src/schemas/trace.d.ts
|
|
125
|
+
/** Schema for the semantic categories used to classify trace spans. */
|
|
126
|
+
declare const traceSpanKindSchema: z.ZodEnum<{
|
|
127
|
+
eval: "eval";
|
|
128
|
+
agent: "agent";
|
|
129
|
+
llm: "llm";
|
|
130
|
+
tool: "tool";
|
|
131
|
+
retrieval: "retrieval";
|
|
132
|
+
scorer: "scorer";
|
|
133
|
+
checkpoint: "checkpoint";
|
|
134
|
+
custom: "custom";
|
|
135
|
+
}>;
|
|
136
|
+
/** Semantic category used to classify a trace span in the UI. */
|
|
137
|
+
type TraceSpanKind = z.infer<typeof traceSpanKindSchema>;
|
|
138
|
+
/** Schema for the supported presentation formats of trace attributes. */
|
|
139
|
+
declare const traceAttributeDisplayFormatSchema: z.ZodEnum<{
|
|
140
|
+
string: "string";
|
|
141
|
+
number: "number";
|
|
142
|
+
duration: "duration";
|
|
143
|
+
json: "json";
|
|
144
|
+
}>;
|
|
145
|
+
/**
|
|
146
|
+
* Formatting hint for trace attribute values rendered by the UI.
|
|
147
|
+
*
|
|
148
|
+
* This affects presentation only and does not change the stored value.
|
|
149
|
+
*/
|
|
150
|
+
type TraceAttributeDisplayFormat = z.infer<typeof traceAttributeDisplayFormatSchema>;
|
|
151
|
+
/** Schema for the UI locations where a trace attribute can appear. */
|
|
152
|
+
declare const traceAttributeDisplayPlacementSchema: z.ZodEnum<{
|
|
153
|
+
tree: "tree";
|
|
154
|
+
detail: "detail";
|
|
155
|
+
section: "section";
|
|
156
|
+
}>;
|
|
157
|
+
/** UI locations where a trace attribute may be rendered. */
|
|
158
|
+
type TraceAttributeDisplayPlacement = z.infer<typeof traceAttributeDisplayPlacementSchema>;
|
|
159
|
+
/** Schema for resolved trace display rules sent to the UI. */
|
|
160
|
+
declare const traceAttributeDisplaySchema: z.ZodObject<{
|
|
161
|
+
key: z.ZodOptional<z.ZodString>;
|
|
162
|
+
path: z.ZodString;
|
|
163
|
+
label: z.ZodOptional<z.ZodString>;
|
|
164
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
165
|
+
string: "string";
|
|
166
|
+
number: "number";
|
|
167
|
+
duration: "duration";
|
|
168
|
+
json: "json";
|
|
169
|
+
}>>;
|
|
170
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
171
|
+
placements: z.ZodOptional<z.ZodArray<z.ZodEnum<{
|
|
172
|
+
tree: "tree";
|
|
173
|
+
detail: "detail";
|
|
174
|
+
section: "section";
|
|
175
|
+
}>>>;
|
|
176
|
+
scope: z.ZodOptional<z.ZodEnum<{
|
|
177
|
+
self: "self";
|
|
178
|
+
subtree: "subtree";
|
|
179
|
+
}>>;
|
|
180
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
181
|
+
all: "all";
|
|
182
|
+
last: "last";
|
|
183
|
+
sum: "sum";
|
|
184
|
+
}>>;
|
|
185
|
+
}, z.core.$strip>;
|
|
186
|
+
/**
|
|
187
|
+
* Resolved trace display rule consumed by the UI.
|
|
188
|
+
*
|
|
189
|
+
* `path` points at the attribute to render on each span. `scope` and `mode`
|
|
190
|
+
* control whether the value comes from the current span only or from the full
|
|
191
|
+
* subtree, and how multiple matches are combined.
|
|
192
|
+
*/
|
|
193
|
+
type TraceAttributeDisplay = z.infer<typeof traceAttributeDisplaySchema>;
|
|
194
|
+
/** Schema for trace display config after transforms have been resolved. */
|
|
195
|
+
declare const traceDisplayConfigSchema: z.ZodObject<{
|
|
196
|
+
attributes: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
197
|
+
key: z.ZodOptional<z.ZodString>;
|
|
198
|
+
path: z.ZodString;
|
|
199
|
+
label: z.ZodOptional<z.ZodString>;
|
|
200
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
201
|
+
string: "string";
|
|
202
|
+
number: "number";
|
|
203
|
+
duration: "duration";
|
|
204
|
+
json: "json";
|
|
205
|
+
}>>;
|
|
206
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
207
|
+
placements: z.ZodOptional<z.ZodArray<z.ZodEnum<{
|
|
208
|
+
tree: "tree";
|
|
209
|
+
detail: "detail";
|
|
210
|
+
section: "section";
|
|
211
|
+
}>>>;
|
|
212
|
+
scope: z.ZodOptional<z.ZodEnum<{
|
|
213
|
+
self: "self";
|
|
214
|
+
subtree: "subtree";
|
|
215
|
+
}>>;
|
|
216
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
217
|
+
all: "all";
|
|
218
|
+
last: "last";
|
|
219
|
+
sum: "sum";
|
|
220
|
+
}>>;
|
|
221
|
+
}, z.core.$strip>>>;
|
|
222
|
+
}, z.core.$strip>;
|
|
223
|
+
/** UI-ready trace display configuration attached to case details. */
|
|
224
|
+
type TraceDisplayConfig = z.infer<typeof traceDisplayConfigSchema>;
|
|
225
|
+
/** Context passed to a `traceDisplay` transform while resolving a span value. */
|
|
226
|
+
type TraceAttributeTransformContext = {
|
|
227
|
+
/** Value handed to the transform; presumably the raw attribute read from the rule's `path` (confirm against the runner). */
value: unknown;
|
|
228
|
+
/** Trace span whose value is being resolved. */
span: EvalTraceSpan;
|
|
229
|
+
};
|
|
230
|
+
/**
|
|
231
|
+
* Runner-side transform used to derive a display value from a raw trace
|
|
232
|
+
* attribute.
|
|
233
|
+
*/
|
|
234
|
+
type TraceAttributeTransform = (ctx: TraceAttributeTransformContext) => unknown;
|
|
235
|
+
/** Schema for authored trace display rules accepted from user config. */
|
|
236
|
+
declare const traceAttributeDisplayInputSchema: z.ZodObject<{
|
|
237
|
+
key: z.ZodOptional<z.ZodString>;
|
|
238
|
+
path: z.ZodString;
|
|
239
|
+
label: z.ZodOptional<z.ZodString>;
|
|
240
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
241
|
+
string: "string";
|
|
242
|
+
number: "number";
|
|
243
|
+
duration: "duration";
|
|
244
|
+
json: "json";
|
|
245
|
+
}>>;
|
|
246
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
247
|
+
placements: z.ZodOptional<z.ZodArray<z.ZodEnum<{
|
|
248
|
+
tree: "tree";
|
|
249
|
+
detail: "detail";
|
|
250
|
+
section: "section";
|
|
251
|
+
}>>>;
|
|
252
|
+
scope: z.ZodOptional<z.ZodEnum<{
|
|
253
|
+
self: "self";
|
|
254
|
+
subtree: "subtree";
|
|
255
|
+
}>>;
|
|
256
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
257
|
+
all: "all";
|
|
258
|
+
last: "last";
|
|
259
|
+
sum: "sum";
|
|
260
|
+
}>>;
|
|
261
|
+
transform: z.ZodOptional<z.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
|
|
262
|
+
}, z.core.$strip>;
|
|
263
|
+
/**
|
|
264
|
+
* Authored trace display rule accepted in eval definitions and config files.
|
|
265
|
+
*
|
|
266
|
+
* `key` allows the same source `path` to be displayed multiple ways, such as
|
|
267
|
+
* raw and compact views of a single token count. `numberFormat` customizes
|
|
268
|
+
* `format: 'number'` values. `transform` runs in the
|
|
269
|
+
* runner before the UI receives the resolved trace payload.
|
|
270
|
+
*/
|
|
271
|
+
type TraceAttributeDisplayInput = z.infer<typeof traceAttributeDisplayInputSchema>;
|
|
272
|
+
/** Schema for authored trace display config in eval or workspace config. */
|
|
273
|
+
declare const traceDisplayInputConfigSchema: z.ZodObject<{
|
|
274
|
+
attributes: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
275
|
+
key: z.ZodOptional<z.ZodString>;
|
|
276
|
+
path: z.ZodString;
|
|
277
|
+
label: z.ZodOptional<z.ZodString>;
|
|
278
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
279
|
+
string: "string";
|
|
280
|
+
number: "number";
|
|
281
|
+
duration: "duration";
|
|
282
|
+
json: "json";
|
|
283
|
+
}>>;
|
|
284
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
285
|
+
placements: z.ZodOptional<z.ZodArray<z.ZodEnum<{
|
|
286
|
+
tree: "tree";
|
|
287
|
+
detail: "detail";
|
|
288
|
+
section: "section";
|
|
289
|
+
}>>>;
|
|
290
|
+
scope: z.ZodOptional<z.ZodEnum<{
|
|
291
|
+
self: "self";
|
|
292
|
+
subtree: "subtree";
|
|
293
|
+
}>>;
|
|
294
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
295
|
+
all: "all";
|
|
296
|
+
last: "last";
|
|
297
|
+
sum: "sum";
|
|
298
|
+
}>>;
|
|
299
|
+
transform: z.ZodOptional<z.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
|
|
300
|
+
}, z.core.$strip>>>;
|
|
301
|
+
}, z.core.$strip>;
|
|
302
|
+
/** Trace display configuration authored by users in config or eval files. */
|
|
303
|
+
type TraceDisplayInputConfig = z.infer<typeof traceDisplayInputConfigSchema>;
|
|
304
|
+
/** Schema for a persisted trace span captured during case execution. */
|
|
305
|
+
declare const traceSpanSchema: z.ZodObject<{
|
|
306
|
+
id: z.ZodString;
|
|
307
|
+
parentId: z.ZodNullable<z.ZodString>;
|
|
308
|
+
caseId: z.ZodString;
|
|
309
|
+
kind: z.ZodEnum<{
|
|
310
|
+
eval: "eval";
|
|
311
|
+
agent: "agent";
|
|
312
|
+
llm: "llm";
|
|
313
|
+
tool: "tool";
|
|
314
|
+
retrieval: "retrieval";
|
|
315
|
+
scorer: "scorer";
|
|
316
|
+
checkpoint: "checkpoint";
|
|
317
|
+
custom: "custom";
|
|
318
|
+
}>;
|
|
319
|
+
name: z.ZodString;
|
|
320
|
+
startedAt: z.ZodString;
|
|
321
|
+
endedAt: z.ZodNullable<z.ZodString>;
|
|
322
|
+
status: z.ZodEnum<{
|
|
323
|
+
error: "error";
|
|
324
|
+
running: "running";
|
|
325
|
+
ok: "ok";
|
|
326
|
+
cancelled: "cancelled";
|
|
327
|
+
}>;
|
|
328
|
+
attributes: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
329
|
+
error: z.ZodOptional<z.ZodObject<{
|
|
330
|
+
name: z.ZodOptional<z.ZodString>;
|
|
331
|
+
message: z.ZodString;
|
|
332
|
+
stack: z.ZodOptional<z.ZodString>;
|
|
333
|
+
}, z.core.$strip>>;
|
|
334
|
+
}, z.core.$strip>;
|
|
335
|
+
/** Persisted trace span shape stored for each eval case run. */
|
|
336
|
+
type EvalTraceSpan = z.infer<typeof traceSpanSchema>;
|
|
337
|
+
//#endregion
|
|
338
|
+
//#region ../shared/src/schemas/eval.d.ts
|
|
339
|
+
/** Schema for the freshness signal derived from the latest relevant run plus git state. */
|
|
340
|
+
declare const evalFreshnessStatusSchema: z.ZodEnum<{
|
|
341
|
+
fresh: "fresh";
|
|
342
|
+
stale: "stale";
|
|
343
|
+
outdated: "outdated";
|
|
344
|
+
}>;
|
|
345
|
+
/** Freshness signal derived from the latest relevant run plus git state. */
|
|
346
|
+
type EvalFreshnessStatus = z.infer<typeof evalFreshnessStatusSchema>;
|
|
347
|
+
/** Schema for the reducer used to collapse a column's per-case values into a single stat. */
|
|
348
|
+
declare const evalStatAggregateSchema: z.ZodEnum<{
|
|
349
|
+
avg: "avg";
|
|
350
|
+
min: "min";
|
|
351
|
+
max: "max";
|
|
352
|
+
sum: "sum";
|
|
353
|
+
last: "last";
|
|
354
|
+
}>;
|
|
355
|
+
/** Reducer used to collapse a column's per-case values into a single stat. */
|
|
356
|
+
type EvalStatAggregate = z.infer<typeof evalStatAggregateSchema>;
|
|
357
|
+
/**
|
|
358
|
+
* One entry in the EvalCard stats row. Built-in kinds use latest run totals;
|
|
359
|
+
* `column` aggregates a score or numeric output column across the latest run.
|
|
360
|
+
*/
|
|
361
|
+
declare const evalStatItemSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
362
|
+
kind: z.ZodLiteral<"cases">;
|
|
363
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
364
|
+
kind: z.ZodLiteral<"passRate">;
|
|
365
|
+
accent: z.ZodOptional<z.ZodBoolean>;
|
|
366
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
367
|
+
kind: z.ZodLiteral<"duration">;
|
|
368
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
369
|
+
kind: z.ZodLiteral<"column">;
|
|
370
|
+
key: z.ZodString;
|
|
371
|
+
label: z.ZodOptional<z.ZodString>;
|
|
372
|
+
aggregate: z.ZodEnum<{
|
|
373
|
+
avg: "avg";
|
|
374
|
+
min: "min";
|
|
375
|
+
max: "max";
|
|
376
|
+
sum: "sum";
|
|
377
|
+
last: "last";
|
|
378
|
+
}>;
|
|
379
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
380
|
+
number: "number";
|
|
381
|
+
boolean: "boolean";
|
|
382
|
+
file: "file";
|
|
383
|
+
duration: "duration";
|
|
384
|
+
markdown: "markdown";
|
|
385
|
+
json: "json";
|
|
386
|
+
image: "image";
|
|
387
|
+
audio: "audio";
|
|
388
|
+
video: "video";
|
|
389
|
+
percent: "percent";
|
|
390
|
+
passFail: "passFail";
|
|
391
|
+
stars: "stars";
|
|
392
|
+
}>>;
|
|
393
|
+
accent: z.ZodOptional<z.ZodBoolean>;
|
|
394
|
+
}, z.core.$strip>], "kind">;
|
|
395
|
+
/** Single stat rendered in the EvalCard stats row. */
|
|
396
|
+
type EvalStatItem = z.infer<typeof evalStatItemSchema>;
|
|
397
|
+
/** Schema for the ordered list of stats rendered in the EvalCard stats row. */
|
|
398
|
+
declare const evalStatsConfigSchema: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
399
|
+
kind: z.ZodLiteral<"cases">;
|
|
400
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
401
|
+
kind: z.ZodLiteral<"passRate">;
|
|
402
|
+
accent: z.ZodOptional<z.ZodBoolean>;
|
|
403
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
404
|
+
kind: z.ZodLiteral<"duration">;
|
|
405
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
406
|
+
kind: z.ZodLiteral<"column">;
|
|
407
|
+
key: z.ZodString;
|
|
408
|
+
label: z.ZodOptional<z.ZodString>;
|
|
409
|
+
aggregate: z.ZodEnum<{
|
|
410
|
+
avg: "avg";
|
|
411
|
+
min: "min";
|
|
412
|
+
max: "max";
|
|
413
|
+
sum: "sum";
|
|
414
|
+
last: "last";
|
|
415
|
+
}>;
|
|
416
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
417
|
+
number: "number";
|
|
418
|
+
boolean: "boolean";
|
|
419
|
+
file: "file";
|
|
420
|
+
duration: "duration";
|
|
421
|
+
markdown: "markdown";
|
|
422
|
+
json: "json";
|
|
423
|
+
image: "image";
|
|
424
|
+
audio: "audio";
|
|
425
|
+
video: "video";
|
|
426
|
+
percent: "percent";
|
|
427
|
+
passFail: "passFail";
|
|
428
|
+
stars: "stars";
|
|
429
|
+
}>>;
|
|
430
|
+
accent: z.ZodOptional<z.ZodBoolean>;
|
|
431
|
+
}, z.core.$strip>], "kind">>;
|
|
432
|
+
/** Ordered list of stats rendered in the EvalCard stats row. */
|
|
433
|
+
type EvalStatsConfig = z.infer<typeof evalStatsConfigSchema>;
|
|
434
|
+
/** Schema summarizing a discovered eval for list and overview screens. */
|
|
435
|
+
declare const evalSummarySchema: z.ZodObject<{
|
|
436
|
+
id: z.ZodString;
|
|
437
|
+
title: z.ZodOptional<z.ZodString>;
|
|
438
|
+
filePath: z.ZodString;
|
|
439
|
+
stale: z.ZodBoolean;
|
|
440
|
+
outdated: z.ZodBoolean;
|
|
441
|
+
freshnessStatus: z.ZodEnum<{
|
|
442
|
+
fresh: "fresh";
|
|
443
|
+
stale: "stale";
|
|
444
|
+
outdated: "outdated";
|
|
445
|
+
}>;
|
|
446
|
+
latestRunAt: z.ZodNullable<z.ZodString>;
|
|
447
|
+
latestRunCommitSha: z.ZodNullable<z.ZodString>;
|
|
448
|
+
currentCommitSha: z.ZodNullable<z.ZodString>;
|
|
449
|
+
columnDefs: z.ZodArray<z.ZodObject<{
|
|
450
|
+
key: z.ZodString;
|
|
451
|
+
label: z.ZodString;
|
|
452
|
+
kind: z.ZodEnum<{
|
|
453
|
+
string: "string";
|
|
454
|
+
number: "number";
|
|
455
|
+
boolean: "boolean";
|
|
456
|
+
}>;
|
|
457
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
458
|
+
number: "number";
|
|
459
|
+
boolean: "boolean";
|
|
460
|
+
file: "file";
|
|
461
|
+
duration: "duration";
|
|
462
|
+
markdown: "markdown";
|
|
463
|
+
json: "json";
|
|
464
|
+
image: "image";
|
|
465
|
+
audio: "audio";
|
|
466
|
+
video: "video";
|
|
467
|
+
percent: "percent";
|
|
468
|
+
passFail: "passFail";
|
|
469
|
+
stars: "stars";
|
|
470
|
+
}>>;
|
|
471
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
472
|
+
isScore: z.ZodOptional<z.ZodBoolean>;
|
|
473
|
+
isManualScore: z.ZodOptional<z.ZodBoolean>;
|
|
474
|
+
passThreshold: z.ZodOptional<z.ZodNumber>;
|
|
475
|
+
maxStars: z.ZodOptional<z.ZodNumber>;
|
|
476
|
+
hideInTable: z.ZodOptional<z.ZodBoolean>;
|
|
477
|
+
sortable: z.ZodOptional<z.ZodBoolean>;
|
|
478
|
+
align: z.ZodOptional<z.ZodEnum<{
|
|
479
|
+
left: "left";
|
|
480
|
+
center: "center";
|
|
481
|
+
right: "right";
|
|
482
|
+
}>>;
|
|
483
|
+
}, z.core.$strip>>;
|
|
484
|
+
caseCount: z.ZodNullable<z.ZodNumber>;
|
|
485
|
+
lastRunStatus: z.ZodNullable<z.ZodEnum<{
|
|
486
|
+
error: "error";
|
|
487
|
+
pass: "pass";
|
|
488
|
+
fail: "fail";
|
|
489
|
+
running: "running";
|
|
490
|
+
cancelled: "cancelled";
|
|
491
|
+
unscored: "unscored";
|
|
492
|
+
}>>;
|
|
493
|
+
stats: z.ZodOptional<z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
494
|
+
kind: z.ZodLiteral<"cases">;
|
|
495
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
496
|
+
kind: z.ZodLiteral<"passRate">;
|
|
497
|
+
accent: z.ZodOptional<z.ZodBoolean>;
|
|
498
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
499
|
+
kind: z.ZodLiteral<"duration">;
|
|
500
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
501
|
+
kind: z.ZodLiteral<"column">;
|
|
502
|
+
key: z.ZodString;
|
|
503
|
+
label: z.ZodOptional<z.ZodString>;
|
|
504
|
+
aggregate: z.ZodEnum<{
|
|
505
|
+
avg: "avg";
|
|
506
|
+
min: "min";
|
|
507
|
+
max: "max";
|
|
508
|
+
sum: "sum";
|
|
509
|
+
last: "last";
|
|
510
|
+
}>;
|
|
511
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
512
|
+
number: "number";
|
|
513
|
+
boolean: "boolean";
|
|
514
|
+
file: "file";
|
|
515
|
+
duration: "duration";
|
|
516
|
+
markdown: "markdown";
|
|
517
|
+
json: "json";
|
|
518
|
+
image: "image";
|
|
519
|
+
audio: "audio";
|
|
520
|
+
video: "video";
|
|
521
|
+
percent: "percent";
|
|
522
|
+
passFail: "passFail";
|
|
523
|
+
stars: "stars";
|
|
524
|
+
}>>;
|
|
525
|
+
accent: z.ZodOptional<z.ZodBoolean>;
|
|
526
|
+
}, z.core.$strip>], "kind">>>;
|
|
527
|
+
charts: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
528
|
+
heading: z.ZodOptional<z.ZodString>;
|
|
529
|
+
type: z.ZodEnum<{
|
|
530
|
+
area: "area";
|
|
531
|
+
line: "line";
|
|
532
|
+
bar: "bar";
|
|
533
|
+
}>;
|
|
534
|
+
metrics: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
535
|
+
source: z.ZodLiteral<"builtin">;
|
|
536
|
+
metric: z.ZodEnum<{
|
|
537
|
+
passRate: "passRate";
|
|
538
|
+
durationMs: "durationMs";
|
|
539
|
+
}>;
|
|
540
|
+
label: z.ZodOptional<z.ZodString>;
|
|
541
|
+
color: z.ZodOptional<z.ZodEnum<{
|
|
542
|
+
success: "success";
|
|
543
|
+
accent: "accent";
|
|
544
|
+
error: "error";
|
|
545
|
+
accentDim: "accentDim";
|
|
546
|
+
warning: "warning";
|
|
547
|
+
textMuted: "textMuted";
|
|
548
|
+
}>>;
|
|
549
|
+
axis: z.ZodOptional<z.ZodEnum<{
|
|
550
|
+
left: "left";
|
|
551
|
+
right: "right";
|
|
552
|
+
}>>;
|
|
553
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
554
|
+
source: z.ZodLiteral<"column">;
|
|
555
|
+
key: z.ZodString;
|
|
556
|
+
aggregate: z.ZodEnum<{
|
|
557
|
+
avg: "avg";
|
|
558
|
+
min: "min";
|
|
559
|
+
max: "max";
|
|
560
|
+
sum: "sum";
|
|
561
|
+
latest: "latest";
|
|
562
|
+
passThresholdRate: "passThresholdRate";
|
|
563
|
+
}>;
|
|
564
|
+
label: z.ZodOptional<z.ZodString>;
|
|
565
|
+
color: z.ZodOptional<z.ZodEnum<{
|
|
566
|
+
success: "success";
|
|
567
|
+
accent: "accent";
|
|
568
|
+
error: "error";
|
|
569
|
+
accentDim: "accentDim";
|
|
570
|
+
warning: "warning";
|
|
571
|
+
textMuted: "textMuted";
|
|
572
|
+
}>>;
|
|
573
|
+
axis: z.ZodOptional<z.ZodEnum<{
|
|
574
|
+
left: "left";
|
|
575
|
+
right: "right";
|
|
576
|
+
}>>;
|
|
577
|
+
}, z.core.$strip>], "source">>;
|
|
578
|
+
yDomain: z.ZodOptional<z.ZodObject<{
|
|
579
|
+
left: z.ZodOptional<z.ZodObject<{
|
|
580
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
581
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
582
|
+
}, z.core.$strip>>;
|
|
583
|
+
right: z.ZodOptional<z.ZodObject<{
|
|
584
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
585
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
586
|
+
}, z.core.$strip>>;
|
|
587
|
+
}, z.core.$strip>>;
|
|
588
|
+
tooltipExtras: z.ZodOptional<z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
589
|
+
source: z.ZodLiteral<"builtin">;
|
|
590
|
+
metric: z.ZodEnum<{
|
|
591
|
+
passRate: "passRate";
|
|
592
|
+
durationMs: "durationMs";
|
|
593
|
+
}>;
|
|
594
|
+
label: z.ZodOptional<z.ZodString>;
|
|
595
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
596
|
+
source: z.ZodLiteral<"column">;
|
|
597
|
+
key: z.ZodString;
|
|
598
|
+
aggregate: z.ZodEnum<{
|
|
599
|
+
avg: "avg";
|
|
600
|
+
min: "min";
|
|
601
|
+
max: "max";
|
|
602
|
+
sum: "sum";
|
|
603
|
+
latest: "latest";
|
|
604
|
+
passThresholdRate: "passThresholdRate";
|
|
605
|
+
}>;
|
|
606
|
+
label: z.ZodOptional<z.ZodString>;
|
|
607
|
+
}, z.core.$strip>], "source">>>;
|
|
608
|
+
}, z.core.$strip>>>;
|
|
609
|
+
}, z.core.$strip>;
|
|
610
|
+
/** Metadata shown for one discovered eval in the explorer UI. */
|
|
611
|
+
type EvalSummary = z.infer<typeof evalSummarySchema>;
|
|
612
|
+
/** Schema for one case row in an eval run result table. */
|
|
613
|
+
declare const caseRowSchema: z.ZodObject<{
|
|
614
|
+
caseId: z.ZodString;
|
|
615
|
+
evalId: z.ZodString;
|
|
616
|
+
status: z.ZodEnum<{
|
|
617
|
+
error: "error";
|
|
618
|
+
pass: "pass";
|
|
619
|
+
fail: "fail";
|
|
620
|
+
running: "running";
|
|
621
|
+
cancelled: "cancelled";
|
|
622
|
+
pending: "pending";
|
|
623
|
+
}>;
|
|
624
|
+
latencyMs: z.ZodNullable<z.ZodNumber>;
|
|
625
|
+
costUsd: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
626
|
+
columns: z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z.ZodUnion<readonly [z.ZodObject<{
|
|
627
|
+
source: z.ZodLiteral<"repo">;
|
|
628
|
+
path: z.ZodString;
|
|
629
|
+
mimeType: z.ZodOptional<z.ZodString>;
|
|
630
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
631
|
+
source: z.ZodLiteral<"run">;
|
|
632
|
+
artifactId: z.ZodString;
|
|
633
|
+
mimeType: z.ZodString;
|
|
634
|
+
fileName: z.ZodOptional<z.ZodString>;
|
|
635
|
+
}, z.core.$strip>]>]>>;
|
|
636
|
+
trial: z.ZodNumber;
|
|
637
|
+
}, z.core.$strip>;
|
|
638
|
+
/** Flattened per-case row rendered in run tables and streamed updates. */
|
|
639
|
+
type CaseRow = z.infer<typeof caseRowSchema>;
|
|
640
|
+
/** Schema for structured assertion failure metadata captured for one case run. */
|
|
641
|
+
declare const assertionFailureSchema: z.ZodObject<{
|
|
642
|
+
message: z.ZodString;
|
|
643
|
+
stack: z.ZodOptional<z.ZodString>;
|
|
644
|
+
}, z.core.$strip>;
|
|
645
|
+
/** Assertion failure metadata captured for one case run. */
|
|
646
|
+
type AssertionFailure = z.infer<typeof assertionFailureSchema>;
|
|
647
|
+
/** Schema for the trace payload captured while computing one score for a case. */
|
|
648
|
+
declare const scoreTraceSchema: z.ZodObject<{
|
|
649
|
+
trace: z.ZodArray<z.ZodObject<{
|
|
650
|
+
id: z.ZodString;
|
|
651
|
+
parentId: z.ZodNullable<z.ZodString>;
|
|
652
|
+
caseId: z.ZodString;
|
|
653
|
+
kind: z.ZodEnum<{
|
|
654
|
+
custom: "custom";
|
|
655
|
+
eval: "eval";
|
|
656
|
+
agent: "agent";
|
|
657
|
+
llm: "llm";
|
|
658
|
+
tool: "tool";
|
|
659
|
+
retrieval: "retrieval";
|
|
660
|
+
scorer: "scorer";
|
|
661
|
+
checkpoint: "checkpoint";
|
|
662
|
+
}>;
|
|
663
|
+
name: z.ZodString;
|
|
664
|
+
startedAt: z.ZodString;
|
|
665
|
+
endedAt: z.ZodNullable<z.ZodString>;
|
|
666
|
+
status: z.ZodEnum<{
|
|
667
|
+
error: "error";
|
|
668
|
+
running: "running";
|
|
669
|
+
cancelled: "cancelled";
|
|
670
|
+
ok: "ok";
|
|
671
|
+
}>;
|
|
672
|
+
attributes: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
673
|
+
error: z.ZodOptional<z.ZodObject<{
|
|
674
|
+
name: z.ZodOptional<z.ZodString>;
|
|
675
|
+
message: z.ZodString;
|
|
676
|
+
stack: z.ZodOptional<z.ZodString>;
|
|
677
|
+
}, z.core.$strip>>;
|
|
678
|
+
}, z.core.$strip>>;
|
|
679
|
+
traceDisplay: z.ZodObject<{
|
|
680
|
+
attributes: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
681
|
+
key: z.ZodOptional<z.ZodString>;
|
|
682
|
+
path: z.ZodString;
|
|
683
|
+
label: z.ZodOptional<z.ZodString>;
|
|
684
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
685
|
+
string: "string";
|
|
686
|
+
number: "number";
|
|
687
|
+
duration: "duration";
|
|
688
|
+
json: "json";
|
|
689
|
+
}>>;
|
|
690
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
691
|
+
placements: z.ZodOptional<z.ZodArray<z.ZodEnum<{
|
|
692
|
+
tree: "tree";
|
|
693
|
+
detail: "detail";
|
|
694
|
+
section: "section";
|
|
695
|
+
}>>>;
|
|
696
|
+
scope: z.ZodOptional<z.ZodEnum<{
|
|
697
|
+
self: "self";
|
|
698
|
+
subtree: "subtree";
|
|
699
|
+
}>>;
|
|
700
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
701
|
+
sum: "sum";
|
|
702
|
+
last: "last";
|
|
703
|
+
all: "all";
|
|
704
|
+
}>>;
|
|
705
|
+
}, z.core.$strip>>>;
|
|
706
|
+
}, z.core.$strip>;
|
|
707
|
+
}, z.core.$strip>;
|
|
708
|
+
/** Trace payload captured while computing one score for a case. */
|
|
709
|
+
type ScoreTrace = z.infer<typeof scoreTraceSchema>;
|
|
710
|
+
/** Schema for the detailed payload shown when opening a specific case. */
|
|
711
|
+
declare const caseDetailSchema: z.ZodObject<{
|
|
712
|
+
caseId: z.ZodString;
|
|
713
|
+
evalId: z.ZodString;
|
|
714
|
+
status: z.ZodEnum<{
|
|
715
|
+
error: "error";
|
|
716
|
+
pass: "pass";
|
|
717
|
+
fail: "fail";
|
|
718
|
+
running: "running";
|
|
719
|
+
cancelled: "cancelled";
|
|
720
|
+
pending: "pending";
|
|
721
|
+
}>;
|
|
722
|
+
input: z.ZodUnknown;
|
|
723
|
+
trace: z.ZodArray<z.ZodObject<{
|
|
724
|
+
id: z.ZodString;
|
|
725
|
+
parentId: z.ZodNullable<z.ZodString>;
|
|
726
|
+
caseId: z.ZodString;
|
|
727
|
+
kind: z.ZodEnum<{
|
|
728
|
+
custom: "custom";
|
|
729
|
+
eval: "eval";
|
|
730
|
+
agent: "agent";
|
|
731
|
+
llm: "llm";
|
|
732
|
+
tool: "tool";
|
|
733
|
+
retrieval: "retrieval";
|
|
734
|
+
scorer: "scorer";
|
|
735
|
+
checkpoint: "checkpoint";
|
|
736
|
+
}>;
|
|
737
|
+
name: z.ZodString;
|
|
738
|
+
startedAt: z.ZodString;
|
|
739
|
+
endedAt: z.ZodNullable<z.ZodString>;
|
|
740
|
+
status: z.ZodEnum<{
|
|
741
|
+
error: "error";
|
|
742
|
+
running: "running";
|
|
743
|
+
cancelled: "cancelled";
|
|
744
|
+
ok: "ok";
|
|
745
|
+
}>;
|
|
746
|
+
attributes: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
747
|
+
error: z.ZodOptional<z.ZodObject<{
|
|
748
|
+
name: z.ZodOptional<z.ZodString>;
|
|
749
|
+
message: z.ZodString;
|
|
750
|
+
stack: z.ZodOptional<z.ZodString>;
|
|
751
|
+
}, z.core.$strip>>;
|
|
752
|
+
}, z.core.$strip>>;
|
|
753
|
+
traceDisplay: z.ZodObject<{
|
|
754
|
+
attributes: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
755
|
+
key: z.ZodOptional<z.ZodString>;
|
|
756
|
+
path: z.ZodString;
|
|
757
|
+
label: z.ZodOptional<z.ZodString>;
|
|
758
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
759
|
+
string: "string";
|
|
760
|
+
number: "number";
|
|
761
|
+
duration: "duration";
|
|
762
|
+
json: "json";
|
|
763
|
+
}>>;
|
|
764
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
765
|
+
placements: z.ZodOptional<z.ZodArray<z.ZodEnum<{
|
|
766
|
+
tree: "tree";
|
|
767
|
+
detail: "detail";
|
|
768
|
+
section: "section";
|
|
769
|
+
}>>>;
|
|
770
|
+
scope: z.ZodOptional<z.ZodEnum<{
|
|
771
|
+
self: "self";
|
|
772
|
+
subtree: "subtree";
|
|
773
|
+
}>>;
|
|
774
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
775
|
+
sum: "sum";
|
|
776
|
+
last: "last";
|
|
777
|
+
all: "all";
|
|
778
|
+
}>>;
|
|
779
|
+
}, z.core.$strip>>>;
|
|
780
|
+
}, z.core.$strip>;
|
|
781
|
+
scoringTraces: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
782
|
+
trace: z.ZodArray<z.ZodObject<{
|
|
783
|
+
id: z.ZodString;
|
|
784
|
+
parentId: z.ZodNullable<z.ZodString>;
|
|
785
|
+
caseId: z.ZodString;
|
|
786
|
+
kind: z.ZodEnum<{
|
|
787
|
+
custom: "custom";
|
|
788
|
+
eval: "eval";
|
|
789
|
+
agent: "agent";
|
|
790
|
+
llm: "llm";
|
|
791
|
+
tool: "tool";
|
|
792
|
+
retrieval: "retrieval";
|
|
793
|
+
scorer: "scorer";
|
|
794
|
+
checkpoint: "checkpoint";
|
|
795
|
+
}>;
|
|
796
|
+
name: z.ZodString;
|
|
797
|
+
startedAt: z.ZodString;
|
|
798
|
+
endedAt: z.ZodNullable<z.ZodString>;
|
|
799
|
+
status: z.ZodEnum<{
|
|
800
|
+
error: "error";
|
|
801
|
+
running: "running";
|
|
802
|
+
cancelled: "cancelled";
|
|
803
|
+
ok: "ok";
|
|
804
|
+
}>;
|
|
805
|
+
attributes: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
806
|
+
error: z.ZodOptional<z.ZodObject<{
|
|
807
|
+
name: z.ZodOptional<z.ZodString>;
|
|
808
|
+
message: z.ZodString;
|
|
809
|
+
stack: z.ZodOptional<z.ZodString>;
|
|
810
|
+
}, z.core.$strip>>;
|
|
811
|
+
}, z.core.$strip>>;
|
|
812
|
+
traceDisplay: z.ZodObject<{
|
|
813
|
+
attributes: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
814
|
+
key: z.ZodOptional<z.ZodString>;
|
|
815
|
+
path: z.ZodString;
|
|
816
|
+
label: z.ZodOptional<z.ZodString>;
|
|
817
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
818
|
+
string: "string";
|
|
819
|
+
number: "number";
|
|
820
|
+
duration: "duration";
|
|
821
|
+
json: "json";
|
|
822
|
+
}>>;
|
|
823
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
824
|
+
placements: z.ZodOptional<z.ZodArray<z.ZodEnum<{
|
|
825
|
+
tree: "tree";
|
|
826
|
+
detail: "detail";
|
|
827
|
+
section: "section";
|
|
828
|
+
}>>>;
|
|
829
|
+
scope: z.ZodOptional<z.ZodEnum<{
|
|
830
|
+
self: "self";
|
|
831
|
+
subtree: "subtree";
|
|
832
|
+
}>>;
|
|
833
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
834
|
+
sum: "sum";
|
|
835
|
+
last: "last";
|
|
836
|
+
all: "all";
|
|
837
|
+
}>>;
|
|
838
|
+
}, z.core.$strip>>>;
|
|
839
|
+
}, z.core.$strip>;
|
|
840
|
+
}, z.core.$strip>>>;
|
|
841
|
+
columns: z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodType<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown, z.core.$ZodTypeInternals<string | number | boolean | Record<string, unknown> | unknown[] | null, unknown>>, z.ZodUnion<readonly [z.ZodObject<{
|
|
842
|
+
source: z.ZodLiteral<"repo">;
|
|
843
|
+
path: z.ZodString;
|
|
844
|
+
mimeType: z.ZodOptional<z.ZodString>;
|
|
845
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
846
|
+
source: z.ZodLiteral<"run">;
|
|
847
|
+
artifactId: z.ZodString;
|
|
848
|
+
mimeType: z.ZodString;
|
|
849
|
+
fileName: z.ZodOptional<z.ZodString>;
|
|
850
|
+
}, z.core.$strip>]>]>>;
|
|
851
|
+
assertionFailures: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
|
|
852
|
+
message: z.ZodString;
|
|
853
|
+
stack: z.ZodOptional<z.ZodString>;
|
|
854
|
+
}, z.core.$strip>, z.ZodPipe<z.ZodString, z.ZodTransform<{
|
|
855
|
+
message: string;
|
|
856
|
+
stack?: string | undefined;
|
|
857
|
+
}, string>>]>>;
|
|
858
|
+
error: z.ZodNullable<z.ZodObject<{
|
|
859
|
+
name: z.ZodOptional<z.ZodString>;
|
|
860
|
+
message: z.ZodString;
|
|
861
|
+
stack: z.ZodOptional<z.ZodString>;
|
|
862
|
+
}, z.core.$strip>>;
|
|
863
|
+
trial: z.ZodNumber;
|
|
864
|
+
}, z.core.$strip>;
|
|
865
|
+
/** Full case payload including inputs, trace, outputs, and failures. */
|
|
866
|
+
type CaseDetail = z.infer<typeof caseDetailSchema>;
|
|
867
|
+
//#endregion
|
|
868
|
+
//#region ../shared/src/schemas/chart.d.ts
|
|
869
|
+
/** Chart type rendered for a single eval history chart. */
|
|
870
|
+
declare const evalChartTypeSchema: z.ZodEnum<{
|
|
871
|
+
area: "area";
|
|
872
|
+
line: "line";
|
|
873
|
+
bar: "bar";
|
|
874
|
+
}>;
|
|
875
|
+
/** Chart type rendered for a single eval history chart. */
|
|
876
|
+
type EvalChartType = z.infer<typeof evalChartTypeSchema>;
|
|
877
|
+
/**
|
|
878
|
+
* Run-level metric sourced from the aggregated `RunSummary` for a run, rather
|
|
879
|
+
* than from a per-case column.
|
|
880
|
+
*/
|
|
881
|
+
declare const evalChartBuiltinMetricSchema: z.ZodEnum<{
|
|
882
|
+
passRate: "passRate";
|
|
883
|
+
durationMs: "durationMs";
|
|
884
|
+
}>;
|
|
885
|
+
/**
|
|
886
|
+
* Run-level metric sourced from the aggregated `RunSummary` for a run, rather
|
|
887
|
+
* than from a per-case column.
|
|
888
|
+
*/
|
|
889
|
+
type EvalChartBuiltinMetric = z.infer<typeof evalChartBuiltinMetricSchema>;
|
|
890
|
+
/** Reducer applied to a numeric column across all cases of a single run. */
|
|
891
|
+
declare const evalChartAggregateSchema: z.ZodEnum<{
|
|
892
|
+
avg: "avg";
|
|
893
|
+
sum: "sum";
|
|
894
|
+
min: "min";
|
|
895
|
+
max: "max";
|
|
896
|
+
latest: "latest";
|
|
897
|
+
passThresholdRate: "passThresholdRate";
|
|
898
|
+
}>;
|
|
899
|
+
/** Reducer applied to a numeric column across all cases of a single run. */
|
|
900
|
+
type EvalChartAggregate = z.infer<typeof evalChartAggregateSchema>;
|
|
901
|
+
/**
|
|
902
|
+
* Semantic color token resolved to a theme color by the web UI. The SDK does
|
|
903
|
+
* not emit raw hex so authored evals stay decoupled from the web theme.
|
|
904
|
+
*/
|
|
905
|
+
declare const evalChartColorSchema: z.ZodEnum<{
|
|
906
|
+
accent: "accent";
|
|
907
|
+
accentDim: "accentDim";
|
|
908
|
+
success: "success";
|
|
909
|
+
error: "error";
|
|
910
|
+
warning: "warning";
|
|
911
|
+
textMuted: "textMuted";
|
|
912
|
+
}>;
|
|
913
|
+
/** Semantic color token resolved to a theme color by the web UI. */
|
|
914
|
+
type EvalChartColor = z.infer<typeof evalChartColorSchema>;
|
|
915
|
+
/** Y-axis placement for a plotted series on a dual-axis chart. */
|
|
916
|
+
declare const evalChartAxisSchema: z.ZodEnum<{
|
|
917
|
+
left: "left";
|
|
918
|
+
right: "right";
|
|
919
|
+
}>;
|
|
920
|
+
/** Y-axis placement for a plotted series on a dual-axis chart. */
|
|
921
|
+
type EvalChartAxis = z.infer<typeof evalChartAxisSchema>;
|
|
922
|
+
/**
|
|
923
|
+
* One plotted series on an eval history chart. `builtin` metrics come from the
|
|
924
|
+
* per-run `RunSummary`; `column` metrics aggregate a per-case score or
|
|
925
|
+
* `setEvalOutput` column across the run using `aggregate`.
|
|
926
|
+
*/
|
|
927
|
+
declare const evalChartMetricSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
928
|
+
source: z.ZodLiteral<"builtin">;
|
|
929
|
+
metric: z.ZodEnum<{
|
|
930
|
+
passRate: "passRate";
|
|
931
|
+
durationMs: "durationMs";
|
|
932
|
+
}>;
|
|
933
|
+
label: z.ZodOptional<z.ZodString>;
|
|
934
|
+
color: z.ZodOptional<z.ZodEnum<{
|
|
935
|
+
accent: "accent";
|
|
936
|
+
accentDim: "accentDim";
|
|
937
|
+
success: "success";
|
|
938
|
+
error: "error";
|
|
939
|
+
warning: "warning";
|
|
940
|
+
textMuted: "textMuted";
|
|
941
|
+
}>>;
|
|
942
|
+
axis: z.ZodOptional<z.ZodEnum<{
|
|
943
|
+
left: "left";
|
|
944
|
+
right: "right";
|
|
945
|
+
}>>;
|
|
946
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
947
|
+
source: z.ZodLiteral<"column">;
|
|
948
|
+
key: z.ZodString;
|
|
949
|
+
aggregate: z.ZodEnum<{
|
|
950
|
+
avg: "avg";
|
|
951
|
+
sum: "sum";
|
|
952
|
+
min: "min";
|
|
953
|
+
max: "max";
|
|
954
|
+
latest: "latest";
|
|
955
|
+
passThresholdRate: "passThresholdRate";
|
|
956
|
+
}>;
|
|
957
|
+
label: z.ZodOptional<z.ZodString>;
|
|
958
|
+
color: z.ZodOptional<z.ZodEnum<{
|
|
959
|
+
accent: "accent";
|
|
960
|
+
accentDim: "accentDim";
|
|
961
|
+
success: "success";
|
|
962
|
+
error: "error";
|
|
963
|
+
warning: "warning";
|
|
964
|
+
textMuted: "textMuted";
|
|
965
|
+
}>>;
|
|
966
|
+
axis: z.ZodOptional<z.ZodEnum<{
|
|
967
|
+
left: "left";
|
|
968
|
+
right: "right";
|
|
969
|
+
}>>;
|
|
970
|
+
}, z.core.$strip>], "source">;
|
|
971
|
+
/** One plotted series on an eval history chart. */
|
|
972
|
+
type EvalChartMetric = z.infer<typeof evalChartMetricSchema>;
|
|
973
|
+
/** Extra field rendered only in the tooltip, not plotted as a series. */
|
|
974
|
+
declare const evalChartTooltipExtraSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
975
|
+
source: z.ZodLiteral<"builtin">;
|
|
976
|
+
metric: z.ZodEnum<{
|
|
977
|
+
passRate: "passRate";
|
|
978
|
+
durationMs: "durationMs";
|
|
979
|
+
}>;
|
|
980
|
+
label: z.ZodOptional<z.ZodString>;
|
|
981
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
982
|
+
source: z.ZodLiteral<"column">;
|
|
983
|
+
key: z.ZodString;
|
|
984
|
+
aggregate: z.ZodEnum<{
|
|
985
|
+
avg: "avg";
|
|
986
|
+
sum: "sum";
|
|
987
|
+
min: "min";
|
|
988
|
+
max: "max";
|
|
989
|
+
latest: "latest";
|
|
990
|
+
passThresholdRate: "passThresholdRate";
|
|
991
|
+
}>;
|
|
992
|
+
label: z.ZodOptional<z.ZodString>;
|
|
993
|
+
}, z.core.$strip>], "source">;
|
|
994
|
+
/** Extra field rendered only in the tooltip, not plotted as a series. */
|
|
995
|
+
type EvalChartTooltipExtra = z.infer<typeof evalChartTooltipExtraSchema>;
|
|
996
|
+
/**
|
|
997
|
+
* Authored configuration for one eval history chart rendered in `EvalCard`.
|
|
998
|
+
* Authors declare a list of these via `EvalDefinition.charts` — the UI renders
|
|
999
|
+
* each entry as its own chart frame, stacked in authoring order.
|
|
1000
|
+
*/
|
|
1001
|
+
declare const evalChartConfigSchema: z.ZodObject<{
|
|
1002
|
+
heading: z.ZodOptional<z.ZodString>;
|
|
1003
|
+
type: z.ZodEnum<{
|
|
1004
|
+
area: "area";
|
|
1005
|
+
line: "line";
|
|
1006
|
+
bar: "bar";
|
|
1007
|
+
}>;
|
|
1008
|
+
metrics: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
1009
|
+
source: z.ZodLiteral<"builtin">;
|
|
1010
|
+
metric: z.ZodEnum<{
|
|
1011
|
+
passRate: "passRate";
|
|
1012
|
+
durationMs: "durationMs";
|
|
1013
|
+
}>;
|
|
1014
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1015
|
+
color: z.ZodOptional<z.ZodEnum<{
|
|
1016
|
+
accent: "accent";
|
|
1017
|
+
accentDim: "accentDim";
|
|
1018
|
+
success: "success";
|
|
1019
|
+
error: "error";
|
|
1020
|
+
warning: "warning";
|
|
1021
|
+
textMuted: "textMuted";
|
|
1022
|
+
}>>;
|
|
1023
|
+
axis: z.ZodOptional<z.ZodEnum<{
|
|
1024
|
+
left: "left";
|
|
1025
|
+
right: "right";
|
|
1026
|
+
}>>;
|
|
1027
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1028
|
+
source: z.ZodLiteral<"column">;
|
|
1029
|
+
key: z.ZodString;
|
|
1030
|
+
aggregate: z.ZodEnum<{
|
|
1031
|
+
avg: "avg";
|
|
1032
|
+
sum: "sum";
|
|
1033
|
+
min: "min";
|
|
1034
|
+
max: "max";
|
|
1035
|
+
latest: "latest";
|
|
1036
|
+
passThresholdRate: "passThresholdRate";
|
|
1037
|
+
}>;
|
|
1038
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1039
|
+
color: z.ZodOptional<z.ZodEnum<{
|
|
1040
|
+
accent: "accent";
|
|
1041
|
+
accentDim: "accentDim";
|
|
1042
|
+
success: "success";
|
|
1043
|
+
error: "error";
|
|
1044
|
+
warning: "warning";
|
|
1045
|
+
textMuted: "textMuted";
|
|
1046
|
+
}>>;
|
|
1047
|
+
axis: z.ZodOptional<z.ZodEnum<{
|
|
1048
|
+
left: "left";
|
|
1049
|
+
right: "right";
|
|
1050
|
+
}>>;
|
|
1051
|
+
}, z.core.$strip>], "source">>;
|
|
1052
|
+
yDomain: z.ZodOptional<z.ZodObject<{
|
|
1053
|
+
left: z.ZodOptional<z.ZodObject<{
|
|
1054
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
1055
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
1056
|
+
}, z.core.$strip>>;
|
|
1057
|
+
right: z.ZodOptional<z.ZodObject<{
|
|
1058
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
1059
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
1060
|
+
}, z.core.$strip>>;
|
|
1061
|
+
}, z.core.$strip>>;
|
|
1062
|
+
tooltipExtras: z.ZodOptional<z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
1063
|
+
source: z.ZodLiteral<"builtin">;
|
|
1064
|
+
metric: z.ZodEnum<{
|
|
1065
|
+
passRate: "passRate";
|
|
1066
|
+
durationMs: "durationMs";
|
|
1067
|
+
}>;
|
|
1068
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1069
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1070
|
+
source: z.ZodLiteral<"column">;
|
|
1071
|
+
key: z.ZodString;
|
|
1072
|
+
aggregate: z.ZodEnum<{
|
|
1073
|
+
avg: "avg";
|
|
1074
|
+
sum: "sum";
|
|
1075
|
+
min: "min";
|
|
1076
|
+
max: "max";
|
|
1077
|
+
latest: "latest";
|
|
1078
|
+
passThresholdRate: "passThresholdRate";
|
|
1079
|
+
}>;
|
|
1080
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1081
|
+
}, z.core.$strip>], "source">>>;
|
|
1082
|
+
}, z.core.$strip>;
|
|
1083
|
+
/** Authored configuration for one eval history chart. */
|
|
1084
|
+
type EvalChartConfig = z.infer<typeof evalChartConfigSchema>;
|
|
1085
|
+
/**
|
|
1086
|
+
* Ordered list of history charts rendered for an eval. Opt-in: when omitted or
|
|
1087
|
+
* empty, the UI renders no history chart at all.
|
|
1088
|
+
*/
|
|
1089
|
+
declare const evalChartsConfigSchema: z.ZodArray<z.ZodObject<{
|
|
1090
|
+
heading: z.ZodOptional<z.ZodString>;
|
|
1091
|
+
type: z.ZodEnum<{
|
|
1092
|
+
area: "area";
|
|
1093
|
+
line: "line";
|
|
1094
|
+
bar: "bar";
|
|
1095
|
+
}>;
|
|
1096
|
+
metrics: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
1097
|
+
source: z.ZodLiteral<"builtin">;
|
|
1098
|
+
metric: z.ZodEnum<{
|
|
1099
|
+
passRate: "passRate";
|
|
1100
|
+
durationMs: "durationMs";
|
|
1101
|
+
}>;
|
|
1102
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1103
|
+
color: z.ZodOptional<z.ZodEnum<{
|
|
1104
|
+
accent: "accent";
|
|
1105
|
+
accentDim: "accentDim";
|
|
1106
|
+
success: "success";
|
|
1107
|
+
error: "error";
|
|
1108
|
+
warning: "warning";
|
|
1109
|
+
textMuted: "textMuted";
|
|
1110
|
+
}>>;
|
|
1111
|
+
axis: z.ZodOptional<z.ZodEnum<{
|
|
1112
|
+
left: "left";
|
|
1113
|
+
right: "right";
|
|
1114
|
+
}>>;
|
|
1115
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1116
|
+
source: z.ZodLiteral<"column">;
|
|
1117
|
+
key: z.ZodString;
|
|
1118
|
+
aggregate: z.ZodEnum<{
|
|
1119
|
+
avg: "avg";
|
|
1120
|
+
sum: "sum";
|
|
1121
|
+
min: "min";
|
|
1122
|
+
max: "max";
|
|
1123
|
+
latest: "latest";
|
|
1124
|
+
passThresholdRate: "passThresholdRate";
|
|
1125
|
+
}>;
|
|
1126
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1127
|
+
color: z.ZodOptional<z.ZodEnum<{
|
|
1128
|
+
accent: "accent";
|
|
1129
|
+
accentDim: "accentDim";
|
|
1130
|
+
success: "success";
|
|
1131
|
+
error: "error";
|
|
1132
|
+
warning: "warning";
|
|
1133
|
+
textMuted: "textMuted";
|
|
1134
|
+
}>>;
|
|
1135
|
+
axis: z.ZodOptional<z.ZodEnum<{
|
|
1136
|
+
left: "left";
|
|
1137
|
+
right: "right";
|
|
1138
|
+
}>>;
|
|
1139
|
+
}, z.core.$strip>], "source">>;
|
|
1140
|
+
yDomain: z.ZodOptional<z.ZodObject<{
|
|
1141
|
+
left: z.ZodOptional<z.ZodObject<{
|
|
1142
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
1143
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
1144
|
+
}, z.core.$strip>>;
|
|
1145
|
+
right: z.ZodOptional<z.ZodObject<{
|
|
1146
|
+
min: z.ZodOptional<z.ZodNumber>;
|
|
1147
|
+
max: z.ZodOptional<z.ZodNumber>;
|
|
1148
|
+
}, z.core.$strip>>;
|
|
1149
|
+
}, z.core.$strip>>;
|
|
1150
|
+
tooltipExtras: z.ZodOptional<z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
1151
|
+
source: z.ZodLiteral<"builtin">;
|
|
1152
|
+
metric: z.ZodEnum<{
|
|
1153
|
+
passRate: "passRate";
|
|
1154
|
+
durationMs: "durationMs";
|
|
1155
|
+
}>;
|
|
1156
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1157
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1158
|
+
source: z.ZodLiteral<"column">;
|
|
1159
|
+
key: z.ZodString;
|
|
1160
|
+
aggregate: z.ZodEnum<{
|
|
1161
|
+
avg: "avg";
|
|
1162
|
+
sum: "sum";
|
|
1163
|
+
min: "min";
|
|
1164
|
+
max: "max";
|
|
1165
|
+
latest: "latest";
|
|
1166
|
+
passThresholdRate: "passThresholdRate";
|
|
1167
|
+
}>;
|
|
1168
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1169
|
+
}, z.core.$strip>], "source">>>;
|
|
1170
|
+
}, z.core.$strip>>;
|
|
1171
|
+
/** Ordered list of history charts rendered for an eval. */
|
|
1172
|
+
type EvalChartsConfig = z.infer<typeof evalChartsConfigSchema>;
|
|
1173
|
+
//#endregion
|
|
1174
|
+
//#region ../shared/src/schemas/run.d.ts
|
|
1175
|
+
/** Schema for persisted metadata about a single run invocation. */
|
|
1176
|
+
declare const runManifestSchema: z.ZodObject<{
|
|
1177
|
+
id: z.ZodString;
|
|
1178
|
+
shortId: z.ZodString;
|
|
1179
|
+
status: z.ZodEnum<{
|
|
1180
|
+
pending: "pending";
|
|
1181
|
+
running: "running";
|
|
1182
|
+
completed: "completed";
|
|
1183
|
+
cancelled: "cancelled";
|
|
1184
|
+
error: "error";
|
|
1185
|
+
}>;
|
|
1186
|
+
startedAt: z.ZodString;
|
|
1187
|
+
endedAt: z.ZodNullable<z.ZodString>;
|
|
1188
|
+
commitSha: z.ZodDefault<z.ZodOptional<z.ZodNullable<z.ZodString>>>;
|
|
1189
|
+
evalSourceFingerprints: z.ZodDefault<z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>>;
|
|
1190
|
+
target: z.ZodObject<{
|
|
1191
|
+
mode: z.ZodEnum<{
|
|
1192
|
+
all: "all";
|
|
1193
|
+
evalIds: "evalIds";
|
|
1194
|
+
caseIds: "caseIds";
|
|
1195
|
+
}>;
|
|
1196
|
+
evalIds: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
1197
|
+
caseIds: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
1198
|
+
}, z.core.$strip>;
|
|
1199
|
+
trials: z.ZodNumber;
|
|
1200
|
+
trialSelection: z.ZodDefault<z.ZodOptional<z.ZodEnum<{
|
|
1201
|
+
lowestScore: "lowestScore";
|
|
1202
|
+
median: "median";
|
|
1203
|
+
}>>>;
|
|
1204
|
+
cacheMode: z.ZodOptional<z.ZodEnum<{
|
|
1205
|
+
use: "use";
|
|
1206
|
+
bypass: "bypass";
|
|
1207
|
+
refresh: "refresh";
|
|
1208
|
+
}>>;
|
|
1209
|
+
}, z.core.$strip>;
|
|
1210
|
+
/** Persisted lifecycle metadata for a single eval run. */
|
|
1211
|
+
type RunManifest = z.infer<typeof runManifestSchema>;
|
|
1212
|
+
/** Schema for aggregate metrics computed over a completed or active run. */
|
|
1213
|
+
declare const runSummarySchema: z.ZodObject<{
|
|
1214
|
+
runId: z.ZodString;
|
|
1215
|
+
status: z.ZodEnum<{
|
|
1216
|
+
pending: "pending";
|
|
1217
|
+
running: "running";
|
|
1218
|
+
completed: "completed";
|
|
1219
|
+
cancelled: "cancelled";
|
|
1220
|
+
error: "error";
|
|
1221
|
+
}>;
|
|
1222
|
+
totalCases: z.ZodNumber;
|
|
1223
|
+
passedCases: z.ZodNumber;
|
|
1224
|
+
failedCases: z.ZodNumber;
|
|
1225
|
+
errorCases: z.ZodNumber;
|
|
1226
|
+
cancelledCases: z.ZodNumber;
|
|
1227
|
+
totalDurationMs: z.ZodNullable<z.ZodNumber>;
|
|
1228
|
+
errorMessage: z.ZodDefault<z.ZodNullable<z.ZodString>>;
|
|
1229
|
+
}, z.core.$strip>;
|
|
1230
|
+
/** Roll-up statistics for one run. */
|
|
1231
|
+
type RunSummary = z.infer<typeof runSummarySchema>;
|
|
1232
|
+
//#endregion
|
|
1233
|
+
//#region ../shared/src/status.d.ts
|
|
1234
|
+
/**
|
|
1235
|
+
* Canonical derived result status used for aggregated displays and propagation
|
|
1236
|
+
* across case, eval, file, folder, and run result views.
|
|
1237
|
+
*/
|
|
1238
|
+
type DerivedStatus = 'pending' | 'running' | 'pass' | 'fail' | 'error' | 'cancelled';
|
|
1239
|
+
/**
|
|
1240
|
+
* Aggregate summary derived from a scoped set of case rows.
|
|
1241
|
+
*
|
|
1242
|
+
* This is intentionally separate from `RunSummary`: it represents a summary
|
|
1243
|
+
* over any slice of case rows, such as a single eval within a run.
|
|
1244
|
+
*/
|
|
1245
|
+
type ScopedCaseSummary = {
|
|
1246
|
+
status: DerivedStatus;
|
|
1247
|
+
totalCases: number;
|
|
1248
|
+
passedCases: number;
|
|
1249
|
+
failedCases: number;
|
|
1250
|
+
errorCases: number;
|
|
1251
|
+
cancelledCases: number;
|
|
1252
|
+
pendingCases: number;
|
|
1253
|
+
runningCases: number;
|
|
1254
|
+
totalDurationMs: number | null;
|
|
1255
|
+
};
|
|
1256
|
+
/** Raw lifecycle status taken from a `RunManifest`, or `null`/`undefined` when no lifecycle status is supplied. */
type RunLifecycleStatus = RunManifest['status'] | null | undefined;
|
|
1257
|
+
/**
|
|
1258
|
+
* Derive an aggregate status from child statuses, optionally allowing a raw run
|
|
1259
|
+
 * lifecycle status to override active or terminal states such as `running`,
|
|
1260
|
+
* `cancelled`, and `error`.
|
|
1261
|
+
*/
|
|
1262
|
+
declare function deriveStatusFromChildStatuses(params: {
|
|
1263
|
+
statuses: Iterable<DerivedStatus | null | undefined>;
|
|
1264
|
+
lifecycleStatus?: RunLifecycleStatus;
|
|
1265
|
+
}): DerivedStatus;
|
|
1266
|
+
/**
|
|
1267
|
+
* Derive an aggregate status from a scoped set of case rows.
|
|
1268
|
+
*
|
|
1269
|
+
* Pass `lifecycleStatus` only when the parent scope's raw run lifecycle should
|
|
1270
|
+
* override the derived child result, such as for a whole-run display.
|
|
1271
|
+
*/
|
|
1272
|
+
declare function deriveStatusFromCaseRows(params: {
|
|
1273
|
+
caseRows: Iterable<Pick<CaseRow, 'status'>>;
|
|
1274
|
+
lifecycleStatus?: RunLifecycleStatus;
|
|
1275
|
+
}): DerivedStatus;
|
|
1276
|
+
/**
|
|
1277
|
+
* Derive counts, aggregate metrics, and display status from a scoped set of
|
|
1278
|
+
* case rows.
|
|
1279
|
+
*/
|
|
1280
|
+
declare function deriveScopedSummaryFromCases(params: {
|
|
1281
|
+
caseRows: Iterable<CaseRow>;
|
|
1282
|
+
lifecycleStatus?: RunLifecycleStatus;
|
|
1283
|
+
}): ScopedCaseSummary;
|
|
1284
|
+
//#endregion
|
|
1285
|
+
//#region ../shared/src/evalStatus.d.ts
|
|
1286
|
+
/** Display status used for eval, file, and folder UI surfaces. */
|
|
1287
|
+
type EvalDisplayStatus = DerivedStatus | 'stale' | 'outdated' | 'unscored';
|
|
1288
|
+
/**
|
|
1289
|
+
* Derive the user-facing eval status from the raw latest run result plus
|
|
1290
|
+
* freshness state.
|
|
1291
|
+
*/
|
|
1292
|
+
declare function getEvalDisplayStatus(params: {
|
|
1293
|
+
freshnessStatus: EvalFreshnessStatus;
|
|
1294
|
+
stale: boolean;
|
|
1295
|
+
outdated: boolean;
|
|
1296
|
+
lastRunStatus: 'pass' | 'fail' | 'error' | 'running' | 'cancelled' | 'unscored' | null;
|
|
1297
|
+
isRunning?: boolean;
|
|
1298
|
+
}): EvalDisplayStatus;
|
|
1299
|
+
//#endregion
|
|
1300
|
+
//#region ../shared/src/evalTitle.d.ts
|
|
1301
|
+
/** Minimal eval shape accepted by `getEvalTitle`: the stable `id` plus an optional authored `title`. */
type EvalTitleLike = {
|
|
1302
|
+
id: string;
|
|
1303
|
+
title?: string;
|
|
1304
|
+
};
|
|
1305
|
+
/**
|
|
1306
|
+
* Resolve the display title for an eval.
|
|
1307
|
+
*
|
|
1308
|
+
* Returns the authored `title` when present; otherwise derives a human-readable
|
|
1309
|
+
* label from the stable eval `id` so display surfaces can avoid repeating both
|
|
1310
|
+
* fields in common cases.
|
|
1311
|
+
*/
|
|
1312
|
+
declare function getEvalTitle(evalLike: EvalTitleLike): string;
|
|
1313
|
+
//#endregion
|
|
1314
|
+
//#region ../shared/src/schemas/sse.d.ts
|
|
1315
|
+
/** Schema enumerating the server-sent event names declared for discovery, run, case, and trace updates. */
declare const sseEventTypeSchema: z.ZodEnum<{
|
|
1316
|
+
"discovery.updated": "discovery.updated";
|
|
1317
|
+
"run.started": "run.started";
|
|
1318
|
+
"run.summary": "run.summary";
|
|
1319
|
+
"case.started": "case.started";
|
|
1320
|
+
"case.updated": "case.updated";
|
|
1321
|
+
"case.finished": "case.finished";
|
|
1322
|
+
"trace.span": "trace.span";
|
|
1323
|
+
"run.finished": "run.finished";
|
|
1324
|
+
"run.cancelled": "run.cancelled";
|
|
1325
|
+
"run.error": "run.error";
|
|
1326
|
+
}>;
|
|
1327
|
+
/** Server-sent event name emitted by the runner or backend. */
|
|
1328
|
+
type SseEventType = z.infer<typeof sseEventTypeSchema>;
|
|
1329
|
+
/** Schema for the SSE envelope used to stream run updates to clients. */
|
|
1330
|
+
declare const sseEnvelopeSchema: z.ZodObject<{
|
|
1331
|
+
type: z.ZodString;
|
|
1332
|
+
runId: z.ZodOptional<z.ZodString>;
|
|
1333
|
+
timestamp: z.ZodString;
|
|
1334
|
+
payload: z.ZodUnknown;
|
|
1335
|
+
}, z.core.$strip>;
|
|
1336
|
+
/** Wire format for a streamed event emitted during eval execution. */
|
|
1337
|
+
type SseEnvelope = z.infer<typeof sseEnvelopeSchema>;
|
|
1338
|
+
//#endregion
|
|
1339
|
+
//#region ../shared/src/schemas/api.d.ts
|
|
1340
|
+
/** Schema for the API request that starts a new eval run. */
|
|
1341
|
+
declare const createRunRequestSchema: z.ZodObject<{
|
|
1342
|
+
target: z.ZodObject<{
|
|
1343
|
+
mode: z.ZodEnum<{
|
|
1344
|
+
all: "all";
|
|
1345
|
+
evalIds: "evalIds";
|
|
1346
|
+
caseIds: "caseIds";
|
|
1347
|
+
}>;
|
|
1348
|
+
evalIds: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
1349
|
+
caseIds: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
1350
|
+
}, z.core.$strip>;
|
|
1351
|
+
trials: z.ZodNumber;
|
|
1352
|
+
cache: z.ZodOptional<z.ZodObject<{
|
|
1353
|
+
mode: z.ZodDefault<z.ZodEnum<{
|
|
1354
|
+
use: "use";
|
|
1355
|
+
bypass: "bypass";
|
|
1356
|
+
refresh: "refresh";
|
|
1357
|
+
}>>;
|
|
1358
|
+
}, z.core.$strip>>;
|
|
1359
|
+
}, z.core.$strip>;
|
|
1360
|
+
/** Request payload accepted by the run creation endpoint. */
|
|
1361
|
+
type CreateRunRequest = z.infer<typeof createRunRequestSchema>;
|
|
1362
|
+
/** Schema for updating a UI-authored manual score on one persisted case. */
|
|
1363
|
+
declare const updateManualScoreRequestSchema: z.ZodObject<{
|
|
1364
|
+
value: z.ZodNullable<z.ZodNumber>;
|
|
1365
|
+
}, z.core.$strip>;
|
|
1366
|
+
/** Request payload accepted by the manual score update endpoint. */
|
|
1367
|
+
type UpdateManualScoreRequest = z.infer<typeof updateManualScoreRequestSchema>;
|
|
1368
|
+
//#endregion
|
|
1369
|
+
//#region ../shared/src/schemas/config.d.ts
|
|
1370
|
+
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
1371
|
+
declare const trialSelectionModeSchema: z.ZodEnum<{
|
|
1372
|
+
lowestScore: "lowestScore";
|
|
1373
|
+
median: "median";
|
|
1374
|
+
}>;
|
|
1375
|
+
/** Strategy used to collapse repeated trials into one stored case result. */
|
|
1376
|
+
type TrialSelectionMode = z.infer<typeof trialSelectionModeSchema>;
|
|
1377
|
+
/** Top-level config authored in `agent-evals.config.ts`. */
|
|
1378
|
+
type AgentEvalsConfig = {
|
|
1379
|
+
/** Root directory used to resolve all relative paths. Defaults to `process.cwd()`. */workspaceRoot?: string; /** Glob patterns (relative to `workspaceRoot`) used to discover eval files. */
|
|
1380
|
+
include: string[]; /** Number of trials per case when none is specified. Defaults to `1`. */
|
|
1381
|
+
defaultTrials?: number;
|
|
1382
|
+
/**
|
|
1383
|
+
 * Strategy used to pick the single persisted result when `trials > 1`.
|
|
1384
|
+
 *
|
|
1385
|
+
 * `lowestScore` is the default. `median` uses the lower median when the
|
|
1386
|
+
 * number of trials is even.
|
|
1387
|
+
 */
|
|
1388
|
+
trialSelection?: TrialSelectionMode;
|
|
1389
|
+
/**
|
|
1390
|
+
 * Maximum number of case executions that may run in parallel across one run,
|
|
1391
|
+
 * including trial fan-out. Defaults to `2`.
|
|
1392
|
+
 */
|
|
1393
|
+
concurrency?: number;
|
|
1394
|
+
/**
|
|
1395
|
+
 * Age threshold, in days, after which the latest run from a different commit is
|
|
1396
|
+
 * considered outdated. Defaults to `14`.
|
|
1397
|
+
 */
|
|
1398
|
+
staleAfterDays?: number;
|
|
1399
|
+
/**
|
|
1400
|
+
 * Global trace attribute display config for the UI.
|
|
1401
|
+
 *
|
|
1402
|
+
 * These rules are merged with per-eval `traceDisplay` rules, with the eval
|
|
1403
|
+
 * definition taking precedence for matching `key` or `path` entries.
|
|
1404
|
+
 */
|
|
1405
|
+
traceDisplay?: TraceDisplayInputConfig;
|
|
1406
|
+
/**
|
|
1407
|
+
 * Optional controls for the operation cache. When omitted, the cache is
|
|
1408
|
+
 * enabled and stored under `<workspaceRoot>/.agent-evals/cache`.
|
|
1409
|
+
 */
|
|
1410
|
+
cache?: {
|
|
1411
|
+
/** Set to `false` to disable the cache entirely; spans with `cache` options then execute as if uncached. */enabled?: boolean; /** Override the directory used to persist cache entries. */
|
|
1412
|
+
dir?: string;
|
|
1413
|
+
};
|
|
1414
|
+
};
|
|
1415
|
+
/** Zod schema for validating `agent-evals.config.ts` input. */
|
|
1416
|
+
declare const agentEvalsConfigSchema: z.ZodObject<{
|
|
1417
|
+
workspaceRoot: z.ZodOptional<z.ZodString>;
|
|
1418
|
+
include: z.ZodArray<z.ZodString>;
|
|
1419
|
+
defaultTrials: z.ZodOptional<z.ZodNumber>;
|
|
1420
|
+
trialSelection: z.ZodOptional<z.ZodEnum<{
|
|
1421
|
+
lowestScore: "lowestScore";
|
|
1422
|
+
median: "median";
|
|
1423
|
+
}>>;
|
|
1424
|
+
concurrency: z.ZodOptional<z.ZodNumber>;
|
|
1425
|
+
staleAfterDays: z.ZodOptional<z.ZodNumber>;
|
|
1426
|
+
traceDisplay: z.ZodOptional<z.ZodObject<{
|
|
1427
|
+
attributes: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
1428
|
+
key: z.ZodOptional<z.ZodString>;
|
|
1429
|
+
path: z.ZodString;
|
|
1430
|
+
label: z.ZodOptional<z.ZodString>;
|
|
1431
|
+
format: z.ZodOptional<z.ZodEnum<{
|
|
1432
|
+
string: "string";
|
|
1433
|
+
number: "number";
|
|
1434
|
+
duration: "duration";
|
|
1435
|
+
json: "json";
|
|
1436
|
+
}>>;
|
|
1437
|
+
numberFormat: z.ZodOptional<z.ZodType<NumberDisplayOptions, unknown, z.core.$ZodTypeInternals<NumberDisplayOptions, unknown>>>;
|
|
1438
|
+
placements: z.ZodOptional<z.ZodArray<z.ZodEnum<{
|
|
1439
|
+
tree: "tree";
|
|
1440
|
+
detail: "detail";
|
|
1441
|
+
section: "section";
|
|
1442
|
+
}>>>;
|
|
1443
|
+
scope: z.ZodOptional<z.ZodEnum<{
|
|
1444
|
+
self: "self";
|
|
1445
|
+
subtree: "subtree";
|
|
1446
|
+
}>>;
|
|
1447
|
+
mode: z.ZodOptional<z.ZodEnum<{
|
|
1448
|
+
all: "all";
|
|
1449
|
+
last: "last";
|
|
1450
|
+
sum: "sum";
|
|
1451
|
+
}>>;
|
|
1452
|
+
transform: z.ZodOptional<z.ZodCustom<TraceAttributeTransform, TraceAttributeTransform>>;
|
|
1453
|
+
}, z.core.$strip>>>;
|
|
1454
|
+
}, z.core.$strip>>;
|
|
1455
|
+
cache: z.ZodOptional<z.ZodObject<{
|
|
1456
|
+
enabled: z.ZodOptional<z.ZodBoolean>;
|
|
1457
|
+
dir: z.ZodOptional<z.ZodString>;
|
|
1458
|
+
}, z.core.$strip>>;
|
|
1459
|
+
}, z.core.$strip>;
|
|
1460
|
+
//#endregion
|
|
1461
|
+
//#region ../shared/src/schemas/cache.d.ts
|
|
1462
|
+
/**
|
|
1463
|
+
* Mode that controls how the cache is consulted for a given run.
|
|
1464
|
+
*
|
|
1465
|
+
* - `use`: read cache on hit, write on miss. Default.
|
|
1466
|
+
* - `bypass`: never read, never write.
|
|
1467
|
+
* - `refresh`: never read, always write (forces re-execution and overwrites).
*
* The matching TypeScript union is exported as `CacheMode`.
|
|
1468
|
+
*/
|
|
1469
|
+
declare const cacheModeSchema: z.ZodEnum<{
|
|
1470
|
+
use: "use";
|
|
1471
|
+
bypass: "bypass";
|
|
1472
|
+
refresh: "refresh";
|
|
1473
|
+
}>;
|
|
1474
|
+
/**
* Mode controlling how cached spans behave during a run.
*
* Inferred from `cacheModeSchema`: `'use' | 'bypass' | 'refresh'`.
*/
|
|
1475
|
+
type CacheMode = z.infer<typeof cacheModeSchema>;
|
|
1476
|
+
/**
* Options accepted by an `evalTracer.span` call to opt the span into caching.
*
* - `key`: cache key; any value is accepted (`z.ZodUnknown`).
* - `namespace`: optional string namespace.
*/
|
|
1477
|
+
declare const spanCacheOptionsSchema: z.ZodObject<{
|
|
1478
|
+
key: z.ZodUnknown;
|
|
1479
|
+
namespace: z.ZodOptional<z.ZodString>;
|
|
1480
|
+
}, z.core.$strip>;
|
|
1481
|
+
/**
* Inferred type of `spanCacheOptionsSchema`.
*
* This is the type of the `cache` field on `TraceSpanInfoCached`, i.e. the
* options that opt an `evalTracer.span` call into caching.
*/
|
|
1482
|
+
type SpanCacheOptions = z.infer<typeof spanCacheOptionsSchema>;
|
|
1483
|
+
/**
* Summary of a single persisted cache entry, used by list/delete endpoints.
*
* Carries the metadata fields of `cacheEntrySchema` (`key`, `namespace`,
* `spanName`, `spanKind`, `storedAt`, `codeFingerprint`) plus `sizeBytes`;
* it omits the entry's `version` and `recording`.
*/
|
|
1484
|
+
declare const cacheListItemSchema: z.ZodObject<{
|
|
1485
|
+
key: z.ZodString;
|
|
1486
|
+
namespace: z.ZodString;
|
|
1487
|
+
spanName: z.ZodString;
|
|
1488
|
+
spanKind: z.ZodEnum<{
|
|
1489
|
+
eval: "eval";
|
|
1490
|
+
agent: "agent";
|
|
1491
|
+
llm: "llm";
|
|
1492
|
+
tool: "tool";
|
|
1493
|
+
retrieval: "retrieval";
|
|
1494
|
+
scorer: "scorer";
|
|
1495
|
+
checkpoint: "checkpoint";
|
|
1496
|
+
custom: "custom";
|
|
1497
|
+
}>;
|
|
1498
|
+
storedAt: z.ZodString;
|
|
1499
|
+
codeFingerprint: z.ZodString;
|
|
1500
|
+
sizeBytes: z.ZodNumber;
|
|
1501
|
+
}, z.core.$strip>;
|
|
1502
|
+
/** Summary row for a single cache entry (inferred from `cacheListItemSchema`). */
|
|
1503
|
+
type CacheListItem = z.infer<typeof cacheListItemSchema>;
|
|
1504
|
+
/**
* Serialized nested span captured while recording a cached operation.
*
* The type is recursive: `children` holds further `SerializedCacheSpan`
* values. `attributes` and `error` are optional; `error.message` is the only
* required field of `error`.
*/
|
|
1505
|
+
type SerializedCacheSpan = {
|
|
1506
|
+
kind: TraceSpanKind;
|
|
1507
|
+
name: string;
|
|
1508
|
+
attributes?: Record<string, unknown>;
|
|
1509
|
+
status: 'running' | 'ok' | 'error' | 'cancelled';
|
|
1510
|
+
error?: {
|
|
1511
|
+
name?: string;
|
|
1512
|
+
message: string;
|
|
1513
|
+
stack?: string;
|
|
1514
|
+
};
|
|
1515
|
+
children: SerializedCacheSpan[];
|
|
1516
|
+
};
|
|
1517
|
+
/** Zod schema for the recursive `SerializedCacheSpan` type, defined lazily so it can refer to itself. */
|
|
1518
|
+
declare const serializedCacheSpanSchema: z.ZodType<SerializedCacheSpan>;
|
|
1519
|
+
/**
|
|
1520
|
+
* One captured operation performed while a cached span's body executed.
|
|
1521
|
+
*
|
|
1522
|
+
* Operations are replayed in order against a fresh scope on cache hit to
|
|
1523
|
+
* reproduce the observable effects of the original run.
*
* Discriminated on `kind`:
* - `setOutput`: `key` plus an arbitrary `value`.
* - `incrementOutput`: `key` plus a numeric `delta`.
* - `checkpoint`: `name` plus arbitrary `data`.
* - `subSpan`: a nested `SerializedCacheSpan` in `span`.
|
|
1524
|
+
*/
|
|
1525
|
+
declare const cacheRecordingOpSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
1526
|
+
kind: z.ZodLiteral<"setOutput">;
|
|
1527
|
+
key: z.ZodString;
|
|
1528
|
+
value: z.ZodUnknown;
|
|
1529
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1530
|
+
kind: z.ZodLiteral<"incrementOutput">;
|
|
1531
|
+
key: z.ZodString;
|
|
1532
|
+
delta: z.ZodNumber;
|
|
1533
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1534
|
+
kind: z.ZodLiteral<"checkpoint">;
|
|
1535
|
+
name: z.ZodString;
|
|
1536
|
+
data: z.ZodUnknown;
|
|
1537
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1538
|
+
kind: z.ZodLiteral<"subSpan">;
|
|
1539
|
+
span: z.ZodType<SerializedCacheSpan, unknown, z.core.$ZodTypeInternals<SerializedCacheSpan, unknown>>;
|
|
1540
|
+
}, z.core.$strip>], "kind">;
|
|
1541
|
+
/** Single effect captured by a cache recording (inferred from `cacheRecordingOpSchema`). */
|
|
1542
|
+
type CacheRecordingOp = z.infer<typeof cacheRecordingOpSchema>;
|
|
1543
|
+
/**
* Captured observable effects + return value of a cached span body.
*
* - `returnValue`: arbitrary value (`z.ZodUnknown`).
* - `finalAttributes`: string-keyed record of arbitrary values.
* - `ops`: array of recorded operations, using the same four variants as
*   `cacheRecordingOpSchema`.
*/
|
|
1544
|
+
declare const cacheRecordingSchema: z.ZodObject<{
|
|
1545
|
+
returnValue: z.ZodUnknown;
|
|
1546
|
+
finalAttributes: z.ZodRecord<z.ZodString, z.ZodUnknown>;
|
|
1547
|
+
ops: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
1548
|
+
kind: z.ZodLiteral<"setOutput">;
|
|
1549
|
+
key: z.ZodString;
|
|
1550
|
+
value: z.ZodUnknown;
|
|
1551
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1552
|
+
kind: z.ZodLiteral<"incrementOutput">;
|
|
1553
|
+
key: z.ZodString;
|
|
1554
|
+
delta: z.ZodNumber;
|
|
1555
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1556
|
+
kind: z.ZodLiteral<"checkpoint">;
|
|
1557
|
+
name: z.ZodString;
|
|
1558
|
+
data: z.ZodUnknown;
|
|
1559
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1560
|
+
kind: z.ZodLiteral<"subSpan">;
|
|
1561
|
+
span: z.ZodType<SerializedCacheSpan, unknown, z.core.$ZodTypeInternals<SerializedCacheSpan, unknown>>;
|
|
1562
|
+
}, z.core.$strip>], "kind">>;
|
|
1563
|
+
}, z.core.$strip>;
|
|
1564
|
+
/** Captured observable effects + return value of a cached span body (inferred from `cacheRecordingSchema`). */
|
|
1565
|
+
type CacheRecording = z.infer<typeof cacheRecordingSchema>;
|
|
1566
|
+
/**
* Persisted cache file containing metadata and a recording.
*
* `version` is pinned to the literal `1`. The metadata fields (`key`,
* `namespace`, `spanName`, `spanKind`, `storedAt`, `codeFingerprint`) sit
* alongside `recording`, which has the same shape as `cacheRecordingSchema`.
*/
|
|
1567
|
+
declare const cacheEntrySchema: z.ZodObject<{
|
|
1568
|
+
version: z.ZodLiteral<1>;
|
|
1569
|
+
key: z.ZodString;
|
|
1570
|
+
namespace: z.ZodString;
|
|
1571
|
+
spanName: z.ZodString;
|
|
1572
|
+
spanKind: z.ZodEnum<{
|
|
1573
|
+
eval: "eval";
|
|
1574
|
+
agent: "agent";
|
|
1575
|
+
llm: "llm";
|
|
1576
|
+
tool: "tool";
|
|
1577
|
+
retrieval: "retrieval";
|
|
1578
|
+
scorer: "scorer";
|
|
1579
|
+
checkpoint: "checkpoint";
|
|
1580
|
+
custom: "custom";
|
|
1581
|
+
}>;
|
|
1582
|
+
storedAt: z.ZodString;
|
|
1583
|
+
codeFingerprint: z.ZodString;
|
|
1584
|
+
recording: z.ZodObject<{
|
|
1585
|
+
returnValue: z.ZodUnknown;
|
|
1586
|
+
finalAttributes: z.ZodRecord<z.ZodString, z.ZodUnknown>;
|
|
1587
|
+
ops: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
1588
|
+
kind: z.ZodLiteral<"setOutput">;
|
|
1589
|
+
key: z.ZodString;
|
|
1590
|
+
value: z.ZodUnknown;
|
|
1591
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1592
|
+
kind: z.ZodLiteral<"incrementOutput">;
|
|
1593
|
+
key: z.ZodString;
|
|
1594
|
+
delta: z.ZodNumber;
|
|
1595
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1596
|
+
kind: z.ZodLiteral<"checkpoint">;
|
|
1597
|
+
name: z.ZodString;
|
|
1598
|
+
data: z.ZodUnknown;
|
|
1599
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
1600
|
+
kind: z.ZodLiteral<"subSpan">;
|
|
1601
|
+
span: z.ZodType<SerializedCacheSpan, unknown, z.core.$ZodTypeInternals<SerializedCacheSpan, unknown>>;
|
|
1602
|
+
}, z.core.$strip>], "kind">>;
|
|
1603
|
+
}, z.core.$strip>;
|
|
1604
|
+
}, z.core.$strip>;
|
|
1605
|
+
/** Persisted cache file contents (inferred from `cacheEntrySchema`). */
|
|
1606
|
+
type CacheEntry = z.infer<typeof cacheEntrySchema>;
|
|
1607
|
+
//#endregion
|
|
1608
|
+
//#region ../sdk/src/types.d.ts
|
|
1609
|
+
/**
* Single authored eval case with its stable identifier and input payload.
*
* @typeParam TInput Type of the case's `input` payload.
*/
|
|
1610
|
+
type EvalCase<TInput> = {
|
|
1611
|
+
id: string;
|
|
1612
|
+
input: TInput;
|
|
1613
|
+
/** Optional list of string tags attached to the case. */
tags?: string[];
|
|
1614
|
+
};
|
|
1615
|
+
/** UI overrides for a derived or scored column emitted by an eval. All fields are optional. */
|
|
1616
|
+
type EvalColumnOverride = {
|
|
1617
|
+
/** Display label shown for the column in tables and detail views. */
label?: string;
|
|
1618
|
+
/**
|
|
1619
|
+
* Presentation preset for the value.
|
|
1620
|
+
*
|
|
1621
|
+
* Use this to control how the UI renders the cell and infer table behavior,
|
|
1622
|
+
* for example `number`, `boolean`, `duration`, `markdown`, `json`, or
|
|
1623
|
+
* file/media previews.
|
|
1624
|
+
*/
|
|
1625
|
+
format?: ColumnFormat;
|
|
1626
|
+
/**
|
|
1627
|
+
* Extra options for `format: 'number'`.
|
|
1628
|
+
*
|
|
1629
|
+
* Use this to add a prefix or suffix, force a fixed number of decimal
|
|
1630
|
+
* places, or switch to compact notation such as `1.2K`.
|
|
1631
|
+
*/
|
|
1632
|
+
numberFormat?: NumberDisplayOptions;
|
|
1633
|
+
/**
|
|
1634
|
+
* Hides the column from the runs table while keeping it available in detail
|
|
1635
|
+
* views and raw output data.
|
|
1636
|
+
*/
|
|
1637
|
+
hideInTable?: boolean;
|
|
1638
|
+
/** Whether the UI should allow sorting rows by this column. */
sortable?: boolean;
|
|
1639
|
+
/** Horizontal alignment used when rendering the column cells. */
align?: 'left' | 'center' | 'right';
|
|
1640
|
+
/**
|
|
1641
|
+
* Maximum number of stars used when `format: 'stars'`.
|
|
1642
|
+
*
|
|
1643
|
+
* Values are still stored as normalized `0..1` numbers; the UI maps the
|
|
1644
|
+
* selected star count evenly across that range.
|
|
1645
|
+
*/
|
|
1646
|
+
maxStars?: number;
|
|
1647
|
+
};
|
|
1648
|
+
/** Column override map keyed by output or score field name; each value is an `EvalColumnOverride`. */
|
|
1649
|
+
type EvalColumns = Record<string, EvalColumnOverride>;
|
|
1650
|
+
/**
* Query helpers built from the flattened trace recorded for one eval case.
*
* This is the return type of `buildTraceTree(spans, checkpoints)`.
* NOTE(review): the lookup semantics of `findSpan` (for example which span
* wins when several share a name) are not visible from this declaration.
*/
|
|
1651
|
+
type EvalTraceTree = {
|
|
1652
|
+
spans: EvalTraceSpan[];
|
|
1653
|
+
rootSpans: EvalTraceSpan[];
|
|
1654
|
+
findSpan: (name: string) => EvalTraceSpan | undefined;
|
|
1655
|
+
findSpansByKind: (kind: EvalTraceSpan['kind']) => EvalTraceSpan[];
|
|
1656
|
+
flattenDfs: () => EvalTraceSpan[];
|
|
1657
|
+
checkpoints: Map<string, unknown>;
|
|
1658
|
+
};
|
|
1659
|
+
/**
* Context passed to an eval's `execute` function for a single case run.
*
* @typeParam TInput Type of the case input.
*/
|
|
1660
|
+
type EvalExecuteContext<TInput> = {
|
|
1661
|
+
input: TInput;
|
|
1662
|
+
/** Abort signal for the case — presumably aborted when the run is cancelled; confirm against the runner. */
signal: AbortSignal;
|
|
1663
|
+
};
|
|
1664
|
+
/**
* Context passed to `deriveFromTracing` after execution has completed.
*
* Provides the recorded `trace`, the case `input`, and the full `case`.
*
* @typeParam TInput Type of the case input.
*/
|
|
1665
|
+
type EvalDeriveContext<TInput> = {
|
|
1666
|
+
trace: EvalTraceTree;
|
|
1667
|
+
input: TInput;
|
|
1668
|
+
case: EvalCase<TInput>;
|
|
1669
|
+
};
|
|
1670
|
+
/**
* Context passed to score functions after outputs have been collected.
*
* Provides the case `input`, the collected `outputs` as a string-keyed
* record, and the full `case`.
*
* @typeParam TInput Type of the case input.
*/
|
|
1671
|
+
type EvalScoreContext<TInput> = {
|
|
1672
|
+
input: TInput;
|
|
1673
|
+
outputs: Record<string, unknown>;
|
|
1674
|
+
case: EvalCase<TInput>;
|
|
1675
|
+
};
|
|
1676
|
+
/** Score callback that computes a numeric result for one case; it may return the number directly or a promise of it. */
|
|
1677
|
+
type EvalScoreFn<TInput> = (ctx: EvalScoreContext<TInput>) => number | Promise<number>;
|
|
1678
|
+
/**
|
|
1679
|
+
* Score definition accepted by `defineEval`, with optional UI metadata.
*
* Either a bare `EvalScoreFn`, or an object with a `compute` function, an
* optional `passThreshold`, and any `EvalColumnOverride` fields.
|
|
1680
|
+
*
|
|
1681
|
+
* When `passThreshold` is provided, this score gates the case pass/fail:
|
|
1682
|
+
* a case fails if its computed value is strictly below the threshold. A
|
|
1683
|
+
* score without a `passThreshold` is informational only and never causes
|
|
1684
|
+
* a case to fail on its own.
|
|
1685
|
+
*/
|
|
1686
|
+
type EvalScoreDef<TInput> = EvalScoreFn<TInput> | ({
|
|
1687
|
+
compute: EvalScoreFn<TInput>;
|
|
1688
|
+
passThreshold?: number;
|
|
1689
|
+
} & EvalColumnOverride);
|
|
1690
|
+
/**
|
|
1691
|
+
* Manual score definition accepted by `defineEval`.
|
|
1692
|
+
*
|
|
1693
|
+
* Manual scores are emitted as score columns with pending values during CLI
|
|
1694
|
+
* execution. The web UI is responsible for setting their normalized `0..1`
|
|
1695
|
+
* values after a run completes.
*
* Structurally this is every `EvalColumnOverride` field plus an optional
* `passThreshold`.
|
|
1696
|
+
*/
|
|
1697
|
+
type EvalManualScoreDef = EvalColumnOverride & {
|
|
1698
|
+
/**
|
|
1699
|
+
* Optional pass/fail gate applied after a value is filled. Pending manual
|
|
1700
|
+
* values keep the eval in an `unscored` state instead of failing the case.
|
|
1701
|
+
*/
|
|
1702
|
+
passThreshold?: number;
|
|
1703
|
+
};
|
|
1704
|
+
/**
* Complete authored eval definition consumed by `defineEval`.
*
* Only `id` and `execute` are required; every other field is optional.
*
* @typeParam TInput Type of each case's `input`; defaults to `unknown`.
*/
|
|
1705
|
+
type EvalDefinition<TInput = unknown> = {
|
|
1706
|
+
id: string;
|
|
1707
|
+
title?: string;
|
|
1708
|
+
/**
|
|
1709
|
+
* Authored cases for this eval.
|
|
1710
|
+
*
|
|
1711
|
+
* When omitted or resolved to an empty array, the runner still executes the
|
|
1712
|
+
* eval once using a synthetic case with empty object input.
*
* May be given as an array or as an async factory returning the array.
|
|
1713
|
+
*/
|
|
1714
|
+
cases?: EvalCase<TInput>[] | (() => Promise<EvalCase<TInput>[]>);
|
|
1715
|
+
/** UI overrides keyed by output or score field name. */
columns?: EvalColumns;
|
|
1716
|
+
/**
|
|
1717
|
+
* Per-eval trace attribute display rules for the UI.
|
|
1718
|
+
*
|
|
1719
|
+
* These are merged with the global `AgentEvalsConfig.traceDisplay` rules.
|
|
1720
|
+
* Matching entries override the global rule by `key`, or by `path` when no
|
|
1721
|
+
* `key` is provided.
|
|
1722
|
+
*/
|
|
1723
|
+
traceDisplay?: TraceDisplayInputConfig;
|
|
1724
|
+
/** Body of the eval, invoked with the case `input` and an abort `signal`; may be sync or async. */
execute: (ctx: EvalExecuteContext<TInput>) => Promise<void> | void;
|
|
1725
|
+
/** Optional hook that receives the recorded trace after execution and returns a string-keyed record (sync or async). */
deriveFromTracing?: (ctx: EvalDeriveContext<TInput>) => Record<string, unknown> | Promise<Record<string, unknown>>;
|
|
1726
|
+
/** Computed score definitions keyed by score name. */
scores?: Record<string, EvalScoreDef<TInput>>;
|
|
1727
|
+
/**
|
|
1728
|
+
* Score columns whose values are entered in the web UI after a run.
|
|
1729
|
+
*
|
|
1730
|
+
* Keys become persisted score columns, initialized as pending (`null`) for
|
|
1731
|
+
* every case. Once filled, values are normalized numbers in the `0..1`
|
|
1732
|
+
* range and participate in summaries, stats, charts, and pass thresholds
|
|
1733
|
+
* like computed scores.
|
|
1734
|
+
*/
|
|
1735
|
+
manualScores?: Record<string, EvalManualScoreDef>;
|
|
1736
|
+
/**
|
|
1737
|
+
* Optional stats row configuration for the EvalCard in the web UI.
|
|
1738
|
+
*
|
|
1739
|
+
* Opt-in: when omitted (or empty) the EvalCard renders no stats row at all.
|
|
1740
|
+
* When provided, the stats render in order, left to right.
|
|
1741
|
+
*
|
|
1742
|
+
* Built-in kinds (`cases`, `passRate`, `duration`, `cost`) read from the
|
|
1743
|
+
* latest run summary. `kind: 'column'` aggregates a score or numeric output
|
|
1744
|
+
* column across the latest run's cases — `key` must match one of the eval's
|
|
1745
|
+
* score or column keys, and only finite numeric values participate in the
|
|
1746
|
+
* reduction. When no case has a numeric value for the key the stat renders
|
|
1747
|
+
* an em dash. `label` and `format` default to the matching `ColumnDef`.
|
|
1748
|
+
*/
|
|
1749
|
+
stats?: EvalStatsConfig;
|
|
1750
|
+
/**
|
|
1751
|
+
* Optional history chart configuration for the EvalCard in the web UI.
|
|
1752
|
+
*
|
|
1753
|
+
* Opt-in: when omitted (or empty) the EvalCard renders no history chart at
|
|
1754
|
+
* all. Each entry in the list renders as its own chart frame, stacked in
|
|
1755
|
+
* authoring order.
|
|
1756
|
+
*
|
|
1757
|
+
* Each chart declares its `type` (`area | line | bar`) and one or more
|
|
1758
|
+
* `metrics`. Built-in metrics (`passRate`, `durationMs`) aggregate
|
|
1759
|
+
* the run summary. Column metrics aggregate a score or numeric `setEvalOutput`
|
|
1760
|
+
* column across the run using an `aggregate` reducer (`avg`, `sum`, `min`,
|
|
1761
|
+
* `max`, `latest`, `passThresholdRate`). `passThresholdRate` requires a
|
|
1762
|
+
* score column with `passThreshold`.
|
|
1763
|
+
*/
|
|
1764
|
+
charts?: EvalChartsConfig;
|
|
1765
|
+
};
|
|
1766
|
+
//#endregion
|
|
1767
|
+
//#region ../sdk/src/defineEval.d.ts
|
|
1768
|
+
/**
|
|
1769
|
+
* Registered eval metadata tracked by the SDK during module loading.
|
|
1770
|
+
*
|
|
1771
|
+
* Consumers usually access these entries through `getEvalRegistry()`.
|
|
1772
|
+
*/
|
|
1773
|
+
type EvalRegistryEntry = {
|
|
1774
|
+
id: string;
|
|
1775
|
+
title?: string;
|
|
1776
|
+
/**
* Accessor for the typed definition: `fn` must be generic over `TInput`,
* receives an `EvalDefinition<TInput>`, and its result is returned.
* Presumably `fn` is called with the registered definition — confirm
* against the implementation.
*/
use: <R>(fn: <TInput>(def: EvalDefinition<TInput>) => R) => R;
|
|
1777
|
+
};
|
|
1778
|
+
/**
* Return the in-memory registry of evals defined in the current process.
*
* @returns Map of string keys to `EvalRegistryEntry` — presumably keyed by eval `id`; confirm.
*/
|
|
1779
|
+
declare function getEvalRegistry(): Map<string, EvalRegistryEntry>;
|
|
1780
|
+
/**
|
|
1781
|
+
* Register an eval definition with the SDK so the runner can discover it
|
|
1782
|
+
* after importing the eval module.
*
* @param definition Authored eval definition to register.
|
|
1783
|
+
*/
|
|
1784
|
+
declare function defineEval<TInput>(definition: EvalDefinition<TInput>): void;
|
|
1785
|
+
//#endregion
|
|
1786
|
+
//#region ../sdk/src/repoFile.d.ts
|
|
1787
|
+
/**
|
|
1788
|
+
* Create a file reference that can be emitted via `setEvalOutput(...)` and rendered
|
|
1789
|
+
* by a column configured with `format: 'image' | 'audio' | 'video' | 'file'`.
|
|
1790
|
+
*
|
|
1791
|
+
* @param path Relative or absolute path to the repository file.
|
|
1792
|
+
* @param mimeType Optional MIME type hint for UI rendering.
|
|
1793
|
+
* @returns A repo-backed file reference (`RepoFileRef`, whose `source` is `'repo'`) suitable for file/media columns.
|
|
1794
|
+
*/
|
|
1795
|
+
declare function repoFile(path: string, mimeType?: string): RepoFileRef;
|
|
1796
|
+
//#endregion
|
|
1797
|
+
//#region ../sdk/src/runtime.d.ts
|
|
1798
|
+
/**
|
|
1799
|
+
* Adapter used by the SDK to read and write cache entries for cached spans.
|
|
1800
|
+
*
|
|
1801
|
+
* Implementations are typically injected by the runner before the eval case
|
|
1802
|
+
* starts executing.
|
|
1803
|
+
*/
|
|
1804
|
+
type CacheAdapter = {
|
|
1805
|
+
/** Return the stored entry for `keyHash` under `namespace`, or `null`. */
lookup(namespace: string, keyHash: string): Promise<CacheEntry | null>;
|
|
1806
|
+
/** Persist a cache entry. Must be safe under concurrent calls. */
write(entry: CacheEntry): Promise<void>;
|
|
1807
|
+
};
|
|
1808
|
+
/** Runner-supplied cache context attached to an eval case scope. */
|
|
1809
|
+
type CacheScopeContext = {
|
|
1810
|
+
adapter: CacheAdapter;
|
|
1811
|
+
mode: CacheMode;
|
|
1812
|
+
evalId: string;
|
|
1813
|
+
/** Hash of the eval source file; used to invalidate on code changes. */
codeFingerprint: string;
|
|
1814
|
+
};
|
|
1815
|
+
/** Active recording frame captured while a cached span body executes. */
|
|
1816
|
+
type CacheRecordingFrame = {
|
|
1817
|
+
/** Length of `scope.spans` immediately before the cached body started. */
baseSpanIndex: number;
|
|
1818
|
+
/** Id of the cached span that owns this recording. */
cachedSpanId: string;
|
|
1819
|
+
/** Ordered observable effects recorded during the cached body. */
ops: CacheRecordingOp[];
|
|
1820
|
+
};
|
|
1821
|
+
/** Mutable per-case runtime state stored in async local storage. */
|
|
1822
|
+
type EvalCaseScope = {
|
|
1823
|
+
caseId: string;
|
|
1824
|
+
outputs: Record<string, unknown>;
|
|
1825
|
+
/** Structured assertion failures recorded for the current case. */
assertionFailures: AssertionFailure[];
|
|
1826
|
+
spans: EvalTraceSpan[];
|
|
1827
|
+
checkpoints: Map<string, unknown>;
|
|
1828
|
+
spanStack: string[];
|
|
1829
|
+
activeSpanStack: EvalTraceSpan[];
|
|
1830
|
+
/**
|
|
1831
|
+
* Stack of active cache recorders. Ops are written to the top-most frame
|
|
1832
|
+
* when it exists and `replayingDepth === 0`.
|
|
1833
|
+
*/
|
|
1834
|
+
recordingStack: CacheRecordingFrame[];
|
|
1835
|
+
/**
|
|
1836
|
+
* Incremented while replaying a cached span, so nested SDK calls do not
|
|
1837
|
+
* accidentally double-record ops into outer recorders.
|
|
1838
|
+
*/
|
|
1839
|
+
replayingDepth: number;
|
|
1840
|
+
/** Runner-provided cache adapter + mode; absent when caching is disabled. */
cacheContext: CacheScopeContext | undefined;
|
|
1841
|
+
};
|
|
1842
|
+
/** Error thrown when an eval assertion fails during case execution. Extends the built-in `Error`. */
|
|
1843
|
+
declare class EvalAssertionError extends Error {
|
|
1844
|
+
/** @param message Message describing the failed assertion. */
constructor(message: string);
|
|
1845
|
+
}
|
|
1846
|
+
/**
* Return the current eval scope for the active async context, if any.
*
* @returns The active `EvalCaseScope`, or `undefined` when there is none.
*/
|
|
1847
|
+
declare function getCurrentScope(): EvalCaseScope | undefined;
|
|
1848
|
+
/**
|
|
1849
|
+
* Return whether the current async execution is inside an active eval case.
|
|
1850
|
+
*
|
|
1851
|
+
* This is useful for shared workflow code that wants to branch on eval-only
|
|
1852
|
+
* behavior without importing or inspecting the full eval scope.
*
* @returns `true` when inside an active eval case, otherwise `false`.
|
|
1853
|
+
*/
|
|
1854
|
+
declare function isInEvalScope(): boolean;
|
|
1855
|
+
/**
|
|
1856
|
+
* Attach cache context (adapter, mode, eval id, fingerprint) to a scope.
|
|
1857
|
+
*
|
|
1858
|
+
* Runner-internal helper called immediately before the user's `execute`
|
|
1859
|
+
* function runs inside `runInEvalScope`.
*
* @param scope Case scope that receives the cache context.
* @param context Cache context to attach.
|
|
1860
|
+
*/
|
|
1861
|
+
declare function setScopeCacheContext(scope: EvalCaseScope, context: CacheScopeContext): void;
|
|
1862
|
+
/** Optional inputs accepted when starting a new eval case scope. */
|
|
1863
|
+
type RunInEvalScopeOptions = {
|
|
1864
|
+
/** Cache adapter + mode attached to the scope before `fn` runs. */
cacheContext?: CacheScopeContext;
|
|
1865
|
+
};
|
|
1866
|
+
/**
|
|
1867
|
+
* Execute a callback inside a fresh eval case scope and capture its outputs,
|
|
1868
|
+
* trace data, and terminal error state.
*
* @param caseId Id of the case the scope is created for.
* @param fn Callback to run inside the scope; may be sync or async.
* @param options Optional settings such as the cache context.
* @returns Promise of `{ result, scope, error }`. `result` and `error` may
* each be `undefined` — presumably `result` is `undefined` when `fn` failed
* and `error` is `undefined` when it succeeded; confirm against the
* implementation.
|
|
1869
|
+
*/
|
|
1870
|
+
declare function runInEvalScope<T>(caseId: string, fn: () => Promise<T> | T, options?: RunInEvalScopeOptions): Promise<{
|
|
1871
|
+
result: T | undefined;
|
|
1872
|
+
scope: EvalCaseScope;
|
|
1873
|
+
error: Error | undefined;
|
|
1874
|
+
}>;
|
|
1875
|
+
/**
|
|
1876
|
+
* Record or replace an output value for the current case scope.
|
|
1877
|
+
*
|
|
1878
|
+
* Supported values include scalars, JSON-safe objects/arrays, explicit file
|
|
1879
|
+
* refs, and native `Blob`/`File` instances for media or file columns.
*
* @param key Output key to set.
* @param value Value stored under `key`.
|
|
1880
|
+
*/
|
|
1881
|
+
declare function setEvalOutput(key: string, value: unknown): void;
|
|
1882
|
+
/**
|
|
1883
|
+
* Add a numeric delta to an output value in the current case scope.
|
|
1884
|
+
*
|
|
1885
|
+
* If the existing value is non-numeric, the operation is recorded as an
|
|
1886
|
+
* assertion failure instead of mutating the output.
*
* @param key Output key to increment.
* @param delta Amount added to the value stored under `key`.
|
|
1887
|
+
*/
|
|
1888
|
+
declare function incrementEvalOutput(key: string, delta: number): void;
|
|
1889
|
+
/**
|
|
1890
|
+
* Assert a condition for the current eval case and throw on failure.
|
|
1891
|
+
*
|
|
1892
|
+
* Calls made outside `runInEvalScope(...)` are ignored so shared workflow code
|
|
1893
|
+
* can safely reuse `evalAssert(...)` when it also runs outside an eval.
*
* @param condition Condition that must hold.
* @param message Message used when the assertion fails.
* @throws Presumably `EvalAssertionError` when `condition` is false inside an
* eval scope — confirm against the implementation.
|
|
1894
|
+
*/
|
|
1895
|
+
declare function evalAssert(condition: boolean, message: string): void;
|
|
1896
|
+
//#endregion
|
|
1897
|
+
//#region ../sdk/src/tracer.d.ts
|
|
1898
|
+
/**
|
|
1899
|
+
* Mutable handle for the current span.
|
|
1900
|
+
*
|
|
1901
|
+
* Prefer the ambient `evalSpan` export for most code so helpers deeper in the call
|
|
1902
|
+
* stack can annotate the active span without receiving an injected argument.
|
|
1903
|
+
*/
|
|
1904
|
+
type TraceActiveSpan = {
|
|
1905
|
+
/** Rename the active span after it has been created. */
setName(value: string): void;
|
|
1906
|
+
/** Set a single attribute on the active span. Later writes replace the same key. */
setAttribute(key: string, value: unknown): void;
|
|
1907
|
+
/** Merge multiple attributes into the active span. */
setAttributes(value: Record<string, unknown>): void;
|
|
1908
|
+
};
|
|
1909
|
+
/**
|
|
1910
|
+
* Ambient handle for the active span in the current async context.
|
|
1911
|
+
*
|
|
1912
|
+
* Calls are no-ops when executed outside of `evalTracer.span(...)`.
*
* Exposes the `TraceActiveSpan` methods: `setName`, `setAttribute`, and
* `setAttributes`.
|
|
1913
|
+
*/
|
|
1914
|
+
declare const evalSpan: TraceActiveSpan;
|
|
1915
|
+
/** Fields shared by cached and uncached span info: the span `kind`, its `name`, and optional `attributes`. */
type TraceSpanInfoBase = {
|
|
1916
|
+
kind: EvalTraceSpan['kind'];
|
|
1917
|
+
name: string;
|
|
1918
|
+
attributes?: Record<string, unknown>;
|
|
1919
|
+
};
|
|
1920
|
+
/**
* Info accepted by `evalTracer.span(info, fn)` when creating an uncached span.
*
* `cache` must be omitted or `undefined`, which is what separates this
* variant from `TraceSpanInfoCached`.
*/
|
|
1921
|
+
type TraceSpanInfoUncached = TraceSpanInfoBase & {
|
|
1922
|
+
cache?: undefined;
|
|
1923
|
+
};
|
|
1924
|
+
/**
|
|
1925
|
+
* Info accepted by `evalTracer.span(info, fn)` when opting in to caching.
|
|
1926
|
+
*
|
|
1927
|
+
* Cached spans return `Promise<unknown>` because the replayed value comes from
|
|
1928
|
+
* a JSON round-trip on cache hit. Narrow the value yourself when you need a
|
|
1929
|
+
* typed return.
*
* `cache` is required on this variant.
|
|
1930
|
+
*/
|
|
1931
|
+
type TraceSpanInfoCached = TraceSpanInfoBase & {
|
|
1932
|
+
cache: SpanCacheOptions;
|
|
1933
|
+
};
|
|
1934
|
+
/** Info accepted by `evalTracer.span(info, fn)`: either the uncached or the cached variant. */
|
|
1935
|
+
type TraceSpanInfo = TraceSpanInfoUncached | TraceSpanInfoCached;
|
|
1936
|
+
/**
* Overloads behind `evalTracer.span`.
*
* With uncached info the returned promise resolves to the callback's result
* type `T`; with cached info it resolves to `unknown`. In both cases the
* callback may take no arguments or receive the active span handle.
*/
declare function traceSpan<T>(info: TraceSpanInfoUncached, fn: () => Promise<T> | T): Promise<T>;
|
|
1937
|
+
declare function traceSpan<T>(info: TraceSpanInfoUncached, fn: (span: TraceActiveSpan) => Promise<T> | T): Promise<T>;
|
|
1938
|
+
declare function traceSpan(info: TraceSpanInfoCached, fn: () => unknown): Promise<unknown>;
|
|
1939
|
+
declare function traceSpan(info: TraceSpanInfoCached, fn: (span: TraceActiveSpan) => unknown): Promise<unknown>;
|
|
1940
|
+
/**
|
|
1941
|
+
* Trace builder used to create hierarchical spans and checkpoints during eval
|
|
1942
|
+
* execution.
|
|
1943
|
+
*/
|
|
1944
|
+
declare const evalTracer: {
|
|
1945
|
+
/** Run a callback inside a new trace span and record its lifecycle. */
span: typeof traceSpan;
|
|
1946
|
+
/** Record a named point-in-time value alongside the trace. */
checkpoint(name: string, data: unknown): void;
|
|
1947
|
+
};
|
|
1948
|
+
/**
* Build a queryable trace tree helper from a flat span list and checkpoints.
*
* @param spans Flat list of recorded spans.
* @param checkpoints Checkpoint values keyed by name.
* @returns An `EvalTraceTree` exposing the spans, checkpoints, and query helpers.
*/
|
|
1949
|
+
declare function buildTraceTree(spans: EvalTraceSpan[], checkpoints: Map<string, unknown>): EvalTraceTree;
|
|
1950
|
+
/**
* Hash the components of a cache key into a deterministic hex digest.
*
* @param input The `namespace`, `codeFingerprint`, and arbitrary `key` value to hash.
* @returns The digest as a string.
*/
|
|
1951
|
+
declare function hashCacheKey(input: {
|
|
1952
|
+
namespace: string;
|
|
1953
|
+
codeFingerprint: string;
|
|
1954
|
+
key: unknown;
|
|
1955
|
+
}): string;
|
|
1956
|
+
//#endregion
|
|
1957
|
+
//#region ../runner/src/cacheStore.d.ts
|
|
1958
|
+
/**
* Filter accepted by `FsCacheStore.clear` to narrow the set of entries removed.
*
* Both fields are optional. Also the parameter type of `EvalRunner.clearCache`.
*/
|
|
1959
|
+
type CacheClearFilter = {
|
|
1960
|
+
namespace?: string;
|
|
1961
|
+
key?: string;
|
|
1962
|
+
};
|
|
1963
|
+
//#endregion
|
|
1964
|
+
//#region ../runner/src/runner.d.ts
|
|
1965
|
+
/** Imperative runner interface used by the server and CLI. Instances are created by `createRunner`. */
|
|
1966
|
+
type EvalRunner = {
|
|
1967
|
+
/** Load workspace config, discover evals, and start file watching when enabled. */
init(): Promise<void>;
|
|
1968
|
+
/** Return the currently discovered eval summaries for the active workspace. */
getEvals(): EvalSummary[];
|
|
1969
|
+
/** Look up one discovered eval by id. */
getEval(id: string): EvalSummary | undefined;
|
|
1970
|
+
/** Re-scan configured eval files and emit a discovery update to listeners. */
refreshDiscovery(): Promise<void>;
|
|
1971
|
+
/** Start a run for `request`; the promise resolves with the run's manifest, summary, and case rows. */
startRun(request: CreateRunRequest): Promise<{
|
|
1972
|
+
manifest: RunManifest;
|
|
1973
|
+
summary: RunSummary;
|
|
1974
|
+
cases: CaseRow[];
|
|
1975
|
+
}>;
|
|
1976
|
+
/** Return run manifests tracked in memory, including persisted runs loaded during init. */
getRuns(): RunManifest[];
|
|
1977
|
+
/** Return one run with its summary and case rows when available in memory. */
getRun(id: string): {
|
|
1978
|
+
manifest: RunManifest;
|
|
1979
|
+
summary: RunSummary;
|
|
1980
|
+
cases: CaseRow[];
|
|
1981
|
+
} | undefined;
|
|
1982
|
+
/** Request cancellation for an in-flight run. */
cancelRun(id: string): void;
|
|
1983
|
+
/** Return full details for a single case in a run, when available. */
getCaseDetail(runId: string, caseId: string): CaseDetail | undefined;
|
|
1984
|
+
/** Subscribe to streamed events for a specific run. */
subscribe(runId: string, listener: (event: SseEnvelope) => void): () => void;
|
|
1985
|
+
/** Subscribe to discovery updates triggered by file changes or manual refresh. */
subscribeDiscovery(listener: (event: SseEnvelope) => void): () => void;
|
|
1986
|
+
/** Resolve the workspace root backing this runner instance. */
getWorkspaceRoot(): string;
|
|
1987
|
+
/** Resolve a persisted artifact path when artifact storage is supported. */
getArtifactPath(artifactId: string): string | undefined;
|
|
1988
|
+
/** Return summaries for every persisted cache entry in the workspace. */
listCache(): Promise<CacheListItem[]>;
|
|
1989
|
+
/**
|
|
1990
|
+
* Remove cache entries matching `filter`, or all entries when no filter is
|
|
1991
|
+
* supplied.
|
|
1992
|
+
*/
|
|
1993
|
+
clearCache(filter?: CacheClearFilter): Promise<void>;
|
|
1994
|
+
/** Recompute persisted case and run statuses for terminal runs touching one eval. */
recomputeStatusesForEval(evalId: string): Promise<{
|
|
1995
|
+
updatedRuns: number;
|
|
1996
|
+
}>;
|
|
1997
|
+
/** Delete terminal persisted runs that touch one eval from in-memory history and disk. */
cleanRunsForEval(evalId: string): Promise<{
|
|
1998
|
+
deletedRuns: number;
|
|
1999
|
+
}>;
|
|
2000
|
+
/** Persist a UI-authored manual score for one case and recompute affected summaries. */
updateManualScore(params: {
|
|
2001
|
+
runId: string;
|
|
2002
|
+
caseId: string;
|
|
2003
|
+
scoreKey: string;
|
|
2004
|
+
value: number | null;
|
|
2005
|
+
}): Promise<{
|
|
2006
|
+
updated: true;
|
|
2007
|
+
run: {
|
|
2008
|
+
manifest: RunManifest;
|
|
2009
|
+
summary: RunSummary;
|
|
2010
|
+
cases: CaseRow[];
|
|
2011
|
+
};
|
|
2012
|
+
caseDetail: CaseDetail;
|
|
2013
|
+
} | {
|
|
2014
|
+
updated: false;
|
|
2015
|
+
reason: string;
|
|
2016
|
+
}>;
|
|
2017
|
+
/**
|
|
2018
|
+
* Delete one persisted run from in-memory history and disk.
|
|
2019
|
+
*
|
|
2020
|
+
* Ignored for in-flight runs — cancel first, then delete.
|
|
2021
|
+
* Returns `deleted: false` when the run is missing or still running.
|
|
2022
|
+
*/
|
|
2023
|
+
deleteRun(runId: string): Promise<{
|
|
2024
|
+
deleted: boolean;
|
|
2025
|
+
}>;
|
|
2026
|
+
};
|
|
2027
|
+
/** Options accepted by `createRunner`. */
type CreateRunnerOptions = {
|
|
2028
|
+
/** Optional flag — presumably enables the file watching that `EvalRunner.init` starts "when enabled"; confirm. */
watchForChanges?: boolean;
|
|
2029
|
+
};
|
|
2030
|
+
/**
* Create an in-memory eval runner bound to the current workspace config.
*
* @param options Optional `CreateRunnerOptions`; the whole argument may be omitted.
* @returns An `EvalRunner` instance.
*/
|
|
2031
|
+
declare function createRunner({
|
|
2032
|
+
watchForChanges
|
|
2033
|
+
}?: CreateRunnerOptions): EvalRunner;
|
|
2034
|
+
//#endregion
|
|
2035
|
+
//#region src/cli.d.ts
|
|
2036
|
+
/**
|
|
2037
|
+
* Run the Agent Evals CLI against the current workspace.
|
|
2038
|
+
*
|
|
2039
|
+
* @param argv Raw command-line arguments excluding the executable name.
* @returns Promise with no value — presumably settles once the CLI invocation has finished; confirm.
|
|
2040
|
+
*/
|
|
2041
|
+
declare function runCli(argv: string[]): Promise<void>;
|
|
2042
|
+
//#endregion
|
|
2043
|
+
export { type AgentEvalsConfig, type AssertionFailure, type CacheAdapter, type CacheEntry, type CacheListItem, type CacheMode, type CacheRecording, type CacheRecordingFrame, type CacheRecordingOp, type CacheScopeContext, type CaseDetail, type CaseRow, type CellValue, type ColumnDef, type ColumnFormat, type ColumnKind, type CreateRunRequest, type DerivedStatus, EvalAssertionError, type EvalCase, type EvalCaseScope, type EvalChartAggregate, type EvalChartAxis, type EvalChartBuiltinMetric, type EvalChartColor, type EvalChartConfig, type EvalChartMetric, type EvalChartTooltipExtra, type EvalChartType, type EvalChartsConfig, type EvalColumnOverride, type EvalColumns, type EvalDefinition, type EvalDeriveContext, type EvalDisplayStatus, type EvalExecuteContext, type EvalFreshnessStatus, type EvalManualScoreDef, type EvalRunner, type EvalScoreContext, type EvalScoreDef, type EvalScoreFn, type EvalStatAggregate, type EvalStatItem, type EvalStatsConfig, type EvalSummary, type EvalTraceSpan, type EvalTraceTree, type FileRef, type JsonCell, type NumberDisplayOptions, type RepoFileRef, type RunArtifactRef, type RunInEvalScopeOptions, type RunManifest, type RunSummary, type ScalarCell, type ScopedCaseSummary, type ScoreTrace, type SerializedCacheSpan, type SpanCacheOptions, type SseEnvelope, type SseEventType, type TraceActiveSpan, type TraceAttributeDisplay, type TraceAttributeDisplayFormat, type TraceAttributeDisplayInput, type TraceAttributeDisplayPlacement, type TraceAttributeTransform, type TraceAttributeTransformContext, type TraceDisplayConfig, type TraceDisplayInputConfig, type TraceSpanInfo, type TraceSpanKind, type TrialSelectionMode, type UpdateManualScoreRequest, agentEvalsConfigSchema, assertionFailureSchema, buildTraceTree, cacheEntrySchema, cacheListItemSchema, cacheModeSchema, cacheRecordingOpSchema, cacheRecordingSchema, caseDetailSchema, caseRowSchema, cellValueSchema, columnDefSchema, columnFormatSchema, columnKindSchema, createRunRequestSchema, createRunner, 
defineEval, deriveScopedSummaryFromCases, deriveStatusFromCaseRows, deriveStatusFromChildStatuses, evalAssert, evalChartAggregateSchema, evalChartAxisSchema, evalChartBuiltinMetricSchema, evalChartColorSchema, evalChartConfigSchema, evalChartMetricSchema, evalChartTooltipExtraSchema, evalChartTypeSchema, evalChartsConfigSchema, evalFreshnessStatusSchema, evalSpan, evalStatAggregateSchema, evalStatItemSchema, evalStatsConfigSchema, evalSummarySchema, evalTracer, fileRefSchema, getCurrentScope, getEvalDisplayStatus, getEvalRegistry, getEvalTitle, hashCacheKey, incrementEvalOutput, isInEvalScope, jsonCellSchema, numberDisplayOptionsSchema, repoFile, repoFileRefSchema, runArtifactRefSchema, runCli, runInEvalScope, runManifestSchema, runSummarySchema, scoreTraceSchema, serializedCacheSpanSchema, setEvalOutput, setScopeCacheContext, spanCacheOptionsSchema, sseEnvelopeSchema, traceAttributeDisplayFormatSchema, traceAttributeDisplayInputSchema, traceAttributeDisplayPlacementSchema, traceAttributeDisplaySchema, traceDisplayConfigSchema, traceDisplayInputConfigSchema, traceSpanKindSchema, traceSpanSchema, trialSelectionModeSchema, updateManualScoreRequestSchema };
|