@agentgrader/core 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +756 -0
- package/dist/index.js +1114 -0
- package/package.json +33 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,756 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { AgrDb } from '@agentgrader/store';
|
|
3
|
+
|
|
4
|
+
declare const SuccessCriterionSchema: z.ZodUnion<[z.ZodObject<{
|
|
5
|
+
run: z.ZodString;
|
|
6
|
+
expect: z.ZodObject<{
|
|
7
|
+
exit_code: z.ZodDefault<z.ZodNumber>;
|
|
8
|
+
}, "strip", z.ZodTypeAny, {
|
|
9
|
+
exit_code: number;
|
|
10
|
+
}, {
|
|
11
|
+
exit_code?: number | undefined;
|
|
12
|
+
}>;
|
|
13
|
+
}, "strip", z.ZodTypeAny, {
|
|
14
|
+
run: string;
|
|
15
|
+
expect: {
|
|
16
|
+
exit_code: number;
|
|
17
|
+
};
|
|
18
|
+
}, {
|
|
19
|
+
run: string;
|
|
20
|
+
expect: {
|
|
21
|
+
exit_code?: number | undefined;
|
|
22
|
+
};
|
|
23
|
+
}>, z.ZodObject<{
|
|
24
|
+
assert: z.ZodString;
|
|
25
|
+
}, "strip", z.ZodTypeAny, {
|
|
26
|
+
assert: string;
|
|
27
|
+
}, {
|
|
28
|
+
assert: string;
|
|
29
|
+
}>]>;
|
|
30
|
+
/**
 * Parsed form of a single success criterion. Per `SuccessCriterionSchema` it
 * is either `{ run, expect: { exit_code } }` (with `exit_code` filled from a
 * schema default when omitted from the input) or `{ assert }`.
 */
type SuccessCriterion = z.infer<typeof SuccessCriterionSchema>;
|
|
31
|
+
declare const TestCaseSchema: z.ZodObject<{
|
|
32
|
+
id: z.ZodOptional<z.ZodString>;
|
|
33
|
+
name: z.ZodString;
|
|
34
|
+
description: z.ZodOptional<z.ZodString>;
|
|
35
|
+
fixture: z.ZodString;
|
|
36
|
+
prompt: z.ZodString;
|
|
37
|
+
success: z.ZodArray<z.ZodUnion<[z.ZodObject<{
|
|
38
|
+
run: z.ZodString;
|
|
39
|
+
expect: z.ZodObject<{
|
|
40
|
+
exit_code: z.ZodDefault<z.ZodNumber>;
|
|
41
|
+
}, "strip", z.ZodTypeAny, {
|
|
42
|
+
exit_code: number;
|
|
43
|
+
}, {
|
|
44
|
+
exit_code?: number | undefined;
|
|
45
|
+
}>;
|
|
46
|
+
}, "strip", z.ZodTypeAny, {
|
|
47
|
+
run: string;
|
|
48
|
+
expect: {
|
|
49
|
+
exit_code: number;
|
|
50
|
+
};
|
|
51
|
+
}, {
|
|
52
|
+
run: string;
|
|
53
|
+
expect: {
|
|
54
|
+
exit_code?: number | undefined;
|
|
55
|
+
};
|
|
56
|
+
}>, z.ZodObject<{
|
|
57
|
+
assert: z.ZodString;
|
|
58
|
+
}, "strip", z.ZodTypeAny, {
|
|
59
|
+
assert: string;
|
|
60
|
+
}, {
|
|
61
|
+
assert: string;
|
|
62
|
+
}>]>, "many">;
|
|
63
|
+
timeout_seconds: z.ZodDefault<z.ZodNumber>;
|
|
64
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
65
|
+
test_command: z.ZodOptional<z.ZodString>;
|
|
66
|
+
fail_to_pass: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
67
|
+
pass_to_pass: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
68
|
+
forbid_modified: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
69
|
+
expected_files: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
70
|
+
solution: z.ZodOptional<z.ZodString>;
|
|
71
|
+
test_patch: z.ZodOptional<z.ZodString>;
|
|
72
|
+
created_at: z.ZodOptional<z.ZodString>;
|
|
73
|
+
image: z.ZodOptional<z.ZodString>;
|
|
74
|
+
toolkits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
75
|
+
}, "strip", z.ZodTypeAny, {
|
|
76
|
+
name: string;
|
|
77
|
+
fixture: string;
|
|
78
|
+
prompt: string;
|
|
79
|
+
success: ({
|
|
80
|
+
run: string;
|
|
81
|
+
expect: {
|
|
82
|
+
exit_code: number;
|
|
83
|
+
};
|
|
84
|
+
} | {
|
|
85
|
+
assert: string;
|
|
86
|
+
})[];
|
|
87
|
+
timeout_seconds: number;
|
|
88
|
+
id?: string | undefined;
|
|
89
|
+
description?: string | undefined;
|
|
90
|
+
tags?: string[] | undefined;
|
|
91
|
+
test_command?: string | undefined;
|
|
92
|
+
fail_to_pass?: string[] | undefined;
|
|
93
|
+
pass_to_pass?: string[] | undefined;
|
|
94
|
+
forbid_modified?: string[] | undefined;
|
|
95
|
+
expected_files?: string[] | undefined;
|
|
96
|
+
solution?: string | undefined;
|
|
97
|
+
test_patch?: string | undefined;
|
|
98
|
+
created_at?: string | undefined;
|
|
99
|
+
image?: string | undefined;
|
|
100
|
+
toolkits?: string[] | undefined;
|
|
101
|
+
}, {
|
|
102
|
+
name: string;
|
|
103
|
+
fixture: string;
|
|
104
|
+
prompt: string;
|
|
105
|
+
success: ({
|
|
106
|
+
run: string;
|
|
107
|
+
expect: {
|
|
108
|
+
exit_code?: number | undefined;
|
|
109
|
+
};
|
|
110
|
+
} | {
|
|
111
|
+
assert: string;
|
|
112
|
+
})[];
|
|
113
|
+
id?: string | undefined;
|
|
114
|
+
description?: string | undefined;
|
|
115
|
+
timeout_seconds?: number | undefined;
|
|
116
|
+
tags?: string[] | undefined;
|
|
117
|
+
test_command?: string | undefined;
|
|
118
|
+
fail_to_pass?: string[] | undefined;
|
|
119
|
+
pass_to_pass?: string[] | undefined;
|
|
120
|
+
forbid_modified?: string[] | undefined;
|
|
121
|
+
expected_files?: string[] | undefined;
|
|
122
|
+
solution?: string | undefined;
|
|
123
|
+
test_patch?: string | undefined;
|
|
124
|
+
created_at?: string | undefined;
|
|
125
|
+
image?: string | undefined;
|
|
126
|
+
toolkits?: string[] | undefined;
|
|
127
|
+
}>;
|
|
128
|
+
/**
 * Parsed (output) shape of `TestCaseSchema`: `name`, `fixture`, `prompt` and
 * `success` are required, `timeout_seconds` is filled from a schema default
 * when omitted, and every other field is optional.
 */
type TestCase = z.infer<typeof TestCaseSchema>;
|
|
129
|
+
|
|
130
|
+
declare const AgentConfigSchema: z.ZodObject<{
|
|
131
|
+
id: z.ZodOptional<z.ZodString>;
|
|
132
|
+
name: z.ZodString;
|
|
133
|
+
model: z.ZodString;
|
|
134
|
+
max_steps: z.ZodDefault<z.ZodNumber>;
|
|
135
|
+
temperature: z.ZodOptional<z.ZodNumber>;
|
|
136
|
+
system_prompt: z.ZodOptional<z.ZodString>;
|
|
137
|
+
tools: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
138
|
+
toolkits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
139
|
+
mcp_servers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnion<[z.ZodObject<{
|
|
140
|
+
command: z.ZodString;
|
|
141
|
+
args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
142
|
+
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
143
|
+
}, "strip", z.ZodTypeAny, {
|
|
144
|
+
command: string;
|
|
145
|
+
args?: string[] | undefined;
|
|
146
|
+
env?: Record<string, string> | undefined;
|
|
147
|
+
}, {
|
|
148
|
+
command: string;
|
|
149
|
+
args?: string[] | undefined;
|
|
150
|
+
env?: Record<string, string> | undefined;
|
|
151
|
+
}>, z.ZodObject<{
|
|
152
|
+
type: z.ZodOptional<z.ZodEnum<["http", "sse"]>>;
|
|
153
|
+
url: z.ZodString;
|
|
154
|
+
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
155
|
+
}, "strip", z.ZodTypeAny, {
|
|
156
|
+
url: string;
|
|
157
|
+
type?: "http" | "sse" | undefined;
|
|
158
|
+
headers?: Record<string, string> | undefined;
|
|
159
|
+
}, {
|
|
160
|
+
url: string;
|
|
161
|
+
type?: "http" | "sse" | undefined;
|
|
162
|
+
headers?: Record<string, string> | undefined;
|
|
163
|
+
}>]>>>;
|
|
164
|
+
}, "strip", z.ZodTypeAny, {
|
|
165
|
+
name: string;
|
|
166
|
+
model: string;
|
|
167
|
+
max_steps: number;
|
|
168
|
+
id?: string | undefined;
|
|
169
|
+
toolkits?: string[] | undefined;
|
|
170
|
+
temperature?: number | undefined;
|
|
171
|
+
system_prompt?: string | undefined;
|
|
172
|
+
tools?: string[] | undefined;
|
|
173
|
+
mcp_servers?: Record<string, {
|
|
174
|
+
command: string;
|
|
175
|
+
args?: string[] | undefined;
|
|
176
|
+
env?: Record<string, string> | undefined;
|
|
177
|
+
} | {
|
|
178
|
+
url: string;
|
|
179
|
+
type?: "http" | "sse" | undefined;
|
|
180
|
+
headers?: Record<string, string> | undefined;
|
|
181
|
+
}> | undefined;
|
|
182
|
+
}, {
|
|
183
|
+
name: string;
|
|
184
|
+
model: string;
|
|
185
|
+
id?: string | undefined;
|
|
186
|
+
toolkits?: string[] | undefined;
|
|
187
|
+
max_steps?: number | undefined;
|
|
188
|
+
temperature?: number | undefined;
|
|
189
|
+
system_prompt?: string | undefined;
|
|
190
|
+
tools?: string[] | undefined;
|
|
191
|
+
mcp_servers?: Record<string, {
|
|
192
|
+
command: string;
|
|
193
|
+
args?: string[] | undefined;
|
|
194
|
+
env?: Record<string, string> | undefined;
|
|
195
|
+
} | {
|
|
196
|
+
url: string;
|
|
197
|
+
type?: "http" | "sse" | undefined;
|
|
198
|
+
headers?: Record<string, string> | undefined;
|
|
199
|
+
}> | undefined;
|
|
200
|
+
}>;
|
|
201
|
+
/**
 * Parsed (output) shape of `AgentConfigSchema`: `name` and `model` are
 * required, `max_steps` is filled from a schema default when omitted, and the
 * remaining fields are optional. Each `mcp_servers` entry is either a stdio
 * config (`command` plus optional `args`/`env`) or an http/sse config (`url`
 * plus optional `type`/`headers`).
 */
type AgentConfig = z.infer<typeof AgentConfigSchema>;
|
|
202
|
+
|
|
203
|
+
/**
|
|
204
|
+
* Frontmatter for a Claude Agent Skill (`SKILL.md`).
|
|
205
|
+
*
|
|
206
|
+
* This is intentionally a conservative subset of the published Agent Skills
|
|
207
|
+
* spec: `name` and `description` are the two fields that are solidly
|
|
208
|
+
* documented as always loaded into context for skill discovery (progressive
|
|
209
|
+
* disclosure - the rest of SKILL.md is only read on demand). `allowed-tools`,
|
|
210
|
+
* `disallowed-tools`, and `license` are reasonably well documented optional
|
|
211
|
+
* fields. Anything else is passed through unvalidated via `.passthrough()`
|
|
212
|
+
* so we don't reject SKILL.md files that use additional frontmatter we
|
|
213
|
+
* haven't verified.
|
|
214
|
+
*/
|
|
215
|
+
declare const SkillFrontmatterSchema: z.ZodObject<{
|
|
216
|
+
/** lowercase letters, numbers, hyphens; max 64 chars */
|
|
217
|
+
name: z.ZodString;
|
|
218
|
+
/** third-person description of what the skill does and when to use it; max 1024 chars */
|
|
219
|
+
description: z.ZodString;
|
|
220
|
+
"allowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
221
|
+
"disallowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
222
|
+
license: z.ZodOptional<z.ZodString>;
|
|
223
|
+
}, "passthrough", z.ZodTypeAny, z.objectOutputType<{
|
|
224
|
+
/** lowercase letters, numbers, hyphens; max 64 chars */
|
|
225
|
+
name: z.ZodString;
|
|
226
|
+
/** third-person description of what the skill does and when to use it; max 1024 chars */
|
|
227
|
+
description: z.ZodString;
|
|
228
|
+
"allowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
229
|
+
"disallowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
230
|
+
license: z.ZodOptional<z.ZodString>;
|
|
231
|
+
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
|
|
232
|
+
/** lowercase letters, numbers, hyphens; max 64 chars */
|
|
233
|
+
name: z.ZodString;
|
|
234
|
+
/** third-person description of what the skill does and when to use it; max 1024 chars */
|
|
235
|
+
description: z.ZodString;
|
|
236
|
+
"allowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
237
|
+
"disallowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
238
|
+
license: z.ZodOptional<z.ZodString>;
|
|
239
|
+
}, z.ZodTypeAny, "passthrough">>;
|
|
240
|
+
type SkillFrontmatter = z.infer<typeof SkillFrontmatterSchema>;
|
|
241
|
+
/** A discovered skill: parsed frontmatter, markdown body, and its location on disk. */
|
|
242
|
+
interface Skill {
|
|
243
|
+
frontmatter: SkillFrontmatter;
|
|
244
|
+
/** markdown body of SKILL.md, with the frontmatter block stripped */
|
|
245
|
+
body: string;
|
|
246
|
+
/** absolute path to the SKILL.md file */
|
|
247
|
+
path: string;
|
|
248
|
+
/** absolute path to the skill's directory (for resolving bundled scripts/resources) */
|
|
249
|
+
dir: string;
|
|
250
|
+
}
|
|
251
|
+
/**
|
|
252
|
+
* MCP server configuration, mirroring the `mcpServers` entries used by
|
|
253
|
+
* `.mcp.json` configs:
|
|
254
|
+
*
|
|
255
|
+
* - stdio servers are launched as a local subprocess and spoken to over
|
|
256
|
+
* stdin/stdout (`command` + optional `args`/`env`).
|
|
257
|
+
* - http/sse servers are remote endpoints reached over HTTP(S) (`url` +
|
|
258
|
+
* optional `headers`).
|
|
259
|
+
*/
|
|
260
|
+
declare const McpServerConfigSchema: z.ZodUnion<[z.ZodObject<{
|
|
261
|
+
command: z.ZodString;
|
|
262
|
+
args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
263
|
+
env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
264
|
+
}, "strip", z.ZodTypeAny, {
|
|
265
|
+
command: string;
|
|
266
|
+
args?: string[] | undefined;
|
|
267
|
+
env?: Record<string, string> | undefined;
|
|
268
|
+
}, {
|
|
269
|
+
command: string;
|
|
270
|
+
args?: string[] | undefined;
|
|
271
|
+
env?: Record<string, string> | undefined;
|
|
272
|
+
}>, z.ZodObject<{
|
|
273
|
+
type: z.ZodOptional<z.ZodEnum<["http", "sse"]>>;
|
|
274
|
+
url: z.ZodString;
|
|
275
|
+
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
276
|
+
}, "strip", z.ZodTypeAny, {
|
|
277
|
+
url: string;
|
|
278
|
+
type?: "http" | "sse" | undefined;
|
|
279
|
+
headers?: Record<string, string> | undefined;
|
|
280
|
+
}, {
|
|
281
|
+
url: string;
|
|
282
|
+
type?: "http" | "sse" | undefined;
|
|
283
|
+
headers?: Record<string, string> | undefined;
|
|
284
|
+
}>]>;
|
|
285
|
+
type McpServerConfig = z.infer<typeof McpServerConfigSchema>;
|
|
286
|
+
|
|
287
|
+
declare const RunSchema: z.ZodObject<{
|
|
288
|
+
id: z.ZodString;
|
|
289
|
+
testCaseId: z.ZodString;
|
|
290
|
+
agentConfigId: z.ZodString;
|
|
291
|
+
sandboxProvider: z.ZodString;
|
|
292
|
+
status: z.ZodEnum<["running", "completed", "failed"]>;
|
|
293
|
+
passed: z.ZodOptional<z.ZodBoolean>;
|
|
294
|
+
score: z.ZodOptional<z.ZodNumber>;
|
|
295
|
+
stepsCount: z.ZodDefault<z.ZodNumber>;
|
|
296
|
+
tokensIn: z.ZodDefault<z.ZodNumber>;
|
|
297
|
+
tokensOut: z.ZodDefault<z.ZodNumber>;
|
|
298
|
+
costUsd: z.ZodDefault<z.ZodNumber>;
|
|
299
|
+
durationMs: z.ZodDefault<z.ZodNumber>;
|
|
300
|
+
error: z.ZodOptional<z.ZodString>;
|
|
301
|
+
finalDiff: z.ZodOptional<z.ZodString>;
|
|
302
|
+
metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
|
|
303
|
+
createdAt: z.ZodNumber;
|
|
304
|
+
completedAt: z.ZodOptional<z.ZodNumber>;
|
|
305
|
+
}, "strip", z.ZodTypeAny, {
|
|
306
|
+
status: "running" | "completed" | "failed";
|
|
307
|
+
id: string;
|
|
308
|
+
testCaseId: string;
|
|
309
|
+
agentConfigId: string;
|
|
310
|
+
sandboxProvider: string;
|
|
311
|
+
stepsCount: number;
|
|
312
|
+
tokensIn: number;
|
|
313
|
+
tokensOut: number;
|
|
314
|
+
costUsd: number;
|
|
315
|
+
durationMs: number;
|
|
316
|
+
createdAt: number;
|
|
317
|
+
passed?: boolean | undefined;
|
|
318
|
+
score?: number | undefined;
|
|
319
|
+
error?: string | undefined;
|
|
320
|
+
finalDiff?: string | undefined;
|
|
321
|
+
metrics?: Record<string, any> | undefined;
|
|
322
|
+
completedAt?: number | undefined;
|
|
323
|
+
}, {
|
|
324
|
+
status: "running" | "completed" | "failed";
|
|
325
|
+
id: string;
|
|
326
|
+
testCaseId: string;
|
|
327
|
+
agentConfigId: string;
|
|
328
|
+
sandboxProvider: string;
|
|
329
|
+
createdAt: number;
|
|
330
|
+
passed?: boolean | undefined;
|
|
331
|
+
score?: number | undefined;
|
|
332
|
+
stepsCount?: number | undefined;
|
|
333
|
+
tokensIn?: number | undefined;
|
|
334
|
+
tokensOut?: number | undefined;
|
|
335
|
+
costUsd?: number | undefined;
|
|
336
|
+
durationMs?: number | undefined;
|
|
337
|
+
error?: string | undefined;
|
|
338
|
+
finalDiff?: string | undefined;
|
|
339
|
+
metrics?: Record<string, any> | undefined;
|
|
340
|
+
completedAt?: number | undefined;
|
|
341
|
+
}>;
|
|
342
|
+
/**
 * Parsed (output) shape of `RunSchema`. `status` is one of "running",
 * "completed" or "failed"; the counters `stepsCount`, `tokensIn`, `tokensOut`,
 * `costUsd` and `durationMs` are filled from schema defaults when omitted.
 * NOTE(review): `createdAt` / `completedAt` are plain numbers - presumably
 * epoch milliseconds; confirm against the code that writes them.
 */
type Run = z.infer<typeof RunSchema>;
|
|
343
|
+
|
|
344
|
+
declare const StepEventSchema: z.ZodObject<{
|
|
345
|
+
index: z.ZodNumber;
|
|
346
|
+
kind: z.ZodEnum<["tool_call", "tool_result", "message", "thinking"]>;
|
|
347
|
+
tool: z.ZodOptional<z.ZodString>;
|
|
348
|
+
tokensIn: z.ZodDefault<z.ZodNumber>;
|
|
349
|
+
tokensOut: z.ZodDefault<z.ZodNumber>;
|
|
350
|
+
costUsd: z.ZodDefault<z.ZodNumber>;
|
|
351
|
+
timestamp: z.ZodNumber;
|
|
352
|
+
content: z.ZodOptional<z.ZodString>;
|
|
353
|
+
}, "strip", z.ZodTypeAny, {
|
|
354
|
+
tokensIn: number;
|
|
355
|
+
tokensOut: number;
|
|
356
|
+
costUsd: number;
|
|
357
|
+
index: number;
|
|
358
|
+
kind: "message" | "tool_call" | "tool_result" | "thinking";
|
|
359
|
+
timestamp: number;
|
|
360
|
+
tool?: string | undefined;
|
|
361
|
+
content?: string | undefined;
|
|
362
|
+
}, {
|
|
363
|
+
index: number;
|
|
364
|
+
kind: "message" | "tool_call" | "tool_result" | "thinking";
|
|
365
|
+
timestamp: number;
|
|
366
|
+
tokensIn?: number | undefined;
|
|
367
|
+
tokensOut?: number | undefined;
|
|
368
|
+
costUsd?: number | undefined;
|
|
369
|
+
tool?: string | undefined;
|
|
370
|
+
content?: string | undefined;
|
|
371
|
+
}>;
|
|
372
|
+
/**
 * Parsed (output) shape of `StepEventSchema`: one entry of a run's trace.
 * `kind` is one of "tool_call", "tool_result", "message" or "thinking";
 * `tokensIn`, `tokensOut` and `costUsd` are filled from schema defaults when
 * omitted; `tool` and `content` are optional.
 */
type StepEvent = z.infer<typeof StepEventSchema>;
|
|
373
|
+
declare const TraceSchema: z.ZodObject<{
|
|
374
|
+
runId: z.ZodString;
|
|
375
|
+
steps: z.ZodArray<z.ZodObject<{
|
|
376
|
+
index: z.ZodNumber;
|
|
377
|
+
kind: z.ZodEnum<["tool_call", "tool_result", "message", "thinking"]>;
|
|
378
|
+
tool: z.ZodOptional<z.ZodString>;
|
|
379
|
+
tokensIn: z.ZodDefault<z.ZodNumber>;
|
|
380
|
+
tokensOut: z.ZodDefault<z.ZodNumber>;
|
|
381
|
+
costUsd: z.ZodDefault<z.ZodNumber>;
|
|
382
|
+
timestamp: z.ZodNumber;
|
|
383
|
+
content: z.ZodOptional<z.ZodString>;
|
|
384
|
+
}, "strip", z.ZodTypeAny, {
|
|
385
|
+
tokensIn: number;
|
|
386
|
+
tokensOut: number;
|
|
387
|
+
costUsd: number;
|
|
388
|
+
index: number;
|
|
389
|
+
kind: "message" | "tool_call" | "tool_result" | "thinking";
|
|
390
|
+
timestamp: number;
|
|
391
|
+
tool?: string | undefined;
|
|
392
|
+
content?: string | undefined;
|
|
393
|
+
}, {
|
|
394
|
+
index: number;
|
|
395
|
+
kind: "message" | "tool_call" | "tool_result" | "thinking";
|
|
396
|
+
timestamp: number;
|
|
397
|
+
tokensIn?: number | undefined;
|
|
398
|
+
tokensOut?: number | undefined;
|
|
399
|
+
costUsd?: number | undefined;
|
|
400
|
+
tool?: string | undefined;
|
|
401
|
+
content?: string | undefined;
|
|
402
|
+
}>, "many">;
|
|
403
|
+
}, "strip", z.ZodTypeAny, {
|
|
404
|
+
runId: string;
|
|
405
|
+
steps: {
|
|
406
|
+
tokensIn: number;
|
|
407
|
+
tokensOut: number;
|
|
408
|
+
costUsd: number;
|
|
409
|
+
index: number;
|
|
410
|
+
kind: "message" | "tool_call" | "tool_result" | "thinking";
|
|
411
|
+
timestamp: number;
|
|
412
|
+
tool?: string | undefined;
|
|
413
|
+
content?: string | undefined;
|
|
414
|
+
}[];
|
|
415
|
+
}, {
|
|
416
|
+
runId: string;
|
|
417
|
+
steps: {
|
|
418
|
+
index: number;
|
|
419
|
+
kind: "message" | "tool_call" | "tool_result" | "thinking";
|
|
420
|
+
timestamp: number;
|
|
421
|
+
tokensIn?: number | undefined;
|
|
422
|
+
tokensOut?: number | undefined;
|
|
423
|
+
costUsd?: number | undefined;
|
|
424
|
+
tool?: string | undefined;
|
|
425
|
+
content?: string | undefined;
|
|
426
|
+
}[];
|
|
427
|
+
}>;
|
|
428
|
+
/**
 * Parsed (output) shape of `TraceSchema`: a `runId` together with the list of
 * step events (`steps`) recorded for that run.
 */
type Trace = z.infer<typeof TraceSchema>;
|
|
429
|
+
|
|
430
|
+
/**
 * Outcome of a patch application, as resolved by `SandboxHandle.applyPatch`.
 */
interface PatchApplyResult {
|
|
431
|
+
/** true if the patch was applied successfully (with or without repair) */
|
|
432
|
+
applied: boolean;
|
|
433
|
+
/** true if a fallback/repair strategy (3-way merge or `patch --fuzz`) was needed */
|
|
434
|
+
repaired: boolean;
|
|
435
|
+
/** combined output/diagnostics from the apply attempt(s) */
|
|
436
|
+
output: string;
|
|
437
|
+
}
|
|
438
|
+
/**
 * Handle to one sandbox instance, as resolved by `SandboxProvider.create`.
 * Exposes command execution, file access, diffing and patch application.
 */
interface SandboxHandle {
|
|
439
|
+
/**
 * Runs `cmd` in the sandbox and resolves with its captured output streams
 * and exit code.
 * NOTE(review): presumably `cmd` is interpreted by a shell and a non-zero
 * exit resolves rather than rejects - confirm against the provider.
 */
exec(cmd: string): Promise<{
|
|
440
|
+
stdout: string;
|
|
441
|
+
stderr: string;
|
|
442
|
+
exitCode: number;
|
|
443
|
+
}>;
|
|
444
|
+
/**
 * Writes `content` to `path`.
 * NOTE(review): how `path` is resolved (absolute vs. relative to the working
 * tree) is not visible in this declaration - confirm.
 */
writeFile(path: string, content: string): Promise<void>;
|
|
445
|
+
/** Resolves with the contents of `path` as a string. */
readFile(path: string): Promise<string>;
|
|
446
|
+
/**
 * Resolves with a diff as text.
 * NOTE(review): presumably the `git diff` of the working tree against the
 * initial snapshot - confirm.
 */
gitDiff(): Promise<string>;
|
|
447
|
+
/**
|
|
448
|
+
* Applies a unified diff to the sandbox's working tree.
|
|
449
|
+
*
|
|
450
|
+
* Mirrors SWE-bench's patch-application robustness: tries `git apply`
|
|
451
|
+
* first, then falls back to `git apply --3way`, then `patch --fuzz=3`.
|
|
452
|
+
* Implementations should report whether a fallback ("repair") was needed.
|
|
453
|
+
*/
|
|
454
|
+
applyPatch(diff: string): Promise<PatchApplyResult>;
|
|
455
|
+
/**
 * Tears the sandbox down.
 * NOTE(review): presumably the handle must not be used afterwards - confirm.
 */
destroy(): Promise<void>;
|
|
456
|
+
}
|
|
457
|
+
/**
 * Factory for sandboxes: `create` resolves with a `SandboxHandle` for a newly
 * prepared sandbox.
 */
interface SandboxProvider {
|
|
458
|
+
/** Identifier of the provider. */
readonly name: string;
|
|
459
|
+
create(opts: {
|
|
460
|
+
/** NOTE(review): presumably a container image reference - confirm. */
image?: string;
|
|
461
|
+
/**
 * NOTE(review): presumably a path or ref identifying the fixture snapshot
 * to load into the sandbox - confirm against the provider implementation.
 */
gitSnapshot?: string;
|
|
462
|
+
/**
|
|
463
|
+
* Absolute paths to local "toolkit" directories to inject into the
|
|
464
|
+
* sandbox, in addition to `gitSnapshot`. A toolkit may contain a `bin/`
|
|
465
|
+
* directory (custom CLI tools, made executable and put on `PATH`) and a
|
|
466
|
+
* `.claude/skills/` directory (Agent Skills documentation, see
|
|
467
|
+
* `runner/skills.ts`).
|
|
468
|
+
*/
|
|
469
|
+
toolkits?: string[];
|
|
470
|
+
}): Promise<SandboxHandle>;
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
/** What `AgentAdapter.solve` resolves with once the agent stops. */
interface AgentResult {
|
|
474
|
+
/**
 * NOTE(review): presumably true when the agent ended on its own rather than
 * being cut off (e.g. by a step limit) - confirm with the adapters.
 */
finished: boolean;
|
|
475
|
+
/**
 * NOTE(review): presumably the sandbox's diff at the end of the run (see
 * `SandboxHandle.gitDiff`) - confirm.
 */
finalDiff: string;
|
|
476
|
+
}
|
|
477
|
+
/**
 * Pluggable agent implementation. The framework calls `solve` and observes
 * the run through the step events the adapter emits.
 */
interface AgentAdapter {
|
|
478
|
+
/** Identifier of the adapter. */
readonly name: string;
|
|
479
|
+
/**
|
|
480
|
+
* Run the agent against a problem in a prepared sandbox.
|
|
481
|
+
* The adapter doesn't know about scoring — it just solves and emits
|
|
482
|
+
* step events. The framework measures everything from the outside.
|
|
483
|
+
*/
|
|
484
|
+
solve(input: {
|
|
485
|
+
prompt: string;
|
|
486
|
+
sandbox: SandboxHandle;
|
|
487
|
+
config: AgentConfig;
|
|
488
|
+
/** Callback through which the adapter emits each step event. */
onStep: (step: StepEvent) => void;
|
|
489
|
+
}): Promise<AgentResult>;
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
/** Verdict resolved by a `Scorer`. */
interface ScorerResult {
|
|
493
|
+
/** Whether the scorer considers the run to have passed. */
passed: boolean;
|
|
494
|
+
/** NOTE(review): presumably a human-readable explanation of the verdict - confirm. */
detail: string;
|
|
495
|
+
/** Optional numeric score. NOTE(review): range/scale is not visible here - confirm. */
score?: number;
|
|
496
|
+
}
|
|
497
|
+
/**
 * Common contract for scorers: given the test case, the agent's result, the
 * recorded trace and the sandbox, `score` resolves with a `ScorerResult`.
 * The concrete scorer classes in this file declare narrower `input` shapes
 * listing only the fields they take.
 */
interface Scorer {
|
|
498
|
+
/** Identifier of the scorer. */
readonly name: string;
|
|
499
|
+
score(input: {
|
|
500
|
+
testCase: TestCase;
|
|
501
|
+
result: AgentResult;
|
|
502
|
+
trace: Trace;
|
|
503
|
+
sandbox: SandboxHandle;
|
|
504
|
+
}): Promise<ScorerResult>;
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
/**
|
|
508
|
+
* Outcome of a single test. Parsers turn raw test-runner output into a per-test
|
|
509
|
+
* status map of these values, so scorers can compare individual test outcomes
|
|
510
|
+
* against FAIL_TO_PASS / PASS_TO_PASS lists (SWE-bench style).
|
|
511
|
+
*/
|
|
512
|
+
type TestStatus = "PASS" | "FAIL" | "SKIP";
|
|
513
|
+
/** Maps a test's display name to its outcome. */
|
|
514
|
+
type TestStatusMap = Record<string, TestStatus>;
|
|
515
|
+
/** Parses raw test-runner output into a per-test status map. */
interface TestResultParser {
|
|
516
|
+
/** Identifier of the parser (for example, `TapTestResultParser` declares "tap"). */
readonly name: string;
|
|
517
|
+
/**
 * @param output raw text produced by the test runner
 * @returns map from each test's display name to its status
 */
parse(output: string): TestStatusMap;
|
|
518
|
+
}
|
|
519
|
+
/**
|
|
520
|
+
* Parses TAP (Test Anything Protocol) output, as produced by
|
|
521
|
+
* `node --test --test-reporter=tap` (or `tsx --test --test-reporter=tap`).
|
|
522
|
+
*
|
|
523
|
+
* Handles lines of the form:
|
|
524
|
+
* ok 1 - should succeed on first attempt
|
|
525
|
+
* not ok 2 - should retry on failure and succeed
|
|
526
|
+
* ok 3 - should be skipped # SKIP
|
|
527
|
+
*/
|
|
528
|
+
declare class TapTestResultParser implements TestResultParser {
|
|
529
|
+
readonly name = "tap";
|
|
530
|
+
parse(output: string): TestStatusMap;
|
|
531
|
+
}
|
|
532
|
+
|
|
533
|
+
/**
 * Scorer that takes only the test case and a live sandbox.
 * NOTE(review): the implementation is not visible in this declaration file;
 * presumably it executes the `run` entries of `testCase.success` in the
 * sandbox and compares exit codes with `expect.exit_code` - confirm.
 */
declare class CommandScorer implements Scorer {
|
|
534
|
+
readonly name = "CommandScorer";
|
|
535
|
+
score(input: {
|
|
536
|
+
testCase: TestCase;
|
|
537
|
+
sandbox: SandboxHandle;
|
|
538
|
+
}): Promise<ScorerResult>;
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
/**
 * Scorer that takes only the test case and the recorded trace (no sandbox).
 * NOTE(review): the implementation is not visible in this declaration file;
 * presumably it evaluates the `assert` entries of `testCase.success` against
 * the trace - confirm.
 */
declare class AssertionScorer implements Scorer {
|
|
542
|
+
readonly name = "AssertionScorer";
|
|
543
|
+
score(input: {
|
|
544
|
+
testCase: TestCase;
|
|
545
|
+
trace: Trace;
|
|
546
|
+
}): Promise<ScorerResult>;
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
/**
|
|
550
|
+
* SWE-bench style regression scorer.
|
|
551
|
+
*
|
|
552
|
+
* - FAIL_TO_PASS: tests that were failing before the agent's patch and MUST
|
|
553
|
+
* pass afterwards. This is the actual "did the agent fix the bug" signal.
|
|
554
|
+
* - PASS_TO_PASS: tests that were passing before the agent's patch and MUST
|
|
555
|
+
* keep passing afterwards (no regressions introduced).
|
|
556
|
+
* - forbid_modified: acts as a tamper guard - if the agent edited test files
|
|
557
|
+
* (or other forbidden paths) to make the suite pass trivially, fail hard.
|
|
558
|
+
*
|
|
559
|
+
* If `test_command` / fail_to_pass / pass_to_pass are not configured on the
|
|
560
|
+
* test case, this scorer is a no-op pass (keeps it backwards compatible).
|
|
561
|
+
*/
|
|
562
|
+
declare class RegressionScorer implements Scorer {
|
|
563
|
+
readonly name = "RegressionScorer";
|
|
564
|
+
score(input: {
|
|
565
|
+
testCase: TestCase;
|
|
566
|
+
sandbox: SandboxHandle;
|
|
567
|
+
/** pre-patch test status map, used to avoid penalizing already-broken PASS_TO_PASS tests */
|
|
568
|
+
baseline?: TestStatusMap;
|
|
569
|
+
}): Promise<ScorerResult>;
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
/** Basic statistics about a unified diff, as returned by `parseDiffStats`. */
interface DiffStats {
|
|
573
|
+
/** NOTE(review): presumably the paths of the files touched by the diff - confirm. */
filesChanged: string[];
|
|
574
|
+
insertions: number;
|
|
575
|
+
deletions: number;
|
|
576
|
+
/** NOTE(review): presumably `insertions + deletions` - confirm against the implementation. */
linesChanged: number;
|
|
577
|
+
}
|
|
578
|
+
/** Parses a unified diff (as produced by `git diff`) into basic stats. */
|
|
579
|
+
declare function parseDiffStats(diff: string): DiffStats;
|
|
580
|
+
/**
|
|
581
|
+
* Scores the "scope" of an agent's patch.
|
|
582
|
+
*
|
|
583
|
+
* SWE-bench analyses showed agents frequently produce patches that are far
|
|
584
|
+
* larger / touch far more files than the gold patch - editing unrelated
|
|
585
|
+
* code, leaving debug statements, etc. This scorer reports diff stats and,
|
|
586
|
+
* when a gold `solution` patch is available (loaded as raw diff content by
|
|
587
|
+
* the CLI's `loadTestCase` helper), penalizes patches that are much larger
|
|
588
|
+
* than the gold patch in terms of total changed lines.
|
|
589
|
+
*/
|
|
590
|
+
declare class DiffScorer implements Scorer {
|
|
591
|
+
readonly name = "DiffScorer";
|
|
592
|
+
score(input: {
|
|
593
|
+
testCase: TestCase;
|
|
594
|
+
result: AgentResult;
|
|
595
|
+
}): Promise<ScorerResult>;
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
/**
|
|
599
|
+
* Measures whether the agent edited the "right" files, using
|
|
600
|
+
* `expected_files` glob patterns (typically derived from the gold patch).
|
|
601
|
+
*
|
|
602
|
+
* - precision: fraction of files the agent touched that match an expected pattern
|
|
603
|
+
* - recall: fraction of expected patterns that were matched by at least one touched file
|
|
604
|
+
* - f1: harmonic mean of precision and recall
|
|
605
|
+
*/
|
|
606
|
+
declare class LocalizationScorer implements Scorer {
|
|
607
|
+
readonly name = "LocalizationScorer";
|
|
608
|
+
score(input: {
|
|
609
|
+
testCase: TestCase;
|
|
610
|
+
result: AgentResult;
|
|
611
|
+
}): Promise<ScorerResult>;
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
/** Arguments of `runSingle`: one test case paired with one agent configuration. */
interface RunSingleInput {
|
|
615
|
+
testCase: TestCase;
|
|
616
|
+
agentConfig: AgentConfig;
|
|
617
|
+
adapter: AgentAdapter;
|
|
618
|
+
sandboxProvider: SandboxProvider;
|
|
619
|
+
/**
 * Optional store handle from `@agentgrader/store`.
 * NOTE(review): presumably nothing is persisted when omitted - confirm.
 */
db?: AgrDb;
|
|
620
|
+
/** Caller-supplied identifier; `RunSingleResult` carries a `runId` as well. */
runId: string;
|
|
621
|
+
}
|
|
622
|
+
/**
 * Summary of one run, as resolved by `runSingle` and collected in
 * `BenchmarkResult.runs`.
 * NOTE(review): the counters below share names with the `RunSchema` fields;
 * presumably they are totals accumulated over the run's step events - confirm.
 */
interface RunSingleResult {
|
|
623
|
+
runId: string;
|
|
624
|
+
passed: boolean;
|
|
625
|
+
score?: number;
|
|
626
|
+
stepsCount: number;
|
|
627
|
+
tokensIn: number;
|
|
628
|
+
tokensOut: number;
|
|
629
|
+
costUsd: number;
|
|
630
|
+
durationMs: number;
|
|
631
|
+
/** NOTE(review): presumably set only when the run failed with an error - confirm. */
error?: string;
|
|
632
|
+
finalDiff?: string;
|
|
633
|
+
/** Free-form extra measurements; the value types are not constrained by this declaration. */
metrics?: Record<string, any>;
|
|
634
|
+
}
|
|
635
|
+
/**
 * Executes a single test case with a single agent configuration and resolves
 * with the run's summary.
 * NOTE(review): which scorers are applied and whether failures reject or are
 * reported through `RunSingleResult.error` is not visible in this declaration.
 */
declare function runSingle(input: RunSingleInput): Promise<RunSingleResult>;
|
|
636
|
+
|
|
637
|
+
/** Arguments of `runBenchmark`: sets of test cases and agent configurations to run. */
interface BenchmarkInput {
|
|
638
|
+
testCases: TestCase[];
|
|
639
|
+
agentConfigs: AgentConfig[];
|
|
640
|
+
adapter: AgentAdapter;
|
|
641
|
+
sandboxProvider: SandboxProvider;
|
|
642
|
+
/**
 * Optional store handle from `@agentgrader/store`.
 * NOTE(review): presumably nothing is persisted when omitted - confirm.
 */
db?: AgrDb;
|
|
643
|
+
/**
 * NOTE(review): presumably the maximum number of runs executed in parallel;
 * the default used when omitted is not visible here - confirm.
 */
concurrency?: number;
|
|
644
|
+
/**
 * Optional progress callback. It receives a run summary extended with the
 * ids of the test case and agent config and the run's current status.
 * NOTE(review): when exactly it fires (start, completion, both) is not
 * visible in this declaration - confirm.
 */
onRunUpdate?: (run: RunSingleResult & {
|
|
645
|
+
testCaseId: string;
|
|
646
|
+
agentConfigId: string;
|
|
647
|
+
status: "running" | "completed" | "failed";
|
|
648
|
+
}) => void;
|
|
649
|
+
}
|
|
650
|
+
/** What `runBenchmark` resolves with. */
interface BenchmarkResult {
|
|
651
|
+
/** One summary per executed run. */
runs: RunSingleResult[];
|
|
652
|
+
}
|
|
653
|
+
/**
 * Runs a benchmark over the given test cases and agent configurations and
 * resolves with the collected run summaries.
 * NOTE(review): presumably every test case is paired with every agent config
 * (a full cross product) - confirm against the implementation.
 */
declare function runBenchmark(input: BenchmarkInput): Promise<BenchmarkResult>;
|
|
654
|
+
|
|
655
|
+
/**
|
|
656
|
+
* Minimal glob matcher (no external dependency) supporting:
|
|
657
|
+
* - `*` matches any sequence of characters except `/`
|
|
658
|
+
* - `**` matches any sequence of characters, including `/`
|
|
659
|
+
* - `?` matches a single character except `/`
|
|
660
|
+
*
|
|
661
|
+
* Used for `forbid_modified` / `expected_files` patterns in test cases.
|
|
662
|
+
*/
|
|
663
|
+
declare function matchGlob(pattern: string, filePath: string): boolean;
|
|
664
|
+
/**
 * Multi-pattern companion of `matchGlob`.
 * NOTE(review): presumably returns true when `filePath` matches at least one
 * of `patterns` (and false for an empty list) - confirm.
 */
declare function matchAnyGlob(patterns: string[], filePath: string): boolean;
|
|
665
|
+
|
|
666
|
+
/**
|
|
667
|
+
* Computes a stable SHA-256 hash over the contents (and relative paths) of a
|
|
668
|
+
* fixture directory. Used to key cached baseline test results - if the
|
|
669
|
+
* fixture changes, the baseline must be recomputed.
|
|
670
|
+
*/
|
|
671
|
+
declare function hashFixture(fixtureDir: string): string;
|
|
672
|
+
|
|
673
|
+
/** Pre-patch test outcomes for a fixture, as resolved by `getOrComputeBaseline`. */
interface BaselineResult {
|
|
674
|
+
/** NOTE(review): presumably the `hashFixture` digest the baseline is keyed by - confirm. */
fixtureHash: string;
|
|
675
|
+
/** Per-test status map recorded for the fixture. */
statusMap: TestStatusMap;
|
|
676
|
+
/** true if this baseline came from the cache rather than a fresh run */
|
|
677
|
+
cached: boolean;
|
|
678
|
+
}
|
|
679
|
+
/**
|
|
680
|
+
* Computes (or loads from cache) the pre-patch test status map for a test
|
|
681
|
+
* case's fixture. Used by RegressionScorer so PASS_TO_PASS tests that were
|
|
682
|
+
* already broken before the agent touched anything don't unfairly penalize
|
|
683
|
+
* the run.
|
|
684
|
+
*
|
|
685
|
+
* Returns `undefined` if the test case has no `test_command` configured -
|
|
686
|
+
* baseline computation is then skipped entirely.
|
|
687
|
+
*/
|
|
688
|
+
declare function getOrComputeBaseline(input: {
|
|
689
|
+
testCase: TestCase;
|
|
690
|
+
sandboxProvider: SandboxProvider;
|
|
691
|
+
db?: AgrDb;
|
|
692
|
+
}): Promise<BaselineResult | undefined>;
|
|
693
|
+
|
|
694
|
+
/** One entry of a `ValidationReport`. */
interface ValidationCheck {
|
|
695
|
+
/** Label of the check. */
name: string;
|
|
696
|
+
/** Whether this check succeeded. */
passed: boolean;
|
|
697
|
+
/** NOTE(review): presumably a human-readable explanation of the outcome - confirm. */
detail: string;
|
|
698
|
+
}
|
|
699
|
+
/** What `validateTestCase` resolves with. */
interface ValidationReport {
|
|
700
|
+
/** NOTE(review): presumably true only when every entry in `checks` passed - confirm. */
ok: boolean;
|
|
701
|
+
/** The individual checks that were evaluated. */
checks: ValidationCheck[];
|
|
702
|
+
}
|
|
703
|
+
/**
|
|
704
|
+
* Validates a test case the way SWE-bench validates a candidate task
|
|
705
|
+
* instance before it's added to the benchmark:
|
|
706
|
+
*
|
|
707
|
+
* 1. static check - required fields are present and internally consistent.
|
|
708
|
+
* 2. pre-patch run - FAIL_TO_PASS tests must currently be FAILING and
|
|
709
|
+
* PASS_TO_PASS tests must currently be PASSING (on the raw fixture, with
|
|
710
|
+
* `test_patch` applied if present).
|
|
711
|
+
* 3. post-patch run - if a gold `solution` patch is provided, apply it and
|
|
712
|
+
* verify FAIL_TO_PASS tests now PASS and PASS_TO_PASS tests still PASS.
|
|
713
|
+
*
|
|
714
|
+
* This catches the most common authoring mistakes: typo'd test names,
|
|
715
|
+
* tests that pass/fail for the wrong reason, gold patches that don't
|
|
716
|
+
* actually fix the issue, and forbidden-file globs that never match.
|
|
717
|
+
*/
|
|
718
|
+
declare function validateTestCase(input: {
|
|
719
|
+
testCase: TestCase;
|
|
720
|
+
sandboxProvider: SandboxProvider;
|
|
721
|
+
}): Promise<ValidationReport>;
|
|
722
|
+
|
|
723
|
+
/**
|
|
724
|
+
* Parses a SKILL.md file's content into validated frontmatter + body.
|
|
725
|
+
*
|
|
726
|
+
* Throws if the file has no `---`-delimited YAML frontmatter block, or if
|
|
727
|
+
* the frontmatter doesn't satisfy `SkillFrontmatterSchema` (e.g. missing
|
|
728
|
+
* `name`/`description`).
|
|
729
|
+
*/
|
|
730
|
+
declare function parseSkillMarkdown(content: string, path: string, dir: string): Skill;
|
|
731
|
+
/**
|
|
732
|
+
* Discovers all skills bundled in a toolkit directory, i.e. every
|
|
733
|
+
* `<toolkitDir>/.claude/skills/<skill-name>/SKILL.md`.
|
|
734
|
+
*
|
|
735
|
+
* Returns an empty array if the toolkit has no `.claude/skills` directory.
|
|
736
|
+
*/
|
|
737
|
+
declare function discoverSkills(toolkitDir: string): Skill[];
|
|
738
|
+
/** Discovers skills across multiple toolkit directories, in order. */
|
|
739
|
+
declare function discoverSkillsForToolkits(toolkitDirs: string[]): Skill[];
|
|
740
|
+
/**
|
|
741
|
+
* Builds a system-prompt addendum that tells the agent which skills are
|
|
742
|
+
* available, mirroring the "progressive disclosure" model used by Claude
|
|
743
|
+
* Agent Skills: only the skill's `name` + `description` are injected up
|
|
744
|
+
* front, and the full SKILL.md body is read on demand (via the agent's
|
|
745
|
+
* `readFile` tool) once the agent decides a skill is relevant.
|
|
746
|
+
*
|
|
747
|
+
* Assumes toolkits are injected into the sandbox at `/app`, so a skill at
|
|
748
|
+
* `<toolkitDir>/.claude/skills/<name>/SKILL.md` is readable at
|
|
749
|
+
* `/app/.claude/skills/<name>/SKILL.md` (see DockerSandboxProvider).
|
|
750
|
+
*
|
|
751
|
+
* Returns an empty string if there are no skills (so callers can append it
|
|
752
|
+
* unconditionally without producing an empty trailing section).
|
|
753
|
+
*/
|
|
754
|
+
declare function buildSkillsPromptAddendum(skills: Skill[]): string;
|
|
755
|
+
|
|
756
|
+
export { type AgentAdapter, type AgentConfig, AgentConfigSchema, type AgentResult, AssertionScorer, type BaselineResult, type BenchmarkInput, type BenchmarkResult, CommandScorer, DiffScorer, type DiffStats, LocalizationScorer, type McpServerConfig, McpServerConfigSchema, type PatchApplyResult, RegressionScorer, type Run, RunSchema, type RunSingleInput, type RunSingleResult, type SandboxHandle, type SandboxProvider, type Scorer, type ScorerResult, type Skill, type SkillFrontmatter, SkillFrontmatterSchema, type StepEvent, StepEventSchema, type SuccessCriterion, SuccessCriterionSchema, TapTestResultParser, type TestCase, TestCaseSchema, type TestResultParser, type TestStatus, type TestStatusMap, type Trace, TraceSchema, type ValidationCheck, type ValidationReport, buildSkillsPromptAddendum, discoverSkills, discoverSkillsForToolkits, getOrComputeBaseline, hashFixture, matchAnyGlob, matchGlob, parseDiffStats, parseSkillMarkdown, runBenchmark, runSingle, validateTestCase };
|