@agentgrader/core 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3):
  1. package/dist/index.d.ts +756 -0
  2. package/dist/index.js +1114 -0
  3. package/package.json +33 -0
@@ -0,0 +1,756 @@
1
+ import { z } from 'zod';
2
+ import { AgrDb } from '@agentgrader/store';
3
+
4
/**
 * Zod schema for a single success criterion. It is a plain union of two
 * object shapes:
 *
 * - `{ run, expect: { exit_code } }` - `exit_code` carries a schema default,
 *   so it may be omitted on input and is always a number on output (the
 *   default value itself is not visible in this declaration);
 * - `{ assert }` - a single string.
 *
 * Both objects use "strip" mode, i.e. unknown keys are dropped when parsing.
 */
+ declare const SuccessCriterionSchema: z.ZodUnion<[z.ZodObject<{
5
+ run: z.ZodString;
6
+ expect: z.ZodObject<{
7
+ exit_code: z.ZodDefault<z.ZodNumber>;
8
+ }, "strip", z.ZodTypeAny, {
9
+ exit_code: number;
10
+ }, {
11
+ exit_code?: number | undefined;
12
+ }>;
13
+ }, "strip", z.ZodTypeAny, {
14
+ run: string;
15
+ expect: {
16
+ exit_code: number;
17
+ };
18
+ }, {
19
+ run: string;
20
+ expect: {
21
+ exit_code?: number | undefined;
22
+ };
23
+ }>, z.ZodObject<{
24
+ assert: z.ZodString;
25
+ }, "strip", z.ZodTypeAny, {
26
+ assert: string;
27
+ }, {
28
+ assert: string;
29
+ }>]>;
30
/** Parsed (output) type of `SuccessCriterionSchema`, i.e. with defaults applied. */
+ type SuccessCriterion = z.infer<typeof SuccessCriterionSchema>;
31
/**
 * Zod schema for a test case definition.
 *
 * - Required: `name`, `fixture`, `prompt`, `success` (an array whose elements
 *   have the same two-variant shape as `SuccessCriterionSchema`, inlined here).
 * - Defaulted: `timeout_seconds` - optional on input, always a number on output
 *   (the default value is not visible in this declaration).
 * - Everything else (`id`, `description`, `tags`, `test_command`,
 *   `fail_to_pass`, `pass_to_pass`, `forbid_modified`, `expected_files`,
 *   `solution`, `test_patch`, `created_at`, `image`, `toolkits`) is optional.
 *
 * The object uses "strip" mode, so unknown keys are dropped when parsing.
 */
+ declare const TestCaseSchema: z.ZodObject<{
32
+ id: z.ZodOptional<z.ZodString>;
33
+ name: z.ZodString;
34
+ description: z.ZodOptional<z.ZodString>;
35
+ fixture: z.ZodString;
36
+ prompt: z.ZodString;
37
+ success: z.ZodArray<z.ZodUnion<[z.ZodObject<{
38
+ run: z.ZodString;
39
+ expect: z.ZodObject<{
40
+ exit_code: z.ZodDefault<z.ZodNumber>;
41
+ }, "strip", z.ZodTypeAny, {
42
+ exit_code: number;
43
+ }, {
44
+ exit_code?: number | undefined;
45
+ }>;
46
+ }, "strip", z.ZodTypeAny, {
47
+ run: string;
48
+ expect: {
49
+ exit_code: number;
50
+ };
51
+ }, {
52
+ run: string;
53
+ expect: {
54
+ exit_code?: number | undefined;
55
+ };
56
+ }>, z.ZodObject<{
57
+ assert: z.ZodString;
58
+ }, "strip", z.ZodTypeAny, {
59
+ assert: string;
60
+ }, {
61
+ assert: string;
62
+ }>]>, "many">;
63
+ timeout_seconds: z.ZodDefault<z.ZodNumber>;
64
+ tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
65
+ test_command: z.ZodOptional<z.ZodString>;
66
+ fail_to_pass: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
67
+ pass_to_pass: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
68
+ forbid_modified: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
69
+ expected_files: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
70
+ solution: z.ZodOptional<z.ZodString>;
71
+ test_patch: z.ZodOptional<z.ZodString>;
72
+ created_at: z.ZodOptional<z.ZodString>;
73
+ image: z.ZodOptional<z.ZodString>;
74
+ toolkits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
75
+ }, "strip", z.ZodTypeAny, {
76
+ name: string;
77
+ fixture: string;
78
+ prompt: string;
79
+ success: ({
80
+ run: string;
81
+ expect: {
82
+ exit_code: number;
83
+ };
84
+ } | {
85
+ assert: string;
86
+ })[];
87
+ timeout_seconds: number;
88
+ id?: string | undefined;
89
+ description?: string | undefined;
90
+ tags?: string[] | undefined;
91
+ test_command?: string | undefined;
92
+ fail_to_pass?: string[] | undefined;
93
+ pass_to_pass?: string[] | undefined;
94
+ forbid_modified?: string[] | undefined;
95
+ expected_files?: string[] | undefined;
96
+ solution?: string | undefined;
97
+ test_patch?: string | undefined;
98
+ created_at?: string | undefined;
99
+ image?: string | undefined;
100
+ toolkits?: string[] | undefined;
101
+ }, {
102
+ name: string;
103
+ fixture: string;
104
+ prompt: string;
105
+ success: ({
106
+ run: string;
107
+ expect: {
108
+ exit_code?: number | undefined;
109
+ };
110
+ } | {
111
+ assert: string;
112
+ })[];
113
+ id?: string | undefined;
114
+ description?: string | undefined;
115
+ timeout_seconds?: number | undefined;
116
+ tags?: string[] | undefined;
117
+ test_command?: string | undefined;
118
+ fail_to_pass?: string[] | undefined;
119
+ pass_to_pass?: string[] | undefined;
120
+ forbid_modified?: string[] | undefined;
121
+ expected_files?: string[] | undefined;
122
+ solution?: string | undefined;
123
+ test_patch?: string | undefined;
124
+ created_at?: string | undefined;
125
+ image?: string | undefined;
126
+ toolkits?: string[] | undefined;
127
+ }>;
128
/** Parsed (output) type of `TestCaseSchema`; `timeout_seconds` is always present here. */
+ type TestCase = z.infer<typeof TestCaseSchema>;
129
+
130
/**
 * Zod schema for an agent configuration.
 *
 * - Required: `name`, `model`.
 * - Defaulted: `max_steps` - optional on input, always a number on output
 *   (the default value is not visible in this declaration).
 * - Optional: `id`, `temperature`, `system_prompt`, `tools`, `toolkits`,
 *   `mcp_servers`.
 *
 * `mcp_servers` is a string-keyed record whose values have the same
 * two-variant shape as `McpServerConfigSchema` (inlined here): either
 * `{ command, args?, env? }` or `{ url, type?, headers? }`.
 *
 * The object uses "strip" mode, so unknown keys are dropped when parsing.
 */
+ declare const AgentConfigSchema: z.ZodObject<{
131
+ id: z.ZodOptional<z.ZodString>;
132
+ name: z.ZodString;
133
+ model: z.ZodString;
134
+ max_steps: z.ZodDefault<z.ZodNumber>;
135
+ temperature: z.ZodOptional<z.ZodNumber>;
136
+ system_prompt: z.ZodOptional<z.ZodString>;
137
+ tools: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
138
+ toolkits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
139
+ mcp_servers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnion<[z.ZodObject<{
140
+ command: z.ZodString;
141
+ args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
142
+ env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
143
+ }, "strip", z.ZodTypeAny, {
144
+ command: string;
145
+ args?: string[] | undefined;
146
+ env?: Record<string, string> | undefined;
147
+ }, {
148
+ command: string;
149
+ args?: string[] | undefined;
150
+ env?: Record<string, string> | undefined;
151
+ }>, z.ZodObject<{
152
+ type: z.ZodOptional<z.ZodEnum<["http", "sse"]>>;
153
+ url: z.ZodString;
154
+ headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
155
+ }, "strip", z.ZodTypeAny, {
156
+ url: string;
157
+ type?: "http" | "sse" | undefined;
158
+ headers?: Record<string, string> | undefined;
159
+ }, {
160
+ url: string;
161
+ type?: "http" | "sse" | undefined;
162
+ headers?: Record<string, string> | undefined;
163
+ }>]>>>;
164
+ }, "strip", z.ZodTypeAny, {
165
+ name: string;
166
+ model: string;
167
+ max_steps: number;
168
+ id?: string | undefined;
169
+ toolkits?: string[] | undefined;
170
+ temperature?: number | undefined;
171
+ system_prompt?: string | undefined;
172
+ tools?: string[] | undefined;
173
+ mcp_servers?: Record<string, {
174
+ command: string;
175
+ args?: string[] | undefined;
176
+ env?: Record<string, string> | undefined;
177
+ } | {
178
+ url: string;
179
+ type?: "http" | "sse" | undefined;
180
+ headers?: Record<string, string> | undefined;
181
+ }> | undefined;
182
+ }, {
183
+ name: string;
184
+ model: string;
185
+ id?: string | undefined;
186
+ toolkits?: string[] | undefined;
187
+ max_steps?: number | undefined;
188
+ temperature?: number | undefined;
189
+ system_prompt?: string | undefined;
190
+ tools?: string[] | undefined;
191
+ mcp_servers?: Record<string, {
192
+ command: string;
193
+ args?: string[] | undefined;
194
+ env?: Record<string, string> | undefined;
195
+ } | {
196
+ url: string;
197
+ type?: "http" | "sse" | undefined;
198
+ headers?: Record<string, string> | undefined;
199
+ }> | undefined;
200
+ }>;
201
/** Parsed (output) type of `AgentConfigSchema`; `max_steps` is always present here. */
+ type AgentConfig = z.infer<typeof AgentConfigSchema>;
202
+
203
+ /**
204
+  * Frontmatter for a Claude Agent Skill (`SKILL.md`).
205
+  *
206
+  * This is intentionally a conservative subset of the published Agent Skills
207
+  * spec: `name` and `description` are the two fields that are solidly
208
+  * documented as always loaded into context for skill discovery (progressive
209
+  * disclosure - the rest of SKILL.md is only read on demand). `allowed-tools`,
210
+  * `disallowed-tools`, and `license` are reasonably well documented optional
211
+  * fields. Anything else is passed through unvalidated via `.passthrough()`
212
+  * so we don't reject SKILL.md files that use additional frontmatter we
213
+  * haven't verified.
214
+  */
215
// NOTE(review): the character-set and length limits quoted in the field
// comments below are not expressed in this declaration (both fields are plain
// `z.ZodString`); confirm against the runtime schema whether they are enforced.
+ declare const SkillFrontmatterSchema: z.ZodObject<{
216
+ /** lowercase letters, numbers, hyphens; max 64 chars */
217
+ name: z.ZodString;
218
+ /** third-person description of what the skill does and when to use it; max 1024 chars */
219
+ description: z.ZodString;
220
+ "allowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
221
+ "disallowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
222
+ license: z.ZodOptional<z.ZodString>;
223
+ }, "passthrough", z.ZodTypeAny, z.objectOutputType<{
224
+ /** lowercase letters, numbers, hyphens; max 64 chars */
225
+ name: z.ZodString;
226
+ /** third-person description of what the skill does and when to use it; max 1024 chars */
227
+ description: z.ZodString;
228
+ "allowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
229
+ "disallowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
230
+ license: z.ZodOptional<z.ZodString>;
231
+ }, z.ZodTypeAny, "passthrough">, z.objectInputType<{
232
+ /** lowercase letters, numbers, hyphens; max 64 chars */
233
+ name: z.ZodString;
234
+ /** third-person description of what the skill does and when to use it; max 1024 chars */
235
+ description: z.ZodString;
236
+ "allowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
237
+ "disallowed-tools": z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
238
+ license: z.ZodOptional<z.ZodString>;
239
+ }, z.ZodTypeAny, "passthrough">>;
240
/** Parsed (output) type of `SkillFrontmatterSchema`; extra frontmatter keys are preserved (passthrough). */
+ type SkillFrontmatter = z.infer<typeof SkillFrontmatterSchema>;
241
+ /** A discovered skill: parsed frontmatter, markdown body, and its location on disk. */
242
+ interface Skill {
243
/** frontmatter of SKILL.md, typed as the output of `SkillFrontmatterSchema` */
+ frontmatter: SkillFrontmatter;
244
+ /** markdown body of SKILL.md, with the frontmatter block stripped */
245
+ body: string;
246
+ /** absolute path to the SKILL.md file */
247
+ path: string;
248
+ /** absolute path to the skill's directory (for resolving bundled scripts/resources) */
249
+ dir: string;
250
+ }
251
+ /**
252
+  * MCP server configuration, mirroring the `mcpServers` entries used by
253
+  * `.mcp.json` configs:
254
+  *
255
+  * - stdio servers are launched as a local subprocess and spoken to over
256
+  * stdin/stdout (`command` + optional `args`/`env`).
257
+  * - http/sse servers are remote endpoints reached over HTTP(S) (`url` +
258
+  * optional `headers`).
259
+  */
260
// This is a plain (non-discriminated) union: `type` is optional on the URL
// variant, so the two variants differ by their required key (`command` vs
// `url`). Both objects use "strip" mode, so unknown keys are dropped.
+ declare const McpServerConfigSchema: z.ZodUnion<[z.ZodObject<{
261
+ command: z.ZodString;
262
+ args: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
263
+ env: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
264
+ }, "strip", z.ZodTypeAny, {
265
+ command: string;
266
+ args?: string[] | undefined;
267
+ env?: Record<string, string> | undefined;
268
+ }, {
269
+ command: string;
270
+ args?: string[] | undefined;
271
+ env?: Record<string, string> | undefined;
272
+ }>, z.ZodObject<{
273
+ type: z.ZodOptional<z.ZodEnum<["http", "sse"]>>;
274
+ url: z.ZodString;
275
+ headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
276
+ }, "strip", z.ZodTypeAny, {
277
+ url: string;
278
+ type?: "http" | "sse" | undefined;
279
+ headers?: Record<string, string> | undefined;
280
+ }, {
281
+ url: string;
282
+ type?: "http" | "sse" | undefined;
283
+ headers?: Record<string, string> | undefined;
284
+ }>]>;
285
/** Parsed (output) type of `McpServerConfigSchema`: `{ command, args?, env? }` or `{ url, type?, headers? }`. */
+ type McpServerConfig = z.infer<typeof McpServerConfigSchema>;
286
+
287
/**
 * Zod schema for a run record.
 *
 * - Required: `id`, `testCaseId`, `agentConfigId`, `sandboxProvider`,
 *   `status` (one of "running" | "completed" | "failed"), `createdAt`.
 * - Defaulted (optional on input, always a number on output): `stepsCount`,
 *   `tokensIn`, `tokensOut`, `costUsd`, `durationMs`. The default values are
 *   not visible in this declaration.
 * - Optional: `passed`, `score`, `error`, `finalDiff`, `metrics`,
 *   `completedAt`.
 *
 * NOTE(review): `createdAt` / `completedAt` are plain numbers - presumably
 * epoch timestamps; the unit is not visible here, confirm against the writer.
 *
 * The object uses "strip" mode, so unknown keys are dropped when parsing.
 */
+ declare const RunSchema: z.ZodObject<{
288
+ id: z.ZodString;
289
+ testCaseId: z.ZodString;
290
+ agentConfigId: z.ZodString;
291
+ sandboxProvider: z.ZodString;
292
+ status: z.ZodEnum<["running", "completed", "failed"]>;
293
+ passed: z.ZodOptional<z.ZodBoolean>;
294
+ score: z.ZodOptional<z.ZodNumber>;
295
+ stepsCount: z.ZodDefault<z.ZodNumber>;
296
+ tokensIn: z.ZodDefault<z.ZodNumber>;
297
+ tokensOut: z.ZodDefault<z.ZodNumber>;
298
+ costUsd: z.ZodDefault<z.ZodNumber>;
299
+ durationMs: z.ZodDefault<z.ZodNumber>;
300
+ error: z.ZodOptional<z.ZodString>;
301
+ finalDiff: z.ZodOptional<z.ZodString>;
302
+ metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
303
+ createdAt: z.ZodNumber;
304
+ completedAt: z.ZodOptional<z.ZodNumber>;
305
+ }, "strip", z.ZodTypeAny, {
306
+ status: "running" | "completed" | "failed";
307
+ id: string;
308
+ testCaseId: string;
309
+ agentConfigId: string;
310
+ sandboxProvider: string;
311
+ stepsCount: number;
312
+ tokensIn: number;
313
+ tokensOut: number;
314
+ costUsd: number;
315
+ durationMs: number;
316
+ createdAt: number;
317
+ passed?: boolean | undefined;
318
+ score?: number | undefined;
319
+ error?: string | undefined;
320
+ finalDiff?: string | undefined;
321
+ metrics?: Record<string, any> | undefined;
322
+ completedAt?: number | undefined;
323
+ }, {
324
+ status: "running" | "completed" | "failed";
325
+ id: string;
326
+ testCaseId: string;
327
+ agentConfigId: string;
328
+ sandboxProvider: string;
329
+ createdAt: number;
330
+ passed?: boolean | undefined;
331
+ score?: number | undefined;
332
+ stepsCount?: number | undefined;
333
+ tokensIn?: number | undefined;
334
+ tokensOut?: number | undefined;
335
+ costUsd?: number | undefined;
336
+ durationMs?: number | undefined;
337
+ error?: string | undefined;
338
+ finalDiff?: string | undefined;
339
+ metrics?: Record<string, any> | undefined;
340
+ completedAt?: number | undefined;
341
+ }>;
342
/** Parsed (output) type of `RunSchema`; the defaulted counters are always present here. */
+ type Run = z.infer<typeof RunSchema>;
343
+
344
/**
 * Zod schema for one step event of an agent run.
 *
 * - Required: `index`, `kind` (one of "tool_call" | "tool_result" |
 *   "message" | "thinking"), `timestamp`.
 * - Defaulted (optional on input, always a number on output): `tokensIn`,
 *   `tokensOut`, `costUsd`. The default values are not visible here.
 * - Optional: `tool`, `content`.
 *
 * NOTE(review): `timestamp` is a plain number; its unit is not visible in
 * this declaration - confirm against the emitter.
 *
 * The object uses "strip" mode, so unknown keys are dropped when parsing.
 */
+ declare const StepEventSchema: z.ZodObject<{
345
+ index: z.ZodNumber;
346
+ kind: z.ZodEnum<["tool_call", "tool_result", "message", "thinking"]>;
347
+ tool: z.ZodOptional<z.ZodString>;
348
+ tokensIn: z.ZodDefault<z.ZodNumber>;
349
+ tokensOut: z.ZodDefault<z.ZodNumber>;
350
+ costUsd: z.ZodDefault<z.ZodNumber>;
351
+ timestamp: z.ZodNumber;
352
+ content: z.ZodOptional<z.ZodString>;
353
+ }, "strip", z.ZodTypeAny, {
354
+ tokensIn: number;
355
+ tokensOut: number;
356
+ costUsd: number;
357
+ index: number;
358
+ kind: "message" | "tool_call" | "tool_result" | "thinking";
359
+ timestamp: number;
360
+ tool?: string | undefined;
361
+ content?: string | undefined;
362
+ }, {
363
+ index: number;
364
+ kind: "message" | "tool_call" | "tool_result" | "thinking";
365
+ timestamp: number;
366
+ tokensIn?: number | undefined;
367
+ tokensOut?: number | undefined;
368
+ costUsd?: number | undefined;
369
+ tool?: string | undefined;
370
+ content?: string | undefined;
371
+ }>;
372
/** Parsed (output) type of `StepEventSchema`; the token and cost counters are always present here. */
+ type StepEvent = z.infer<typeof StepEventSchema>;
373
/**
 * Zod schema for the trace of a run: a `runId` plus an array of `steps`.
 * Each step has the same shape as `StepEventSchema` (inlined here), including
 * the defaulted `tokensIn` / `tokensOut` / `costUsd` counters.
 *
 * The objects use "strip" mode, so unknown keys are dropped when parsing.
 */
+ declare const TraceSchema: z.ZodObject<{
374
+ runId: z.ZodString;
375
+ steps: z.ZodArray<z.ZodObject<{
376
+ index: z.ZodNumber;
377
+ kind: z.ZodEnum<["tool_call", "tool_result", "message", "thinking"]>;
378
+ tool: z.ZodOptional<z.ZodString>;
379
+ tokensIn: z.ZodDefault<z.ZodNumber>;
380
+ tokensOut: z.ZodDefault<z.ZodNumber>;
381
+ costUsd: z.ZodDefault<z.ZodNumber>;
382
+ timestamp: z.ZodNumber;
383
+ content: z.ZodOptional<z.ZodString>;
384
+ }, "strip", z.ZodTypeAny, {
385
+ tokensIn: number;
386
+ tokensOut: number;
387
+ costUsd: number;
388
+ index: number;
389
+ kind: "message" | "tool_call" | "tool_result" | "thinking";
390
+ timestamp: number;
391
+ tool?: string | undefined;
392
+ content?: string | undefined;
393
+ }, {
394
+ index: number;
395
+ kind: "message" | "tool_call" | "tool_result" | "thinking";
396
+ timestamp: number;
397
+ tokensIn?: number | undefined;
398
+ tokensOut?: number | undefined;
399
+ costUsd?: number | undefined;
400
+ tool?: string | undefined;
401
+ content?: string | undefined;
402
+ }>, "many">;
403
+ }, "strip", z.ZodTypeAny, {
404
+ runId: string;
405
+ steps: {
406
+ tokensIn: number;
407
+ tokensOut: number;
408
+ costUsd: number;
409
+ index: number;
410
+ kind: "message" | "tool_call" | "tool_result" | "thinking";
411
+ timestamp: number;
412
+ tool?: string | undefined;
413
+ content?: string | undefined;
414
+ }[];
415
+ }, {
416
+ runId: string;
417
+ steps: {
418
+ index: number;
419
+ kind: "message" | "tool_call" | "tool_result" | "thinking";
420
+ timestamp: number;
421
+ tokensIn?: number | undefined;
422
+ tokensOut?: number | undefined;
423
+ costUsd?: number | undefined;
424
+ tool?: string | undefined;
425
+ content?: string | undefined;
426
+ }[];
427
+ }>;
428
/** Parsed (output) type of `TraceSchema`: a run id plus its ordered array of step events. */
+ type Trace = z.infer<typeof TraceSchema>;
429
+
430
/** Outcome reported by `SandboxHandle.applyPatch`. */
+ interface PatchApplyResult {
431
+ /** true if the patch was applied successfully (with or without repair) */
432
+ applied: boolean;
433
+ /** true if a fallback/repair strategy (3-way merge or `patch --fuzz`) was needed */
434
+ repaired: boolean;
435
+ /** combined output/diagnostics from the apply attempt(s) */
436
+ output: string;
437
+ }
438
/**
 * Handle to one sandbox instance, as returned by `SandboxProvider.create`.
 * All operations are asynchronous.
 */
+ interface SandboxHandle {
439
/**
 * Runs `cmd` and resolves with its captured stdout, stderr and exit code.
 * NOTE(review): shell semantics, working directory and whether a non-zero
 * exit rejects are not visible here - confirm against the provider.
 */
+ exec(cmd: string): Promise<{
440
+ stdout: string;
441
+ stderr: string;
442
+ exitCode: number;
443
+ }>;
444
/** Writes `content` to `path`. NOTE(review): path base/encoding not visible here. */
+ writeFile(path: string, content: string): Promise<void>;
445
/** Resolves with the content of `path` as a string. */
+ readFile(path: string): Promise<string>;
446
/** Resolves with a diff string; presumably `git diff` of the working tree - confirm. */
+ gitDiff(): Promise<string>;
447
+ /**
448
+  * Applies a unified diff to the sandbox's working tree.
449
+  *
450
+  * Mirrors SWE-bench's patch-application robustness: tries `git apply`
451
+  * first, then falls back to `git apply --3way`, then `patch --fuzz=3`.
452
+  * Implementations should report whether a fallback ("repair") was needed.
453
+  */
454
+ applyPatch(diff: string): Promise<PatchApplyResult>;
455
/** Tears the sandbox down. NOTE(review): whether repeated calls are safe is not visible here. */
+ destroy(): Promise<void>;
456
+ }
457
/** Factory for sandboxes; each `create` call resolves with a `SandboxHandle`. */
+ interface SandboxProvider {
458
/** Identifier of this provider. */
+ readonly name: string;
459
/**
 * Creates a new sandbox.
 * NOTE(review): `image` and `gitSnapshot` are undocumented in this
 * declaration; presumably a container image reference and the source
 * snapshot to load into the sandbox - confirm against the implementation.
 */
+ create(opts: {
460
+ image?: string;
461
+ gitSnapshot?: string;
462
+ /**
463
+  * Absolute paths to local "toolkit" directories to inject into the
464
+  * sandbox, in addition to `gitSnapshot`. A toolkit may contain a `bin/`
465
+  * directory (custom CLI tools, made executable and put on `PATH`) and a
466
+  * `.claude/skills/` directory (Agent Skills documentation, see
467
+  * `runner/skills.ts`).
468
+  */
469
+ toolkits?: string[];
470
+ }): Promise<SandboxHandle>;
471
+ }
472
+
473
/** Value resolved by `AgentAdapter.solve` and passed to scorers as `result`. */
+ interface AgentResult {
474
/** NOTE(review): presumably whether the agent ended on its own rather than being cut off - confirm. */
+ finished: boolean;
475
/** NOTE(review): presumably the agent's final change set as a unified diff - confirm. */
+ finalDiff: string;
476
+ }
477
/** Contract an agent integration implements so the framework can run it against a test case. */
+ interface AgentAdapter {
478
/** Identifier of this adapter. */
+ readonly name: string;
479
+ /**
480
+  * run the agent against a problem in a prepared sandbox.
481
+  * the adapter doesn't know about scoring — it just solves and emits
482
+  * step events. the framework measures everything from the outside.
483
+  */
484
+ solve(input: {
485
+ prompt: string;
486
+ sandbox: SandboxHandle;
487
+ config: AgentConfig;
488
+ onStep: (step: StepEvent) => void;
489
+ }): Promise<AgentResult>;
490
+ }
491
+
492
/** Verdict resolved by `Scorer.score`. */
+ interface ScorerResult {
493
/** Pass/fail verdict of the scorer. */
+ passed: boolean;
494
/** Free-form explanation string accompanying the verdict. */
+ detail: string;
495
/** Optional numeric score. NOTE(review): range/scale is not visible here - confirm per scorer. */
+ score?: number;
496
+ }
497
/**
 * Contract for scorers. `score` receives the test case, the agent's result,
 * the trace and the sandbox, and resolves with a `ScorerResult`. The concrete
 * scorers declared in this file each accept only the subset of these inputs
 * they need.
 */
+ interface Scorer {
498
/** Identifier of this scorer. */
+ readonly name: string;
499
+ score(input: {
500
+ testCase: TestCase;
501
+ result: AgentResult;
502
+ trace: Trace;
503
+ sandbox: SandboxHandle;
504
+ }): Promise<ScorerResult>;
505
+ }
506
+
507
+ /**
508
+  * Outcome of a single test. Raw test-runner output is parsed into a per-test
509
+  * status map of these values, so scorers can compare individual test outcomes
510
+  * against FAIL_TO_PASS / PASS_TO_PASS lists (SWE-bench style).
511
+  */
512
+ type TestStatus = "PASS" | "FAIL" | "SKIP";
513
+ /** Maps a test's display name to its outcome ("PASS" | "FAIL" | "SKIP"). */
514
+ type TestStatusMap = Record<string, TestStatus>;
515
/** Contract for parsers that turn raw test-runner output into a `TestStatusMap`. */
+ interface TestResultParser {
516
/** Identifier of this parser. */
+ readonly name: string;
517
/** Parses `output` (raw runner output text) into a map of test name to status. */
+ parse(output: string): TestStatusMap;
518
+ }
519
+ /**
520
+  * Parses TAP (Test Anything Protocol) output, as produced by
521
+  * `node --test --test-reporter=tap` (or `tsx --test --test-reporter=tap`).
522
+  *
523
+  * Handles lines of the form:
524
+  * ok 1 - should succeed on first attempt
525
+  * not ok 2 - should retry on failure and succeed
526
+  * ok 3 - should be skipped # SKIP
527
+  */
528
+ declare class TapTestResultParser implements TestResultParser {
529
/** Parser identifier; always the literal "tap". */
+ readonly name = "tap";
530
/**
 * Parses TAP text into a `TestStatusMap`.
 * NOTE(review): handling of nested subtests, `# TODO` directives and
 * duplicate test names is not visible in this declaration - confirm.
 */
+ parse(output: string): TestStatusMap;
531
+ }
532
+
533
/**
 * Scorer that only needs the test case and the sandbox.
 * NOTE(review): presumably runs the `run` success criteria in the sandbox and
 * compares exit codes with `expect.exit_code` - confirm against the
 * implementation in index.js.
 */
+ declare class CommandScorer implements Scorer {
534
/** Scorer identifier; always the literal "CommandScorer". */
+ readonly name = "CommandScorer";
535
+ score(input: {
536
+ testCase: TestCase;
537
+ sandbox: SandboxHandle;
538
+ }): Promise<ScorerResult>;
539
+ }
540
+
541
/**
 * Scorer that only needs the test case and the run trace (no sandbox access).
 * NOTE(review): presumably evaluates the `assert` success criteria against the
 * trace; the assertion language is not visible in this declaration - confirm
 * against the implementation in index.js.
 */
+ declare class AssertionScorer implements Scorer {
542
/** Scorer identifier; always the literal "AssertionScorer". */
+ readonly name = "AssertionScorer";
543
+ score(input: {
544
+ testCase: TestCase;
545
+ trace: Trace;
546
+ }): Promise<ScorerResult>;
547
+ }
548
+
549
+ /**
550
+  * SWE-bench style regression scorer.
551
+  *
552
+  * - FAIL_TO_PASS: tests that were failing before the agent's patch and MUST
553
+  * pass afterwards. This is the actual "did the agent fix the bug" signal.
554
+  * - PASS_TO_PASS: tests that were passing before the agent's patch and MUST
555
+  * keep passing afterwards (no regressions introduced).
556
+  * - forbid_modified: acts as a tamper guard - if the agent edited test files
557
+  * (or other forbidden paths) to make the suite pass trivially, fail hard.
558
+  *
559
+  * If `test_command` / fail_to_pass / pass_to_pass are not configured on the
560
+  * test case, this scorer is a no-op pass (keeps it backwards compatible).
561
+  */
562
+ declare class RegressionScorer implements Scorer {
563
/** Scorer identifier; always the literal "RegressionScorer". */
+ readonly name = "RegressionScorer";
564
/** Scores one run; `baseline` is optional and typed like `BaselineResult.statusMap`. */
+ score(input: {
565
+ testCase: TestCase;
566
+ sandbox: SandboxHandle;
567
+ /** pre-patch test status map, used to avoid penalizing already-broken PASS_TO_PASS tests */
568
+ baseline?: TestStatusMap;
569
+ }): Promise<ScorerResult>;
570
+ }
571
+
572
/** Basic statistics of a unified diff, as returned by `parseDiffStats`. */
+ interface DiffStats {
573
/** Paths of the files touched by the diff. */
+ filesChanged: string[];
574
/** Count of added lines. */
+ insertions: number;
575
/** Count of removed lines. */
+ deletions: number;
576
/** NOTE(review): presumably `insertions + deletions` - confirm in parseDiffStats. */
+ linesChanged: number;
577
+ }
578
+ /** Parses a unified diff (as produced by `git diff`) into basic stats: files changed, insertions, deletions and total changed lines. */
579
+ declare function parseDiffStats(diff: string): DiffStats;
580
+ /**
581
+  * Scores the "scope" of an agent's patch.
582
+  *
583
+  * SWE-bench analyses showed agents frequently produce patches that are far
584
+  * larger / touch far more files than the gold patch - editing unrelated
585
+  * code, leaving debug statements, etc. This scorer reports diff stats and,
586
+  * when a gold `solution` patch is available (loaded as raw diff content by
587
+  * the CLI's `loadTestCase` helper), penalizes patches that are much larger
588
+  * than the gold patch in terms of total changed lines.
589
+  */
590
+ declare class DiffScorer implements Scorer {
591
/** Scorer identifier; always the literal "DiffScorer". */
+ readonly name = "DiffScorer";
592
/**
 * Scores from the test case and the agent result only (no sandbox or trace).
 * NOTE(review): the size threshold that triggers the penalty is not visible
 * in this declaration - confirm in index.js.
 */
+ score(input: {
593
+ testCase: TestCase;
594
+ result: AgentResult;
595
+ }): Promise<ScorerResult>;
596
+ }
597
+
598
+ /**
599
+  * Measures whether the agent edited the "right" files, using
600
+  * `expected_files` glob patterns (typically derived from the gold patch).
601
+  *
602
+  * - precision: fraction of files the agent touched that match an expected pattern
603
+  * - recall: fraction of expected patterns that were matched by at least one touched file
604
+  * - f1: harmonic mean of precision and recall
605
+  */
606
+ declare class LocalizationScorer implements Scorer {
607
/** Scorer identifier; always the literal "LocalizationScorer". */
+ readonly name = "LocalizationScorer";
608
/**
 * Scores from the test case and the agent result only (no sandbox or trace).
 * NOTE(review): behavior when `expected_files` is absent, and how the metrics
 * map onto `passed` / `score`, are not visible here - confirm in index.js.
 */
+ score(input: {
609
+ testCase: TestCase;
610
+ result: AgentResult;
611
+ }): Promise<ScorerResult>;
612
+ }
613
+
614
/** Input of `runSingle`: one test case, one agent configuration, and the collaborators needed to run it. */
+ interface RunSingleInput {
615
+ testCase: TestCase;
616
+ agentConfig: AgentConfig;
617
+ adapter: AgentAdapter;
618
+ sandboxProvider: SandboxProvider;
619
/** Optional store handle. NOTE(review): presumably used to persist the run when given - confirm. */
+ db?: AgrDb;
620
/** Caller-supplied run id; `RunSingleResult` also carries a `runId`. */
+ runId: string;
621
+ }
622
/**
 * Result of a single run as resolved by `runSingle`. The field names mirror
 * the corresponding fields of `RunSchema`, but here the counters are
 * required and there is no `status` or timestamp.
 */
+ interface RunSingleResult {
623
+ runId: string;
624
+ passed: boolean;
625
+ score?: number;
626
+ stepsCount: number;
627
+ tokensIn: number;
628
+ tokensOut: number;
629
+ costUsd: number;
630
+ durationMs: number;
631
/** Error text, when present. NOTE(review): presumably set only when the run failed - confirm. */
+ error?: string;
632
+ finalDiff?: string;
633
/** Free-form extra metrics; keys and value shapes are not constrained by this type. */
+ metrics?: Record<string, any>;
634
+ }
635
/**
 * Runs one test case with one agent configuration and resolves with a
 * `RunSingleResult`.
 * NOTE(review): whether failures reject the promise or are reported through
 * `error` / `passed: false` is not visible in this declaration - confirm.
 */
+ declare function runSingle(input: RunSingleInput): Promise<RunSingleResult>;
636
+
637
/** Input of `runBenchmark`: lists of test cases and agent configurations plus shared collaborators. */
+ interface BenchmarkInput {
638
+ testCases: TestCase[];
639
+ agentConfigs: AgentConfig[];
640
+ adapter: AgentAdapter;
641
+ sandboxProvider: SandboxProvider;
642
/** Optional store handle. NOTE(review): presumably used to persist runs when given - confirm. */
+ db?: AgrDb;
643
/** NOTE(review): presumably the maximum number of runs in flight; the default is not visible here. */
+ concurrency?: number;
644
/**
 * Optional callback receiving a run result extended with `testCaseId`,
 * `agentConfigId` and a `status` of "running" | "completed" | "failed".
 * NOTE(review): when and how often it fires is not visible here - confirm.
 */
+ onRunUpdate?: (run: RunSingleResult & {
645
+ testCaseId: string;
646
+ agentConfigId: string;
647
+ status: "running" | "completed" | "failed";
648
+ }) => void;
649
+ }
650
/** Value resolved by `runBenchmark`. */
+ interface BenchmarkResult {
651
/** One entry per executed run. NOTE(review): ordering is not visible here - confirm. */
+ runs: RunSingleResult[];
652
+ }
653
/**
 * Runs a benchmark described by `BenchmarkInput` and resolves with the
 * collected run results.
 * NOTE(review): presumably executes every test case against every agent
 * config (cross product), bounded by `concurrency` - confirm in index.js.
 */
+ declare function runBenchmark(input: BenchmarkInput): Promise<BenchmarkResult>;
654
+
655
+ /**
656
+  * Minimal glob matcher (no external dependency) supporting:
657
+  * - `*` matches any sequence of characters except `/`
658
+  * - `**` matches any sequence of characters, including `/`
659
+  * - `?` matches a single character except `/`
660
+  *
661
+  * Used for `forbid_modified` / `expected_files` patterns in test cases.
662
+  */
663
// NOTE(review): anchoring (whole path vs. substring), case sensitivity and
// path-separator normalization are not visible in this declaration - confirm.
+ declare function matchGlob(pattern: string, filePath: string): boolean;
664
/**
 * Multi-pattern companion of `matchGlob`.
 * NOTE(review): presumably returns true when `filePath` matches at least one
 * of `patterns` (and false for an empty list) - confirm in index.js.
 */
+ declare function matchAnyGlob(patterns: string[], filePath: string): boolean;
665
+
666
+ /**
667
+  * Computes a stable SHA-256 hash over the contents (and relative paths) of a
668
+  * fixture directory. Used to key cached baseline test results - if the
669
+  * fixture changes, the baseline must be recomputed.
670
+  */
671
// Synchronous: returns the hash as a string rather than a Promise.
// NOTE(review): digest encoding and any ignored paths are not visible here.
+ declare function hashFixture(fixtureDir: string): string;
672
+
673
/** Pre-patch test baseline for a fixture, as resolved by `getOrComputeBaseline`. */
+ interface BaselineResult {
674
/** Hash string identifying the fixture; NOTE(review): presumably the `hashFixture` value - confirm. */
+ fixtureHash: string;
675
/** Per-test outcomes recorded for the baseline. */
+ statusMap: TestStatusMap;
676
+ /** true if this baseline came from the cache rather than a fresh run */
677
+ cached: boolean;
678
+ }
679
+ /**
680
+  * Computes (or loads from cache) the pre-patch test status map for a test
681
+  * case's fixture. Used by RegressionScorer so PASS_TO_PASS tests that were
682
+  * already broken before the agent touched anything don't unfairly penalize
683
+  * the run.
684
+  *
685
+  * Returns `undefined` if the test case has no `test_command` configured -
686
+  * baseline computation is then skipped entirely.
687
+  */
688
// NOTE(review): `db` is optional; presumably it backs the cache, so without it
// the baseline is always computed fresh - confirm in index.js.
+ declare function getOrComputeBaseline(input: {
689
+ testCase: TestCase;
690
+ sandboxProvider: SandboxProvider;
691
+ db?: AgrDb;
692
+ }): Promise<BaselineResult | undefined>;
693
+
694
/** One named check inside a `ValidationReport`. */
+ interface ValidationCheck {
695
/** Name identifying the check. */
+ name: string;
696
/** Whether this check passed. */
+ passed: boolean;
697
/** Free-form explanation string for the outcome. */
+ detail: string;
698
+ }
699
/** Value resolved by `validateTestCase`. */
+ interface ValidationReport {
700
/** Overall verdict. NOTE(review): presumably true only when every check passed - confirm. */
+ ok: boolean;
701
/** The individual checks that were evaluated. */
+ checks: ValidationCheck[];
702
+ }
703
+ /**
704
+  * validates a test case the way SWE-bench validates a candidate task
705
+  * instance before it's added to the benchmark:
706
+  *
707
+  * 1. static check - required fields are present and internally consistent.
708
+  * 2. pre-patch run - FAIL_TO_PASS tests must currently be FAILING and
709
+  * PASS_TO_PASS tests must currently be PASSING (on the raw fixture, with
710
+  * `test_patch` applied if present).
711
+  * 3. post-patch run - if a gold `solution` patch is provided, apply it and
712
+  * verify FAIL_TO_PASS tests now PASS and PASS_TO_PASS tests still PASS.
713
+  *
714
+  * this catches the most common authoring mistakes: typo'd test names,
715
+  * tests that pass/fail for the wrong reason, gold patches that don't
716
+  * actually fix the issue, and forbidden-file globs that never match.
717
+  */
718
+ declare function validateTestCase(input: {
719
+ testCase: TestCase;
720
+ sandboxProvider: SandboxProvider;
721
+ }): Promise<ValidationReport>;
722
+
723
+ /**
724
+  * Parses a SKILL.md file's content into validated frontmatter + body.
725
+  *
726
+  * Throws if the file has no `---`-delimited YAML frontmatter block, or if
727
+  * the frontmatter doesn't satisfy `SkillFrontmatterSchema` (e.g. missing
728
+  * `name`/`description`).
729
+  */
730
// `path` and `dir` share their names with the `Skill.path` / `Skill.dir` fields.
// NOTE(review): presumably they are copied into the result unchanged and the
// function does no file I/O itself (content is passed in) - confirm.
+ declare function parseSkillMarkdown(content: string, path: string, dir: string): Skill;
731
+ /**
732
+  * Discovers all skills bundled in a toolkit directory, i.e. every
733
+  * `<toolkitDir>/.claude/skills/<skill-name>/SKILL.md`.
734
+  *
735
+  * Returns an empty array if the toolkit has no `.claude/skills` directory.
736
+  */
737
// Synchronous: returns the skills directly rather than a Promise.
// NOTE(review): whether an invalid SKILL.md throws or is skipped, and the
// ordering of the result, are not visible in this declaration - confirm.
+ declare function discoverSkills(toolkitDir: string): Skill[];
738
+ /** Discovers skills across multiple toolkit directories, in order. Synchronous; returns one flat array. */
739
+ declare function discoverSkillsForToolkits(toolkitDirs: string[]): Skill[];
740
+ /**
741
+  * Builds a system-prompt addendum that tells the agent which skills are
742
+  * available, mirroring the "progressive disclosure" model used by Claude
743
+  * Agent Skills: only the skill's `name` + `description` are injected up
744
+  * front, and the full SKILL.md body is read on demand (via the agent's
745
+  * `readFile` tool) once the agent decides a skill is relevant.
746
+  *
747
+  * Assumes toolkits are injected into the sandbox at `/app`, so a skill at
748
+  * `<toolkitDir>/.claude/skills/<name>/SKILL.md` is readable at
749
+  * `/app/.claude/skills/<name>/SKILL.md` (see DockerSandboxProvider).
750
+  *
751
+  * Returns an empty string if there are no skills (so callers can append it
752
+  * unconditionally without producing an empty trailing section).
753
+  */
754
// NOTE(review): the exact wording/format of the generated text is not visible
// in this declaration; treat it as an implementation detail of index.js.
+ declare function buildSkillsPromptAddendum(skills: Skill[]): string;
755
+
756
// Public API of the package entry point. Names marked `type` are type-only
// exports (interfaces and aliases); the unmarked names are runtime values:
// the Zod schemas, the scorer/parser classes and the helper functions.
+ export { type AgentAdapter, type AgentConfig, AgentConfigSchema, type AgentResult, AssertionScorer, type BaselineResult, type BenchmarkInput, type BenchmarkResult, CommandScorer, DiffScorer, type DiffStats, LocalizationScorer, type McpServerConfig, McpServerConfigSchema, type PatchApplyResult, RegressionScorer, type Run, RunSchema, type RunSingleInput, type RunSingleResult, type SandboxHandle, type SandboxProvider, type Scorer, type ScorerResult, type Skill, type SkillFrontmatter, SkillFrontmatterSchema, type StepEvent, StepEventSchema, type SuccessCriterion, SuccessCriterionSchema, TapTestResultParser, type TestCase, TestCaseSchema, type TestResultParser, type TestStatus, type TestStatusMap, type Trace, TraceSchema, type ValidationCheck, type ValidationReport, buildSkillsPromptAddendum, discoverSkills, discoverSkillsForToolkits, getOrComputeBaseline, hashFixture, matchAnyGlob, matchGlob, parseDiffStats, parseSkillMarkdown, runBenchmark, runSingle, validateTestCase };