@agentgrader/core 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/dist/index.d.ts +756 -0
  2. package/dist/index.js +1114 -0
  3. package/package.json +33 -0
package/dist/index.js ADDED
@@ -0,0 +1,1114 @@
1
+ import { z } from 'zod';
2
+ import { getCachedBaseline, saveCachedBaseline, createRun, updateRun, addTrace } from '@agentgrader/store';
3
+ import { createStep, createWorkflow } from '@mastra/core/workflows';
4
+ import { createHash, randomUUID } from 'crypto';
5
+ import { readFileSync, readdirSync, statSync, existsSync } from 'fs';
6
+ import { relative, sep, join, resolve } from 'path';
7
+ import { parse } from 'yaml';
8
+
9
+ // src/schema/test-case.ts
10
/**
 * One success criterion of a test case. It is either
 *  - a command criterion `{ run, expect: { exit_code } }` (exit_code defaults to 0), or
 *  - an assertion criterion `{ assert }` holding an expression string.
 * The command shape is tried first, matching the union order below.
 */
var SuccessCriterionSchema = (() => {
  const commandCriterion = z.object({
    run: z.string(),
    expect: z.object({ exit_code: z.number().default(0) })
  });
  const assertionCriterion = z.object({ assert: z.string() });
  return z.union([commandCriterion, assertionCriterion]);
})();
21
/**
 * Schema for a benchmark test case. Only `name`, `fixture`, `prompt` and
 * `success` are required; `timeout_seconds` defaults to 300 and every
 * SWE-bench style field is optional.
 */
var TestCaseSchema = (() => {
  const optionalText = () => z.string().optional();
  const optionalTextList = () => z.array(z.string()).optional();
  return z.object({
    /** Can be inferred from file name/folder if omitted. */
    id: optionalText(),
    name: z.string(),
    description: optionalText(),
    fixture: z.string(),
    prompt: z.string(),
    success: z.array(SuccessCriterionSchema),
    timeout_seconds: z.number().default(300),
    // --- SWE-bench based fields (all optional, backwards compatible) ---
    tags: optionalTextList(),
    /** Shell command used to run the test suite (TAP output expected for node:test). */
    test_command: optionalText(),
    /** Names of tests that must flip from failing -> passing (FAIL_TO_PASS). */
    fail_to_pass: optionalTextList(),
    /** Names of tests that must remain passing throughout (PASS_TO_PASS). */
    pass_to_pass: optionalTextList(),
    /** Glob patterns of files the agent must NOT modify (e.g. test files - tamper guard). */
    forbid_modified: optionalTextList(),
    /** Glob patterns of files the agent is expected to touch (for localization scoring). */
    expected_files: optionalTextList(),
    /** Path (relative to the agr.yaml) to a gold-standard patch/diff for this task. */
    solution: optionalText(),
    /** Path (relative to the agr.yaml) to a patch that adds/updates the test suite itself. */
    test_patch: optionalText(),
    /** Original creation date of the underlying issue/PR (contamination/date-cutoff checks). */
    created_at: optionalText(),
    /** Custom Docker image to run the sandbox from (defaults to the provider's default). */
    image: optionalText(),
    /**
     * Paths to toolkit directories (custom CLI tools + .claude/skills/) to inject
     * into the sandbox and surface to the agent via the system prompt, in addition
     * to any toolkits configured on the agent.
     */
    toolkits: optionalTextList()
  });
})();
55
/**
 * Schema for the YAML frontmatter of a SKILL.md file. Unknown keys are kept
 * (`passthrough`).
 */
var SkillFrontmatterSchema = (() => {
  // Lowercase alphanumeric segments separated by single hyphens.
  const namePattern = /^[a-z0-9]+(-[a-z0-9]+)*$/;
  const toolList = () => z.array(z.string()).optional();
  const shape = {
    /** lowercase letters, numbers, hyphens; max 64 chars */
    name: z.string().max(64).regex(namePattern, "name must be lowercase letters, numbers, and hyphens"),
    /** third-person description of what the skill does and when to use it; max 1024 chars */
    description: z.string().max(1024),
    "allowed-tools": toolList(),
    "disallowed-tools": toolList(),
    license: z.string().optional()
  };
  return z.object(shape).passthrough();
})();
64
/**
 * Configuration of a single MCP server. Two shapes are accepted, tried in
 * this order:
 *  - command-based: `{ command, args?, env? }`
 *  - URL-based: `{ type?: "http" | "sse", url, headers? }`
 */
var McpServerConfigSchema = (() => {
  const stringMap = () => z.record(z.string()).optional();
  const commandServer = z.object({
    command: z.string(),
    args: z.array(z.string()).optional(),
    env: stringMap()
  });
  const urlServer = z.object({
    type: z.enum(["http", "sse"]).optional(),
    url: z.string(),
    headers: stringMap()
  });
  return z.union([commandServer, urlServer]);
})();
76
+
77
+ // src/schema/agent-config.ts
78
/**
 * Schema for an agent configuration. `name` and `model` are required;
 * `max_steps` defaults to 30.
 */
var AgentConfigSchema = (() => {
  const optionalTextList = () => z.array(z.string()).optional();
  const shape = {
    id: z.string().optional(),
    name: z.string(),
    model: z.string(),
    max_steps: z.number().default(30),
    temperature: z.number().optional(),
    system_prompt: z.string().optional(),
    tools: optionalTextList(),
    /**
     * Paths to toolkit directories (custom CLI tools + .claude/skills/) to inject
     * into the sandbox and surface to the agent via the system prompt.
     */
    toolkits: optionalTextList(),
    /** MCP servers to connect to and expose as additional tools, keyed by name. */
    mcp_servers: z.record(McpServerConfigSchema).optional()
  };
  return z.object(shape);
})();
92
/**
 * Schema for a persisted benchmark run record. All counters (steps, tokens,
 * cost, duration) default to 0.
 */
var RunSchema = (() => {
  const counter = () => z.number().default(0);
  return z.object({
    id: z.string(),
    testCaseId: z.string(),
    agentConfigId: z.string(),
    sandboxProvider: z.string(),
    status: z.enum(["running", "completed", "failed"]),
    passed: z.boolean().optional(),
    score: z.number().optional(),
    stepsCount: counter(),
    tokensIn: counter(),
    tokensOut: counter(),
    costUsd: counter(),
    durationMs: counter(),
    error: z.string().optional(),
    finalDiff: z.string().optional(),
    /**
     * Extended scoring metrics: regression (FAIL_TO_PASS/PASS_TO_PASS), diff
     * stats, localization precision/recall, etc. Stored as JSON.
     */
    metrics: z.record(z.any()).optional(),
    createdAt: z.number(),
    completedAt: z.number().optional()
  });
})();
113
/**
 * Schema for one step event emitted while an agent runs: its index, kind,
 * optional tool name, token/cost counters (default 0), a timestamp and
 * optional content.
 */
var StepEventSchema = (() => {
  const counter = () => z.number().default(0);
  const stepKinds = ["tool_call", "tool_result", "message", "thinking"];
  return z.object({
    index: z.number(),
    kind: z.enum(stepKinds),
    tool: z.string().optional(),
    tokensIn: counter(),
    tokensOut: counter(),
    costUsd: counter(),
    timestamp: z.number(),
    content: z.string().optional()
  });
})();
123
/** Schema for a run trace: the run id plus the ordered list of step events. */
var TraceSchema = (() => {
  const shape = {
    runId: z.string(),
    steps: z.array(StepEventSchema)
  };
  return z.object(shape);
})();
127
+
128
+ // src/adapters/test-result-parser.ts
129
/**
 * Parses TAP (Test Anything Protocol) output into a `{ testName: status }` map.
 *
 * Only test-point lines (`ok ...` / `not ok ...`) are considered; everything
 * else (plan, version line, YAML diagnostics, comments) is ignored.
 */
var TapTestResultParser = class {
  name = "tap";
  /**
   * @param {string} output Raw test-runner output (stdout and/or stderr).
   * @returns {Record<string, "PASS" | "FAIL" | "SKIP">} Status per test name.
   *   When the same name appears more than once, the last occurrence wins.
   */
  parse(output) {
    const statuses = {};
    // The test number is optional in TAP ("ok - description" is valid), and
    // `\b` keeps words such as "okay" from being read as a test point.
    const testPointRe = /^\s*(ok|not ok)\b(?:\s+\d+)?\s*-?\s*(.*)$/;
    // A directive/comment starts at a "#" that begins the description or
    // follows whitespace; a "#" embedded in a word ("c#sharp") is part of the name.
    const directiveStartRe = /(?:^|\s)#/;
    for (const rawLine of output.split("\n")) {
      const line = rawLine.replace(/\r$/, "");
      const match = testPointRe.exec(line);
      if (!match) continue;
      const [, statusToken, rest] = match;
      let name = rest.trim();
      let directive;
      const hashMatch = directiveStartRe.exec(name);
      if (hashMatch) {
        const hashIdx = hashMatch.index + hashMatch[0].length - 1;
        directive = name.slice(hashIdx + 1).trim();
        name = name.slice(0, hashIdx).trim();
      }
      // A test point without a description cannot be matched by name.
      if (!name) continue;
      // SKIP and TODO directives are both reported as "SKIP", whatever the ok/not ok token says.
      if (directive && /^(skip|todo)/i.test(directive)) {
        statuses[name] = "SKIP";
        continue;
      }
      statuses[name] = statusToken === "ok" ? "PASS" : "FAIL";
    }
    return statuses;
  }
};
156
+
157
+ // src/scorers/command-scorer.ts
158
/**
 * Scorer that executes every `run` success criterion in the sandbox and
 * compares the exit code with the expected one (0 when not specified).
 * Criteria without a `run` key are ignored. Stops at the first failure.
 */
var CommandScorer = class {
  name = "CommandScorer";
  /**
   * @param input Object with `testCase.success` (criteria list) and `sandbox`
   *   (must expose `exec(cmd)` resolving to `{ exitCode, stdout, stderr }`).
   * @returns `{ passed, detail }`; never rejects because of a failing command.
   */
  async score(input) {
    for (const criterion of input.testCase.success) {
      if (!("run" in criterion)) continue;
      const { run: cmd } = criterion;
      const expectedExitCode = criterion.expect?.exit_code ?? 0;
      try {
        const { exitCode, stdout, stderr } = await input.sandbox.exec(cmd);
        if (exitCode !== expectedExitCode) {
          const detail = [
            `Command "${cmd}" exited with code ${exitCode}, expected ${expectedExitCode}.`,
            `Stdout: ${stdout}`,
            `Stderr: ${stderr}`
          ].join("\n");
          return { passed: false, detail };
        }
      } catch (err) {
        // Execution itself failed (or returned an unusable result).
        return {
          passed: false,
          detail: `Failed to execute command "${cmd}": ${err.message}`
        };
      }
    }
    return {
      passed: true,
      detail: "All success run commands completed with expected exit codes."
    };
  }
};
189
+
190
+ // src/scorers/assertion-scorer.ts
191
/**
 * Scorer that evaluates every `assert` success criterion as a JavaScript
 * expression over run statistics.
 *
 * Variables available to an expression:
 *  - `steps`: number of trace steps of kind "tool_call" or "message"
 *  - `cost_usd` (alias of `costUsd`): summed `costUsd` of all trace steps
 *  - `timeout_seconds` (alias of `timeoutSeconds`): `testCase.timeout_seconds`
 *  - `duration_ms` (alias of `durationMs`): optional `input.durationMs`;
 *    `undefined` when the caller does not provide it
 */
var AssertionScorer = class {
  name = "AssertionScorer";
  /**
   * @param input `{ testCase, trace, durationMs? }`.
   * @returns `{ passed, detail }`. The first failing assertion ends scoring.
   */
  async score(input) {
    const { testCase, trace } = input;
    const steps = trace.steps.filter(
      (s) => s.kind === "tool_call" || s.kind === "message"
    ).length;
    let costUsd = 0;
    for (const step of trace.steps) {
      costUsd += step.costUsd || 0;
    }
    // The snake_case names below are rewritten to these identifiers, so they
    // must be real parameters of the generated function. Otherwise every
    // expression using them throws a ReferenceError and silently fails.
    const timeoutSeconds = testCase.timeout_seconds;
    const durationMs = input.durationMs;
    for (const criterion of testCase.success) {
      if (!("assert" in criterion)) continue;
      const expression = criterion.assert;
      try {
        const sanitizedExpr = expression
          .replace(/\bcost_usd\b/g, "costUsd")
          .replace(/\btimeout_seconds\b/g, "timeoutSeconds")
          .replace(/\bduration_ms\b/g, "durationMs");
        // SECURITY NOTE(review): this executes arbitrary code taken from the
        // test-case definition. Only load test cases from trusted sources.
        const evaluate = new Function(
          "steps",
          "costUsd",
          "timeoutSeconds",
          "durationMs",
          `try { return Boolean(${sanitizedExpr}); } catch(e) { return false; }`
        );
        const passed = evaluate(steps, costUsd, timeoutSeconds, durationMs);
        if (!passed) {
          return {
            passed: false,
            detail: `Assertion failed: "${expression}" (actual: steps=${steps}, cost_usd=$${costUsd.toFixed(4)})`
          };
        }
      } catch (err) {
        // Reached when the expression does not compile (syntax error).
        return {
          passed: false,
          detail: `Error evaluating assertion "${expression}": ${err.message}`
        };
      }
    }
    return {
      passed: true,
      detail: "All assertion checks passed successfully."
    };
  }
};
232
+
233
+ // src/runner/glob.ts
234
/**
 * Tests a file path against a single glob pattern.
 *
 * Supported syntax: `*` (any run of non-"/" characters), `?` (one non-"/"
 * character), `**` (any characters, including "/"), and a whole-segment
 * `**` followed by "/" (zero or more complete directories). A leading "./"
 * on either argument is ignored. The whole path must match.
 *
 * @param {string} pattern Glob pattern.
 * @param {string} filePath Path to test ("/"-separated).
 * @returns {boolean} Whether the path matches.
 */
function matchGlob(pattern, filePath) {
  const normalizedPattern = pattern.replace(/^\.\//, "");
  const normalizedPath = filePath.replace(/^\.\//, "");
  const regex = new RegExp(`^${globToRegExpSource(normalizedPattern)}$`);
  return regex.test(normalizedPath);
}

/**
 * Tests a file path against several glob patterns.
 *
 * @param {string[]} patterns Glob patterns.
 * @param {string} filePath Path to test.
 * @returns {boolean} True when at least one pattern matches.
 */
function matchAnyGlob(patterns, filePath) {
  return patterns.some((pattern) => matchGlob(pattern, filePath));
}

/**
 * Translates a glob pattern into regular-expression source (no anchors).
 *
 * @param {string} pattern Glob pattern.
 * @returns {string} Source text suitable for `new RegExp(...)`.
 */
function globToRegExpSource(pattern) {
  let result = "";
  for (let i = 0; i < pattern.length; i++) {
    const c = pattern[i];
    if (c === "*") {
      if (pattern[i + 1] === "*") {
        const atSegmentStart = i === 0 || pattern[i - 1] === "/";
        i++;
        if (pattern[i + 1] === "/") {
          i++;
          // A "**" segment followed by "/" stands for zero or more whole
          // directories, so the next name must start right after a "/" or at
          // the beginning. A plain ".*" would let "**/test.js" match
          // "src/mytest.js".
          result += atSegmentStart ? "(?:.*/)?" : ".*";
        } else {
          result += ".*";
        }
      } else {
        result += "[^/]*";
      }
    } else if (c === "?") {
      result += "[^/]";
    } else if (/[.+^${}()|[\]\\]/.test(c)) {
      // Escape regex metacharacters so they match literally.
      result += `\\${c}`;
    } else {
      result += c;
    }
  }
  return result;
}
267
+
268
+ // src/scorers/regression-scorer.ts
269
/**
 * Scorer for SWE-bench style regression criteria.
 *
 * Runs `testCase.test_command` in the sandbox, parses the output as TAP and
 * checks that every `fail_to_pass` test passes and that no `pass_to_pass`
 * test regressed. A `pass_to_pass` test that was already non-passing in the
 * supplied baseline is not counted as a regression. If `forbid_modified` is
 * set and the sandbox diff touches a matching file, scoring fails immediately
 * (tamper guard). The check is skipped when there is no test command or no
 * criteria.
 */
var RegressionScorer = class {
  name = "RegressionScorer";
  /**
   * @param input `{ testCase, sandbox, baseline? }` where `baseline` maps test
   *   names to their status before the agent ran.
   * @returns `{ passed, score, detail }` with `score` in [0, 1].
   */
  async score(input) {
    const { testCase, sandbox, baseline } = input;
    const mustFlip = testCase.fail_to_pass ?? [];
    const mustHold = testCase.pass_to_pass ?? [];
    const hasCriteria = mustFlip.length > 0 || mustHold.length > 0;
    if (!testCase.test_command || !hasCriteria) {
      return {
        passed: true,
        score: 1,
        detail: "No fail_to_pass/pass_to_pass criteria configured; skipping regression check."
      };
    }

    // Tamper guard: refuse to score if protected files were edited.
    if (testCase.forbid_modified?.length) {
      const changedFiles = parseChangedFiles(await sandbox.gitDiff());
      const tampered = changedFiles.filter((file) => matchAnyGlob(testCase.forbid_modified, file));
      if (tampered.length > 0) {
        return {
          passed: false,
          score: 0,
          detail: `Tamper guard triggered: forbidden files were modified: ${tampered.join(", ")}`
        };
      }
    }

    // A failing execution is not fatal: its message is parsed like output,
    // which normally yields no statuses and therefore fails every check.
    let output = "";
    try {
      const res = await sandbox.exec(testCase.test_command);
      output = [res.stdout, res.stderr].join("\n");
    } catch (err) {
      output = err?.message ?? String(err);
    }
    const statusMap = new TapTestResultParser().parse(output);
    const describeStatus = (testName) => statusMap[testName] ?? "not found";
    const isPassing = (testName) => statusMap[testName] === "PASS";
    const wasAlreadyBroken = (testName) => Boolean(baseline?.[testName] && baseline[testName] !== "PASS");

    const flipFailures = mustFlip
      .filter((testName) => !isPassing(testName))
      .map((testName) => `FAIL_TO_PASS "${testName}" did not pass (status: ${describeStatus(testName)})`);
    const holdFailures = mustHold
      .filter((testName) => !isPassing(testName) && !wasAlreadyBroken(testName))
      .map((testName) => `PASS_TO_PASS "${testName}" regressed (status: ${describeStatus(testName)})`);
    const failures = [...flipFailures, ...holdFailures];

    const total = mustFlip.length + mustHold.length;
    const okCount = total - failures.length;
    if (failures.length === 0) {
      return {
        passed: true,
        score: 1,
        detail: `All ${total} FAIL_TO_PASS/PASS_TO_PASS checks passed.`
      };
    }
    return {
      passed: false,
      score: total > 0 ? okCount / total : 1,
      detail: `Regression check failed (${okCount}/${total} ok):\n${failures.join("\n")}`
    };
  }
};
338
/**
 * Collects the paths named in the `diff --git a/<old> b/<new>` headers of a
 * git diff. Both the old and the new path of each header are included, in
 * order of first appearance and without duplicates.
 *
 * @param {string} diff Git diff text.
 * @returns {string[]} Unique file paths.
 */
function parseChangedFiles(diff) {
  const seen = /* @__PURE__ */ new Set();
  const headers = diff.matchAll(/^diff --git a\/(.+?) b\/(.+)$/gm);
  for (const [, oldPath, newPath] of headers) {
    seen.add(oldPath);
    seen.add(newPath);
  }
  return [...seen];
}
349
+
350
+ // src/scorers/diff-scorer.ts
351
/**
 * Computes simple statistics for a git diff.
 *
 * Files are taken from `diff --git a/<old> b/<new>` headers (the new path is
 * recorded). Inside a hunk, the line counts announced by the `@@ -a,b +c,d @@`
 * header are used to tell real changes from file headers. This way an added or
 * removed line whose own content begins with "++" or "--" (shown as "+++..." or
 * "---...") is still counted. Outside hunks, `+++`/`---` lines are treated as
 * file headers and skipped.
 *
 * @param {string} diff Git diff text.
 * @returns {{ filesChanged: string[], insertions: number, deletions: number, linesChanged: number }}
 */
function parseDiffStats(diff) {
  const filesChanged = /* @__PURE__ */ new Set();
  const hunkHeaderRe = /^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@/;
  let insertions = 0;
  let deletions = 0;
  // Lines still expected on the old/new side of the current hunk.
  let oldLeft = 0;
  let newLeft = 0;
  for (const line of diff.split("\n")) {
    const fileMatch = line.match(/^diff --git a\/(.+?) b\/(.+)$/);
    if (fileMatch) {
      filesChanged.add(fileMatch[2]);
      oldLeft = 0;
      newLeft = 0;
      continue;
    }
    const hunk = hunkHeaderRe.exec(line);
    if (hunk) {
      // An omitted count means exactly one line.
      oldLeft = hunk[1] === void 0 ? 1 : Number(hunk[1]);
      newLeft = hunk[2] === void 0 ? 1 : Number(hunk[2]);
      continue;
    }
    if (oldLeft > 0 || newLeft > 0) {
      if (line.startsWith("+")) {
        insertions++;
        newLeft--;
      } else if (line.startsWith("-")) {
        deletions++;
        oldLeft--;
      } else if (!line.startsWith("\\")) {
        // Context line: present on both sides. "\ No newline..." markers
        // belong to neither side.
        oldLeft--;
        newLeft--;
      }
      continue;
    }
    if (line.startsWith("+++") || line.startsWith("---")) continue;
    if (line.startsWith("+")) insertions++;
    else if (line.startsWith("-")) deletions++;
  }
  return {
    filesChanged: Array.from(filesChanged),
    insertions,
    deletions,
    linesChanged: insertions + deletions
  };
}
372
/**
 * Scorer that inspects the size of the agent's final diff.
 *
 * Fails when the diff names no changed files. When `testCase.solution`
 * contains diff text (it includes "diff --git"), the agent's changed-line
 * count is compared with the gold diff: a ratio <= 1 scores 1, a larger ratio
 * scores `1 / ratio` with a floor of 0.1.
 * NOTE(review): the test-case schema describes `solution` as a path; this code
 * only compares when it holds diff content. Confirm where the file is loaded.
 */
var DiffScorer = class {
  name = "DiffScorer";
  /**
   * @param input `{ testCase, result }` where `result.finalDiff` is the agent's diff.
   * @returns `{ passed, score, detail }`.
   */
  async score(input) {
    const agentStats = parseDiffStats(input.result.finalDiff ?? "");
    const fileCount = agentStats.filesChanged.length;
    if (fileCount === 0) {
      return {
        passed: false,
        score: 0,
        detail: "Agent's diff is empty - no files were changed."
      };
    }
    const lines = [];
    lines.push(
      `Agent diff: ${fileCount} file(s), +${agentStats.insertions}/-${agentStats.deletions} lines.`
    );
    let score = 1;
    const gold = input.testCase.solution;
    if (gold?.includes("diff --git")) {
      const goldStats = parseDiffStats(gold);
      lines.push(
        `Gold diff: ${goldStats.filesChanged.length} file(s), +${goldStats.insertions}/-${goldStats.deletions} lines.`
      );
      // Guard against division by zero for an empty gold diff.
      const ratio = agentStats.linesChanged / Math.max(goldStats.linesChanged, 1);
      if (ratio > 1) {
        score = Math.max(0.1, 1 / ratio);
      } else {
        score = 1;
      }
      lines.push(
        `Scope ratio (agent/gold lines changed): ${ratio.toFixed(2)} -> score ${score.toFixed(2)}`
      );
    }
    return { passed: true, score, detail: lines.join("\n") };
  }
};
407
+
408
+ // src/scorers/localization-scorer.ts
409
/**
 * Scorer that measures how well the files the agent touched line up with the
 * `expected_files` glob patterns of the test case.
 *
 *  - precision: share of touched files matching any expected pattern
 *  - recall: share of expected patterns matched by at least one touched file
 *  - score: F1 of the two; the check passes when F1 > 0
 *
 * Skipped (passes with score 1) when no expected files are configured.
 */
var LocalizationScorer = class {
  name = "LocalizationScorer";
  /**
   * @param input `{ testCase, result }` where `result.finalDiff` is the agent's diff.
   * @returns `{ passed, score, detail }`.
   */
  async score(input) {
    const expectedFiles = input.testCase.expected_files;
    if (!expectedFiles || expectedFiles.length === 0) {
      return {
        passed: true,
        score: 1,
        detail: "No expected_files configured; skipping localization check."
      };
    }
    const { filesChanged: touched } = parseDiffStats(input.result.finalDiff ?? "");
    if (touched.length === 0) {
      return {
        passed: false,
        score: 0,
        detail: "Agent did not modify any files."
      };
    }
    const hits = touched.filter((file) => matchAnyGlob(expectedFiles, file));
    const coveredPatterns = expectedFiles.filter(
      (pattern) => touched.some((file) => matchAnyGlob([pattern], file))
    );
    const precision = hits.length / touched.length;
    const recall = coveredPatterns.length / expectedFiles.length;
    let f1 = 0;
    if (precision + recall !== 0) {
      f1 = 2 * precision * recall / (precision + recall);
    }
    const report = [
      `Localization: precision=${precision.toFixed(2)}, recall=${recall.toFixed(2)}, f1=${f1.toFixed(2)}`,
      `Touched files: ${touched.join(", ")}`,
      `Expected patterns: ${expectedFiles.join(", ")}`
    ];
    return { passed: f1 > 0, score: f1, detail: report.join("\n") };
  }
};
447
/** Entry names skipped while walking a fixture directory. */
var IGNORED_DIRS = /* @__PURE__ */ new Set(["node_modules", ".git", "dist", ".turbo"]);

/**
 * Computes a SHA-256 fingerprint of a fixture directory.
 *
 * Every file found by `listFilesRecursive` contributes its "/"-separated path
 * relative to the fixture followed by its contents, in sorted path order. If
 * the directory cannot be listed, the fingerprint is the hash of the
 * directory path string itself.
 *
 * @param {string} fixtureDir Fixture directory.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function hashFixture(fixtureDir) {
  const digest = createHash("sha256");
  let sortedFiles;
  try {
    sortedFiles = listFilesRecursive(fixtureDir).sort();
  } catch {
    return digest.update(fixtureDir).digest("hex");
  }
  sortedFiles.forEach((file) => {
    const portablePath = relative(fixtureDir, file).split(sep).join("/");
    digest.update(portablePath);
    digest.update(readFileSync(file));
  });
  return digest.digest("hex");
}

/**
 * Lists all regular files below `dir`, depth-first in directory-listing
 * order. Entries named in `IGNORED_DIRS` are skipped at every level. Symbolic
 * links are included only when they resolve to a regular file; entries that
 * cannot be stat'ed are left out.
 *
 * @param {string} dir Directory to walk.
 * @returns {string[]} File paths (each is `dir` joined with the relative path).
 */
function listFilesRecursive(dir) {
  return readdirSync(dir, { withFileTypes: true })
    .filter((entry) => !IGNORED_DIRS.has(entry.name))
    .flatMap((entry) => {
      const full = join(dir, entry.name);
      if (entry.isDirectory()) return listFilesRecursive(full);
      if (!entry.isFile() && !entry.isSymbolicLink()) return [];
      try {
        return statSync(full).isFile() ? [full] : [];
      } catch {
        // Unreadable entry or dangling symlink: leave it out.
        return [];
      }
    });
}
480
+
481
+ // src/runner/baseline.ts
482
/**
 * Returns the baseline test statuses of a test case's untouched fixture.
 *
 * When a database is supplied, a cached baseline for the same test case,
 * fixture hash and test command is reused. Otherwise the test command is run
 * in a fresh sandbox created from the fixture, its TAP output is parsed, and
 * the result is stored in the cache. Cache read/write failures are logged and
 * otherwise ignored.
 *
 * @param input `{ testCase, sandboxProvider, db? }`.
 * @returns `{ fixtureHash, statusMap, cached }`, or `undefined` when the test
 *   case has no `test_command`.
 */
async function getOrComputeBaseline(input) {
  const { testCase, sandboxProvider, db } = input;
  const command = testCase.test_command;
  if (!command) return void 0;
  const testCaseId = testCase.id || testCase.name;
  const fixtureHash = hashFixture(testCase.fixture);

  // Look for a reusable cache entry; any failure means "no entry".
  const readCache = async () => {
    if (!db) return void 0;
    try {
      const row = await getCachedBaseline(db, testCaseId, fixtureHash);
      if (!row || row.testCommand !== command) return void 0;
      return { fixtureHash, statusMap: JSON.parse(row.statusMap), cached: true };
    } catch (err) {
      console.error(`Failed to read cached baseline: ${err.message}`);
      return void 0;
    }
  };
  const cachedBaseline = await readCache();
  if (cachedBaseline) return cachedBaseline;

  const sandbox = await sandboxProvider.create({ gitSnapshot: testCase.fixture });
  let statusMap = {};
  try {
    const { stdout, stderr } = await sandbox.exec(command);
    statusMap = new TapTestResultParser().parse([stdout, stderr].join("\n"));
  } finally {
    // The throwaway sandbox is always released, even if the command failed.
    await sandbox.destroy();
  }

  if (db) {
    const record = {
      id: `${testCaseId}:${fixtureHash}`,
      testCaseId,
      fixtureHash,
      testCommand: command,
      statusMap: JSON.stringify(statusMap),
      createdAt: Math.floor(Date.now() / 1e3)
    };
    try {
      await saveCachedBaseline(db, record);
    } catch (err) {
      console.error(`Failed to persist baseline cache: ${err.message}`);
    }
  }
  return { fixtureHash, statusMap, cached: false };
}
526
/**
 * Matches a document that starts with a `---` delimited frontmatter block.
 * Group 1 is the frontmatter text, group 2 the remaining body.
 */
var FRONTMATTER_RE = /^---\r?\n([\s\S]*?)\r?\n---\r?\n?([\s\S]*)$/;

/**
 * Parses the text of a SKILL.md file into validated frontmatter and body.
 *
 * @param {string} content Full file contents.
 * @param {string} path Path of the file (used in error messages and returned as-is).
 * @param {string} dir Directory of the skill (returned as-is).
 * @returns {{ frontmatter: object, body: string, path: string, dir: string }}
 *   `body` is trimmed.
 * @throws {Error} When the frontmatter block is missing, is not valid YAML,
 *   or does not satisfy `SkillFrontmatterSchema`. The message names `path`
 *   and the original error is attached as `cause`.
 */
function parseSkillMarkdown(content, path, dir) {
  const match = content.match(FRONTMATTER_RE);
  if (!match) {
    throw new Error(`SKILL.md at "${path}" is missing a YAML frontmatter block ("---" ... "---")`);
  }
  const [, frontmatterYaml, body] = match;
  let frontmatter;
  try {
    // YAML parsing sits inside the try so that syntax errors are reported
    // with the offending file path, like schema violations.
    frontmatter = SkillFrontmatterSchema.parse(parse(frontmatterYaml));
  } catch (err) {
    throw new Error(`Invalid SKILL.md frontmatter in "${path}": ${err.message}`, { cause: err });
  }
  return { frontmatter, body: body.trim(), path, dir };
}
542
/**
 * Finds and parses the skills bundled with a toolkit directory.
 *
 * Looks at every sub-directory of `<toolkitDir>/.claude/skills` and parses
 * its `SKILL.md` when present. Directories without a `SKILL.md` and
 * non-directory entries are skipped.
 *
 * @param {string} toolkitDir Toolkit root directory.
 * @returns {Array} Parsed skills (see `parseSkillMarkdown`); empty when the
 *   skills directory does not exist.
 */
function discoverSkills(toolkitDir) {
  const skillsRoot = resolve(toolkitDir, ".claude", "skills");
  if (!existsSync(skillsRoot)) return [];
  const found = [];
  readdirSync(skillsRoot).forEach((entryName) => {
    const skillDir = join(skillsRoot, entryName);
    if (!statSync(skillDir).isDirectory()) return;
    const manifestPath = join(skillDir, "SKILL.md");
    if (!existsSync(manifestPath)) return;
    const manifest = readFileSync(manifestPath, "utf-8");
    found.push(parseSkillMarkdown(manifest, manifestPath, skillDir));
  });
  return found;
}
556
/**
 * Discovers the skills of several toolkit directories.
 *
 * @param {string[]} toolkitDirs Toolkit root directories.
 * @returns {Array} All discovered skills, in toolkit order.
 */
function discoverSkillsForToolkits(toolkitDirs) {
  const allSkills = [];
  for (const toolkitDir of toolkitDirs) {
    allSkills.push(...discoverSkills(toolkitDir));
  }
  return allSkills;
}
559
/**
 * Builds the system-prompt section that advertises the available skills.
 *
 * Each skill is listed with its name, its description and the sandbox path
 * `/app/.claude/skills/<name>/SKILL.md` of its instructions.
 *
 * @param {Array<{ frontmatter: { name: string, description: string } }>} skills
 * @returns {string} Markdown text, or "" when there are no skills.
 */
function buildSkillsPromptAddendum(skills) {
  if (skills.length === 0) return "";
  const bullets = [];
  for (const { frontmatter } of skills) {
    const { name, description } = frontmatter;
    const sandboxPath = `/app/.claude/skills/${name}/SKILL.md`;
    bullets.push(
      `- **${name}**: ${description}\nRead \`${sandboxPath}\` (e.g. via your readFile tool) for full instructions before using this skill.`
    );
  }
  const heading = "## Available skills";
  const intro = 'The sandbox is preloaded with additional tools and documented "skills". Each skill below is available in the sandbox; read its SKILL.md for full usage instructions before relying on it.';
  return [heading, "", intro, "", bullets.join("\n")].join("\n");
}
575
+
576
+ // src/runner/run-single.ts
577
/**
 * Runs one (test case, agent config) pair end-to-end and returns its result.
 *
 * Phases, executed as a four-step workflow:
 *  1. setupSandbox: create a sandbox from the fixture (with image/toolkits)
 *  2. solve: let the adapter work on the prompt, capture the agent diff,
 *     then apply `test_patch` if configured
 *  3. score: command -> assertion -> regression scorers decide pass/fail;
 *     diff and localization scorers only add metrics
 *  4. cleanup: capture the final diff and destroy the sandbox
 *
 * When `db` is given, the run row is created up front, trace steps are
 * persisted as they arrive, and the row is updated at the end.
 *
 * @param input `{ testCase, agentConfig, adapter, sandboxProvider, db?, runId }`.
 * @returns `{ runId, passed, score, stepsCount, tokensIn, tokensOut, costUsd,
 *   durationMs, error, finalDiff, metrics }`. `score` is 100 on success, else 0.
 */
async function runSingle(input) {
  const { testCase, agentConfig, adapter, sandboxProvider, db, runId } = input;
  const startTime = Date.now();
  let sandbox = null;
  let passed = false;
  let score = 0;
  let stepsCount = 0;
  let tokensIn = 0;
  let tokensOut = 0;
  let costUsd = 0;
  let errorMsg;
  let finalDiff = "";
  let agentDiff = "";
  let agentResult;
  const metrics = {};
  const emittedSteps = [];

  // Best-effort teardown used whenever the cleanup step did not get to run.
  const destroyLeftoverSandbox = async () => {
    if (!sandbox) return;
    const leftover = sandbox;
    sandbox = null;
    try {
      await leftover.destroy();
    } catch (e) {
      console.error(`Failed to clean up sandbox: ${e.message}`);
    }
  };

  if (db) {
    await createRun(db, {
      id: runId,
      testCaseId: testCase.id || testCase.name,
      agentConfigId: agentConfig.id || agentConfig.name,
      sandboxProvider: sandboxProvider.name,
      status: "running",
      createdAt: Math.floor(startTime / 1e3)
    });
  }

  // The baseline is optional: without it, regressions are judged strictly.
  let baseline;
  try {
    baseline = await getOrComputeBaseline({ testCase, sandboxProvider, db });
  } catch (err) {
    console.error(`Failed to compute baseline: ${err.message}`);
  }

  // Agent-level and test-case-level toolkits, de-duplicated.
  const toolkits = Array.from(
    /* @__PURE__ */ new Set([...agentConfig.toolkits ?? [], ...testCase.toolkits ?? []])
  );

  const setupSandboxStep = createStep({
    id: "setupSandbox",
    inputSchema: z.any(),
    outputSchema: z.object({}),
    execute: async () => {
      sandbox = await sandboxProvider.create({
        image: testCase.image,
        gitSnapshot: testCase.fixture,
        toolkits
      });
      return {};
    }
  });

  const solveStep = createStep({
    id: "solve",
    inputSchema: z.any(),
    outputSchema: z.object({}),
    execute: async () => {
      if (!sandbox) throw new Error("Sandbox not initialized");
      const onStepCallback = (stepEvent) => {
        emittedSteps.push(stepEvent);
        stepsCount++;
        tokensIn += stepEvent.tokensIn || 0;
        tokensOut += stepEvent.tokensOut || 0;
        costUsd += stepEvent.costUsd || 0;
        if (db) {
          // Fire-and-forget: trace persistence must not block the agent.
          addTrace(db, {
            runId,
            stepIndex: stepEvent.index,
            kind: stepEvent.kind,
            tool: stepEvent.tool,
            tokensIn: stepEvent.tokensIn,
            tokensOut: stepEvent.tokensOut,
            costUsd: stepEvent.costUsd,
            timestamp: stepEvent.timestamp,
            content: stepEvent.content
          }).catch((err) => {
            console.error(`Failed to persist trace step: ${err.message}`);
          });
        }
      };
      let effectiveConfig = agentConfig;
      if (toolkits.length > 0) {
        try {
          const skills = discoverSkillsForToolkits(toolkits);
          const addendum = buildSkillsPromptAddendum(skills);
          if (addendum) {
            effectiveConfig = {
              ...agentConfig,
              system_prompt: agentConfig.system_prompt ? `${agentConfig.system_prompt}\n\n${addendum}` : addendum
            };
          }
        } catch (e) {
          console.error(`Failed to build skills prompt addendum: ${e.message}`);
        }
      }
      const result = await adapter.solve({
        prompt: testCase.prompt,
        sandbox,
        config: effectiveConfig,
        onStep: onStepCallback
      });
      // Capture the diff before the test patch is applied so that it
      // reflects only the agent's own changes.
      try {
        agentDiff = await sandbox.gitDiff();
      } catch (e) {
        console.error(`Failed to capture agent diff: ${e.message}`);
      }
      agentResult = { ...result, finalDiff: agentDiff || result.finalDiff };
      if (testCase.test_patch) {
        try {
          const patchResult = await sandbox.applyPatch(testCase.test_patch);
          metrics.testPatchApply = patchResult;
        } catch (e) {
          metrics.testPatchApply = { applied: false, repaired: false, output: e.message };
        }
      }
      return { result };
    }
  });

  const scoreStep = createStep({
    id: "score",
    inputSchema: z.any(),
    outputSchema: z.object({
      passed: z.boolean(),
      detail: z.string(),
      score: z.number()
    }),
    execute: async () => {
      if (!sandbox) throw new Error("Sandbox not initialized");
      const cmdScorer = new CommandScorer();
      const cmdResult = await cmdScorer.score({
        testCase,
        sandbox
      });
      metrics.command = { passed: cmdResult.passed, detail: cmdResult.detail };
      if (!cmdResult.passed) {
        return { passed: false, detail: cmdResult.detail, score: 0 };
      }
      const assertScorer = new AssertionScorer();
      const trace = { runId, steps: emittedSteps };
      const assertResult = await assertScorer.score({
        testCase,
        trace
      });
      metrics.assertion = { passed: assertResult.passed, detail: assertResult.detail };
      if (!assertResult.passed) {
        return { passed: false, detail: assertResult.detail, score: 0 };
      }
      let overallPassed = true;
      let overallDetail = "All tests passed";
      const regressionScorer = new RegressionScorer();
      const regressionResult = await regressionScorer.score({
        testCase,
        sandbox,
        baseline: baseline?.statusMap
      });
      metrics.regression = {
        passed: regressionResult.passed,
        score: regressionResult.score,
        detail: regressionResult.detail
      };
      if (!regressionResult.passed) {
        overallPassed = false;
        overallDetail = regressionResult.detail;
      }
      // Diff and localization are informational only; they never change pass/fail.
      if (agentResult) {
        const diffScorer = new DiffScorer();
        const diffResult = await diffScorer.score({ testCase, result: agentResult });
        metrics.diff = { score: diffResult.score, detail: diffResult.detail };
        const localizationScorer = new LocalizationScorer();
        const localizationResult = await localizationScorer.score({
          testCase,
          result: agentResult
        });
        metrics.localization = {
          score: localizationResult.score,
          detail: localizationResult.detail
        };
      }
      if (baseline) {
        metrics.baseline = { cached: baseline.cached, fixtureHash: baseline.fixtureHash };
      }
      return {
        passed: overallPassed,
        detail: overallDetail,
        score: overallPassed ? 100 : 0
      };
    }
  });

  const cleanupStep = createStep({
    id: "cleanup",
    inputSchema: z.any(),
    outputSchema: z.object({}),
    execute: async () => {
      if (sandbox) {
        try {
          finalDiff = agentDiff || await sandbox.gitDiff();
          await sandbox.destroy();
        } catch (e) {
          console.error(`Failed to clean up sandbox: ${e.message}`);
        }
        sandbox = null;
      }
      return {};
    }
  });

  const workflow = createWorkflow({
    id: `run-single-${runId}`,
    inputSchema: z.any(),
    outputSchema: z.any()
  }).then(setupSandboxStep).then(solveStep).then(scoreStep).then(cleanupStep).commit();

  try {
    const runState = {};
    const run = await workflow.createRun();
    const res = await run.start({
      inputData: {},
      initialState: runState
    });
    // NOTE(review): `res.results` is what this code originally read; the
    // `res.steps.<id>.output` fallback presumably covers workflow results that
    // expose per-step output that way. Confirm against the Mastra version in use.
    const scoreResults = res.results?.score ?? res.steps?.score?.output;
    passed = scoreResults?.passed ?? false;
    score = scoreResults?.score ?? 0;
    errorMsg = scoreResults?.passed ? void 0 : scoreResults?.detail;
    if (!scoreResults && res.status === "failed") {
      // The workflow reported a failed step instead of throwing: record
      // the reason so the run is not stored as "completed".
      errorMsg = res.error?.message ?? (res.error ? String(res.error) : "Workflow failed before scoring");
    }
  } catch (err) {
    errorMsg = err.message || "Unknown execution error";
    passed = false;
  } finally {
    // If the cleanup step never ran (a step failed or threw), the sandbox
    // is still alive here and would leak otherwise.
    await destroyLeftoverSandbox();
  }
  // When cleanup did not run, fall back to the diff captured after solving.
  finalDiff = finalDiff || agentDiff;

  const durationMs = Date.now() - startTime;
  if (db) {
    await updateRun(db, runId, {
      status: errorMsg ? "failed" : "completed",
      passed,
      score,
      stepsCount,
      tokensIn,
      tokensOut,
      costUsd,
      durationMs,
      error: errorMsg,
      finalDiff,
      metrics: Object.keys(metrics).length > 0 ? JSON.stringify(metrics) : void 0,
      completedAt: Math.floor(Date.now() / 1e3)
    });
  }
  return {
    runId,
    passed,
    score,
    stepsCount,
    tokensIn,
    tokensOut,
    costUsd,
    durationMs,
    error: errorMsg,
    finalDiff,
    metrics
  };
}
838
/**
 * Runs every test case against every agent configuration.
 *
 * The (test case x agent config) combinations are generated by a first
 * workflow step and executed by `runSingle` in a `foreach` step with the given
 * concurrency. `onRunUpdate`, when provided, is called once when a run starts
 * and once when it finishes (completed or failed).
 *
 * @param input `{ testCases, agentConfigs, adapter, sandboxProvider, db?,
 *   concurrency = 2, onRunUpdate? }`.
 * @returns `{ runs }`, the per-combination results (empty array when the
 *   workflow result holds no list).
 */
async function runBenchmark(input) {
  const { testCases, agentConfigs, adapter, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;

  const generateCombinationsStep = createStep({
    id: "generateCombinations",
    inputSchema: z.any(),
    outputSchema: z.array(z.any()),
    execute: async ({ getInitData }) => {
      const initData = getInitData();
      const combinations = [];
      for (const tc of initData.testCases) {
        for (const config of initData.agentConfigs) {
          combinations.push({
            testCase: tc,
            agentConfig: config
          });
        }
      }
      return combinations;
    }
  });

  const executeSingleRunStep = createStep({
    id: "executeSingleRun",
    inputSchema: z.any(),
    outputSchema: z.any(),
    execute: async ({ inputData, requestContext }) => {
      const { testCase, agentConfig } = inputData;
      const ctx = requestContext?.context || requestContext;
      // The context may arrive as a Map, a plain object, or any object with get().
      const getVal = (key) => {
        if (ctx instanceof Map) return ctx.get(key);
        if (ctx && typeof ctx === "object" && key in ctx) return ctx[key];
        if (typeof ctx?.get === "function") return ctx.get(key);
        return void 0;
      };
      // Fall back to the values captured from `input` so that a context
      // that did not round-trip cannot leave the run without its dependencies.
      const adapter2 = getVal("adapter") ?? adapter;
      const sandboxProvider2 = getVal("sandboxProvider") ?? sandboxProvider;
      const db2 = getVal("db") ?? db;
      const onRunUpdate2 = getVal("onRunUpdate") ?? onRunUpdate;
      const runId = randomUUID();
      const identity = {
        testCaseId: testCase.id || testCase.name,
        agentConfigId: agentConfig.id || agentConfig.name
      };
      if (onRunUpdate2) {
        onRunUpdate2({
          runId,
          ...identity,
          status: "running",
          passed: false,
          stepsCount: 0,
          tokensIn: 0,
          tokensOut: 0,
          costUsd: 0,
          durationMs: 0
        });
      }
      try {
        const res2 = await runSingle({
          testCase,
          agentConfig,
          adapter: adapter2,
          sandboxProvider: sandboxProvider2,
          db: db2,
          runId
        });
        if (onRunUpdate2) {
          onRunUpdate2({
            ...res2,
            ...identity,
            status: res2.error ? "failed" : "completed"
          });
        }
        return res2;
      } catch (err) {
        // A crashing run must not abort the whole benchmark: report it as
        // a failed result instead.
        const failedResult = {
          runId,
          passed: false,
          score: 0,
          stepsCount: 0,
          tokensIn: 0,
          tokensOut: 0,
          costUsd: 0,
          durationMs: 0,
          error: err.message || "Failed during execution"
        };
        if (onRunUpdate2) {
          onRunUpdate2({
            ...failedResult,
            ...identity,
            status: "failed"
          });
        }
        return failedResult;
      }
    }
  });

  const workflow = createWorkflow({
    id: `benchmark-orchestrator-${randomUUID()}`,
    inputSchema: z.any(),
    outputSchema: z.any()
  }).then(generateCombinationsStep).foreach(executeSingleRunStep, { concurrency }).commit();

  const runState = {};
  const executionContext = /* @__PURE__ */ new Map([
    ["adapter", adapter],
    ["sandboxProvider", sandboxProvider],
    ["db", db],
    ["onRunUpdate", onRunUpdate]
  ]);
  const run = await workflow.createRun();
  const res = await run.start({
    inputData: { testCases, agentConfigs },
    initialState: runState,
    requestContext: executionContext
  });
  // NOTE(review): the first key is what this code originally read. The other
  // two are presumably where a workflow result exposes the foreach output
  // (per-step output / final result). Confirm against the Mastra version in use.
  const rawRuns = res.results?.executeSingleRunStepResult ?? res.steps?.executeSingleRun?.output ?? res.result ?? [];
  return {
    runs: Array.isArray(rawRuns) ? rawRuns : []
  };
}
954
+
955
+ // src/runner/validate-test-case.ts
956
/**
 * Validate a test-case definition.
 *
 * Static field checks always run. When `testCase.test_command` is set, a
 * sandbox is created from `testCase.fixture` and the following happens in
 * order:
 *   1. `test_patch` (if present) is applied; a failed application ends
 *      validation immediately.
 *   2. The test command runs and its combined stdout/stderr is parsed with
 *      `TapTestResultParser`; `fail_to_pass` names are expected to be "FAIL"
 *      and `pass_to_pass` names "PASS".
 *   3. `solution` (if present) is applied; when it applies, the test command
 *      runs again and both name lists are expected to be "PASS".
 * The sandbox is destroyed in a `finally` once it has been created.
 *
 * @param {{ testCase: object, sandboxProvider: object }} input
 * @returns {Promise<object>} Whatever `finalize` returns for the collected checks.
 */
async function validateTestCase(input) {
  const { testCase, sandboxProvider } = input;
  const results = [...checkStaticFields(testCase)];

  if (!testCase.test_command) {
    results.push({
      name: "execution-checks",
      passed: true,
      detail: "No test_command configured; skipping pre/post-patch execution checks."
    });
    return finalize(results);
  }

  // Turn the outcome of `sandbox.applyPatch` into a check entry.
  const describePatch = (name, label, outcome) => ({
    name,
    passed: outcome.applied,
    detail: outcome.applied
      ? `Applied${outcome.repaired ? " (required repair/fallback)" : ""}.`
      : `Failed to apply ${label}:\n${outcome.output}`
  });

  // Run the configured test command and parse stdout + stderr into a status map.
  const collectStatuses = async (box) => {
    const execution = await box.exec(testCase.test_command);
    return new TapTestResultParser().parse(`${execution.stdout}\n${execution.stderr}`);
  };

  const sandbox = await sandboxProvider.create({ gitSnapshot: testCase.fixture });
  try {
    if (testCase.test_patch) {
      const testPatchOutcome = await sandbox.applyPatch(testCase.test_patch);
      results.push(describePatch("test_patch applies cleanly", "test_patch", testPatchOutcome));
      if (!testPatchOutcome.applied) {
        return finalize(results);
      }
    }

    const before = await collectStatuses(sandbox);
    results.push(
      checkStatusExpectation(
        "pre-patch: fail_to_pass tests are currently failing",
        testCase.fail_to_pass ?? [],
        before,
        "FAIL"
      )
    );
    results.push(
      checkStatusExpectation(
        "pre-patch: pass_to_pass tests are currently passing",
        testCase.pass_to_pass ?? [],
        before,
        "PASS"
      )
    );

    if (testCase.solution) {
      const solutionOutcome = await sandbox.applyPatch(testCase.solution);
      results.push(describePatch("solution patch applies cleanly", "solution patch", solutionOutcome));

      // Post-patch expectations only make sense when the solution actually applied.
      if (solutionOutcome.applied) {
        const after = await collectStatuses(sandbox);
        results.push(
          checkStatusExpectation(
            "post-patch: fail_to_pass tests now pass",
            testCase.fail_to_pass ?? [],
            after,
            "PASS"
          )
        );
        results.push(
          checkStatusExpectation(
            "post-patch: pass_to_pass tests still pass",
            testCase.pass_to_pass ?? [],
            after,
            "PASS"
          )
        );
      }
    }
  } finally {
    await sandbox.destroy();
  }
  return finalize(results);
}
1034
/**
 * Run the checks that need no sandbox: they only inspect fields of the
 * test-case object.
 *
 * Checks produced (each `{ name, passed, detail }`):
 *   - `name` and `prompt` are both truthy (always emitted);
 *   - when `fail_to_pass` or `pass_to_pass` is non-empty: a `test_command`
 *     is configured, and the two lists share no test name;
 *   - when `forbid_modified` / `expected_files` is non-empty: every pattern
 *     is accepted by `isLikelyValidGlob`;
 *   - when `created_at` is truthy: it parses to a valid `Date`.
 *
 * @param {object} testCase Test-case definition to inspect.
 * @returns {Array<{ name: string, passed: boolean, detail: string }>}
 */
function checkStaticFields(testCase) {
  const checks = [];

  const hasNameAndPrompt = Boolean(testCase.name && testCase.prompt);
  checks.push({
    name: "has name and prompt",
    passed: hasNameAndPrompt,
    detail: hasNameAndPrompt ? "ok" : "Test case is missing `name` and/or `prompt`."
  });

  const failToPass = testCase.fail_to_pass ?? [];
  const passToPass = testCase.pass_to_pass ?? [];
  if (failToPass.length > 0 || passToPass.length > 0) {
    const hasTestCommand = Boolean(testCase.test_command);
    checks.push({
      name: "test_command configured for fail_to_pass/pass_to_pass",
      passed: hasTestCommand,
      detail: hasTestCommand ? "ok" : "fail_to_pass/pass_to_pass are set but no test_command was provided."
    });

    // Set membership avoids rescanning pass_to_pass for every fail_to_pass
    // entry; de-duplicating fail_to_pass first reports each overlapping
    // name once even when it is listed several times.
    const passToPassNames = new Set(passToPass);
    const overlap = [...new Set(failToPass)].filter((n) => passToPassNames.has(n));
    checks.push({
      name: "fail_to_pass and pass_to_pass do not overlap",
      passed: overlap.length === 0,
      detail: overlap.length === 0 ? "ok" : `Tests listed in both sets: ${overlap.join(", ")}`
    });
  }

  // Both glob-list fields get the same treatment, in this order.
  const globFields = [
    ["forbid_modified", testCase.forbid_modified],
    ["expected_files", testCase.expected_files]
  ];
  for (const [field, patterns] of globFields) {
    if (!patterns?.length) continue;
    const invalid = patterns.filter((p) => !isLikelyValidGlob(p));
    checks.push({
      name: `${field} patterns look valid`,
      passed: invalid.length === 0,
      detail: invalid.length === 0 ? "ok" : `Suspicious glob pattern(s): ${invalid.join(", ")}`
    });
  }

  if (testCase.created_at) {
    const date = new Date(testCase.created_at);
    const parseable = !Number.isNaN(date.getTime());
    checks.push({
      name: "created_at is a parseable date (contamination/date-cutoff check)",
      passed: parseable,
      // toISOString() throws on an invalid Date, so only call it when parseable.
      detail: parseable ? `ok (${date.toISOString()})` : `Could not parse created_at: "${testCase.created_at}"`
    });
  }
  return checks;
}
1082
/**
 * Heuristic check that a value is usable as a glob pattern.
 *
 * A pattern is accepted when it is a non-blank string and `matchGlob` can
 * evaluate it against a dummy path without throwing. The match result itself
 * is ignored; only "did it throw" matters.
 *
 * @param {unknown} pattern Candidate glob pattern.
 * @returns {boolean} `true` when the pattern looks usable, `false` otherwise.
 */
function isLikelyValidGlob(pattern) {
  // Non-string values (numbers, arrays, objects) are reported as invalid
  // instead of crashing the validator on `.trim()`.
  if (typeof pattern !== "string" || pattern.trim().length === 0) return false;
  try {
    matchGlob(pattern, "x");
    return true;
  } catch {
    // Deliberate: any failure while evaluating the pattern means "not valid".
    return false;
  }
}
1091
/**
 * Build one check entry asserting that every named test has the expected
 * status in a parsed status map.
 *
 * @param {string} name Label for the resulting check.
 * @param {string[]} testNames Test names to look up.
 * @param {Object<string, string>} statusMap Status keyed by test name.
 * @param {string} expected Status every listed test must have (callers in
 *   this file pass "PASS" or "FAIL").
 * @returns {{ name: string, passed: boolean, detail: string }} Passes
 *   trivially when `testNames` is empty; otherwise `detail` lists every
 *   missing or mismatching test, joined with "; ".
 */
function checkStatusExpectation(name, testNames, statusMap, expected) {
  if (testNames.length === 0) {
    return { name, passed: true, detail: "No tests configured for this check." };
  }
  const problems = [];
  for (const testName of testNames) {
    // Own-property lookup only: a test called "toString" or "constructor"
    // must not resolve to a member inherited from Object.prototype.
    const actual = Object.prototype.hasOwnProperty.call(statusMap, testName)
      ? statusMap[testName]
      : void 0;
    if (actual === void 0) {
      problems.push(`"${testName}" not found in test output`);
    } else if (actual !== expected) {
      problems.push(`"${testName}" is ${actual}, expected ${expected}`);
    }
  }
  return {
    name,
    passed: problems.length === 0,
    detail: problems.length === 0 ? `ok (${testNames.length} test(s))` : problems.join("; ")
  };
}
1110
/**
 * Wrap a list of checks into the validation result.
 *
 * @param {Array<{ passed: unknown }>} checks Collected check entries.
 * @returns {{ ok: boolean, checks: Array }} `ok` is true when no entry has a
 *   falsy `passed`; `checks` is the same array that was passed in.
 */
function finalize(checks) {
  const hasFailure = checks.some((entry) => !entry.passed);
  return { ok: !hasFailure, checks };
}
1113
+
1114
+ export { AgentConfigSchema, AssertionScorer, CommandScorer, DiffScorer, LocalizationScorer, McpServerConfigSchema, RegressionScorer, RunSchema, SkillFrontmatterSchema, StepEventSchema, SuccessCriterionSchema, TapTestResultParser, TestCaseSchema, TraceSchema, buildSkillsPromptAddendum, discoverSkills, discoverSkillsForToolkits, getOrComputeBaseline, hashFixture, matchAnyGlob, matchGlob, parseDiffStats, parseSkillMarkdown, runBenchmark, runSingle, validateTestCase };