agentgrader 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +161 -1
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -1,10 +1,10 @@
1
1
  #!/usr/bin/env node
2
2
  import 'dotenv/config';
3
3
  import { cac } from 'cac';
4
+ import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
4
5
  import { randomUUID } from 'crypto';
5
6
  import { resolve, dirname, isAbsolute, basename } from 'path';
6
7
  import { render, Box, Text } from 'ink';
7
- import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
8
8
  import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
9
9
  import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
10
10
  import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
@@ -16,6 +16,155 @@ import { parse, stringify } from 'yaml';
16
16
  import { z, ZodError } from 'zod';
17
17
  import { execFileSync } from 'child_process';
18
18
 
19
// Longest step-content preview, in string length units, shown before truncation.
const CONTENT_PREVIEW_MAX = 200;

// ANSI escape sequences used to colour terminal output; `reset` ends a colour run.
const ANSI = {
  reset: "\x1B[0m",
  gray: "\x1B[90m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  blue: "\x1B[34m",
};
26
/**
 * Colours a piece of text for terminal output.
 *
 * @param {string} text - Text to print.
 * @param {string} code - Escape sequence placed in front of the text.
 * @returns {string} `code + text + ANSI.reset` when `process.stdout.isTTY`
 *   is truthy; otherwise `text` unchanged, so piped output stays free of
 *   escape sequences.
 */
function paint(text, code) {
  return process.stdout.isTTY ? `${code}${text}${ANSI.reset}` : text;
}
30
/**
 * Shortens step content to a preview of at most CONTENT_PREVIEW_MAX UTF-16
 * code units, followed by "...".
 *
 * @param {string} content - Text to shorten.
 * @param {boolean} full - When truthy, the content is returned untouched.
 * @returns {string} The original content if `full` is set or it already fits;
 *   otherwise the leading part of the content with "..." appended.
 */
function truncateContent(content, full) {
  if (full || content.length <= CONTENT_PREVIEW_MAX) return content;
  let end = CONTENT_PREVIEW_MAX;
  // If the cut would land right after a high surrogate, the matching low
  // surrogate is dropped and a lone half of the character would be printed.
  // Back off by one unit so the preview never ends in a broken character.
  const lastUnit = content.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  return `${content.slice(0, end)}...`;
}
34
/**
 * Prepares step content for comparison.
 *
 * @param {string | null | undefined} content - Raw step content.
 * @returns {string} The content with surrounding whitespace trimmed; `null`
 *   and `undefined` are treated as the empty string.
 */
function normalizeContent(content) {
  const text = content == null ? "" : content;
  return text.trim();
}
37
/**
 * Renders one trace step as a single line of text.
 *
 * @param {{kind: string, tool?: string, content?: string} | null | undefined} step
 *   Step to describe; a missing step yields the placeholder "(no step)".
 * @param {boolean} full - Passed through to truncateContent; when truthy the
 *   content is not shortened.
 * @returns {string} "kind" or "kind:tool", followed by a space and the
 *   flattened content preview when the step has content.
 */
function formatStepSummary(step, full) {
  if (!step) return "(no step)";
  const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
  if (!step.content) return label;
  // Flatten every line-break style, not only "\n": a stray "\r" left in the
  // preview would move the terminal cursor back and overwrite the line.
  const singleLine = step.content.replace(/\r\n|\r|\n/g, " ");
  return `${label} ${truncateContent(singleLine, full)}`;
}
44
/**
 * Indexes trace steps by their `stepIndex`.
 *
 * @param {Iterable<{stepIndex: number}>} traces - Steps of one run.
 * @returns {Map<number, object>} Map from `stepIndex` to step; when several
 *   steps share an index, the last one in iteration order is kept.
 */
function stepsByIndex(traces) {
  const entries = Array.from(traces, (step) => [step.stepIndex, step]);
  return new Map(entries);
}
51
/**
 * Decides whether two steps at the same position differ.
 *
 * @param {object | undefined} a - Step from the first run.
 * @param {object | undefined} b - Step from the second run.
 * @returns {boolean} `true` when either step is missing, or when the steps
 *   differ in `kind`, in `tool` (a missing tool counts as ""), or in their
 *   normalized content; `false` otherwise.
 */
function stepsDiverge(a, b) {
  const bothPresent = Boolean(a) && Boolean(b);
  // Short-circuit order matters: content is only normalized once kind and
  // tool are known to match.
  return (
    !bothPresent ||
    a.kind !== b.kind ||
    (a.tool ?? "") !== (b.tool ?? "") ||
    normalizeContent(a.content) !== normalizeContent(b.content)
  );
}
58
/**
 * Builds the status text shown for a run.
 *
 * @param {{status: string, passed?: boolean | null}} run - Run record.
 * @returns {string} The run status, followed by " (passed)" or " (failed)"
 *   when `run.passed` is exactly `true` or `false`; any other value of
 *   `passed` adds nothing.
 */
function formatRunStatus(run) {
  let verdict = "";
  if (run.passed === true) {
    verdict = " (passed)";
  } else if (run.passed === false) {
    verdict = " (failed)";
  }
  return `${run.status}${verdict}`;
}
62
/**
 * Prints the summary header for one side of a comparison to stdout.
 *
 * @param {string} label - "A" selects the "Run A" heading in cyan; any other
 *   value selects the "Run B" heading in blue.
 * @param {object} run - Run record; reads `id`, `testCaseId`,
 *   `agentConfigId`, `stepsCount`, `costUsd`, `durationMs` and `error`.
 *   Assumes `costUsd` is a number (it is formatted with `toFixed`) —
 *   TODO confirm against the store schema.
 * @returns {void}
 */
function printRunHeader(label, run) {
  const isRunA = label === "A";
  const heading = isRunA ? `Run A (${run.id})` : `Run B (${run.id})`;
  const colour = isRunA ? ANSI.cyan : ANSI.blue;
  console.log(paint(heading, colour));

  // Each line is rendered lazily, right before it is printed, so lines are
  // formatted and emitted strictly one after another.
  const renderers = [
    () => ` test case: ${run.testCaseId}`,
    () => ` agent config: ${run.agentConfigId}`,
    () => ` status: ${formatRunStatus(run)}`,
    () => ` steps: ${run.stepsCount}`,
    () => ` cost: $${run.costUsd.toFixed(4)}`,
    () => ` duration: ${run.durationMs}ms`,
  ];
  for (const renderLine of renderers) {
    console.log(renderLine());
  }

  if (run.error) {
    console.log(` error: ${run.error}`);
  }
}
73
/**
 * Implements `compare <runIdA> <runIdB>`: prints both run headers, a
 * step-by-step comparison of their traces and a short divergence summary.
 *
 * Only step indices that exist in at least one of the two runs are compared.
 * An index missing from both runs (for example when indices do not start at
 * 0 or have gaps) is neither shown nor counted as a difference.
 *
 * @param {string} runIdA - Id of the first run.
 * @param {string} runIdB - Id of the second run.
 * @param {{full?: boolean, onlyDiff?: boolean}} [opts] - CLI flags. `full`
 *   prints step content without truncation; `onlyDiff` restricts the listing
 *   to divergent steps plus the neighbouring step on each side.
 * @returns {Promise<void>} Resolves once everything has been printed. When a
 *   run id is unknown, an error is written to stderr and the process exits
 *   with code 1.
 */
async function compareCommand(runIdA, runIdB, opts = {}) {
  const db = initDb();

  // Resolve the runs before touching traces so an unknown id is reported as
  // "Run not found" before any trace lookup is attempted for it.
  const [runA, runB] = await Promise.all([getRun(db, runIdA), getRun(db, runIdB)]);
  if (!runA) {
    console.error(`Run not found: ${runIdA}`);
    process.exit(1);
  }
  if (!runB) {
    console.error(`Run not found: ${runIdB}`);
    process.exit(1);
  }
  const [tracesA, tracesB] = await Promise.all([getTraces(db, runIdA), getTraces(db, runIdB)]);

  console.log("");
  printRunHeader("A", runA);
  console.log("");
  printRunHeader("B", runB);
  console.log("");
  if (runA.testCaseId !== runB.testCaseId) {
    console.log(
      paint(
        "\u26A0\uFE0F Comparing runs of different test cases - step alignment may not be meaningful.",
        ANSI.yellow
      )
    );
    console.log("");
  }

  const mapA = stepsByIndex(tracesA);
  const mapB = stepsByIndex(tracesB);

  // Union of the indices that actually occur, in ascending order.
  const allIndices = [...new Set([...mapA.keys(), ...mapB.keys()])].sort((a, b) => a - b);
  if (allIndices.length === 0) {
    console.log("No steps recorded for either run.");
    return;
  }

  const divergentIndices = new Set(
    allIndices.filter((i) => stepsDiverge(mapA.get(i), mapB.get(i)))
  );

  let visibleIndices = allIndices;
  if (opts.onlyDiff) {
    // Keep each divergent step plus the recorded step right before and after it.
    const shown = new Set();
    allIndices.forEach((idx, pos) => {
      if (!divergentIndices.has(idx)) return;
      shown.add(idx);
      if (pos > 0) shown.add(allIndices[pos - 1]);
      if (pos < allIndices.length - 1) shown.add(allIndices[pos + 1]);
    });
    visibleIndices = allIndices.filter((idx) => shown.has(idx));
  }

  const showFull = opts.full ?? false;
  if (visibleIndices.length === 0) {
    console.log("No divergent steps (nothing to show with --only-diff).");
  } else {
    console.log("Step comparison:");
    for (const i of visibleIndices) {
      const stepA = mapA.get(i);
      const stepB = mapB.get(i);
      if (divergentIndices.has(i)) {
        console.log(paint(`[step ${i}] DIVERGENT`, ANSI.yellow));
        console.log(` A: ${formatStepSummary(stepA, showFull)}`);
        console.log(` B: ${formatStepSummary(stepB, showFull)}`);
      } else {
        const line = formatStepSummary(stepA ?? stepB, showFull);
        console.log(paint(`[step ${i}] (same)`, ANSI.gray));
        console.log(` ${line}`);
      }
    }
  }

  // allIndices is sorted, so the first match is the earliest divergence.
  const firstDivergence = allIndices.find((i) => divergentIndices.has(i));
  console.log("");
  console.log(`${divergentIndices.size} of ${allIndices.length} step(s) differ.`);
  if (firstDivergence !== undefined) {
    console.log(`First divergence at step ${firstDivergence}.`);
  } else {
    console.log("No divergence detected.");
  }
  console.log("");
}
19
168
  var CONFIG_COL_WIDTH = 24;
20
169
  var CONFIG_LABEL_MAX = 20;
21
170
  function truncateLabel(name, max = CONFIG_LABEL_MAX) {
@@ -1119,6 +1268,17 @@ cli.command("trace <runId>", "Show the step trace and metrics for a single run")
1119
1268
  process.exit(1);
1120
1269
  }
1121
1270
  });
1271
// Register the `compare` sub-command. Failures thrown by compareCommand are
// reported on stderr and the process exits with code 1.
cli
  .command("compare <runIdA> <runIdB>", "Compare the step traces of two runs side by side")
  .option("--full", "Print full step content without truncation")
  .option(
    "--only-diff",
    "Show only divergent steps plus one step of context before and after each"
  )
  .example("agr compare <runIdA> <runIdB> --only-diff")
  .action(async (firstRunId, secondRunId, flags) => {
    try {
      await compareCommand(firstRunId, secondRunId, flags);
    } catch (failure) {
      console.error(`Error executing compare: ${failure.message}`);
      process.exit(1);
    }
  });
1122
1282
  cli.help();
1123
1283
  try {
1124
1284
  cli.parse();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgrader",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
5
  "license": "MIT",
6
6
  "type": "module",