@vercel/agent-eval-playground 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/README.md +50 -0
  2. package/app/compare/page.tsx +40 -0
  3. package/app/evals/[name]/page.tsx +22 -0
  4. package/app/evals/page.tsx +18 -0
  5. package/app/experiments/[name]/[timestamp]/page.tsx +23 -0
  6. package/app/experiments/page.tsx +28 -0
  7. package/app/globals.css +126 -0
  8. package/app/layout.tsx +102 -0
  9. package/app/page.tsx +179 -0
  10. package/app/transcript/[experiment]/[timestamp]/[evalName]/[run]/page.tsx +43 -0
  11. package/bin.mjs +86 -0
  12. package/components/ComparePage.tsx +312 -0
  13. package/components/EvalDetail.tsx +114 -0
  14. package/components/EvalsPage.tsx +80 -0
  15. package/components/ExperimentDetail.tsx +162 -0
  16. package/components/ExperimentList.tsx +103 -0
  17. package/components/O11ySummary.tsx +114 -0
  18. package/components/RunResultCard.tsx +72 -0
  19. package/components/ShowMore.tsx +60 -0
  20. package/components/TranscriptPage.tsx +46 -0
  21. package/components/TranscriptViewer.tsx +201 -0
  22. package/components/ui/alert-dialog.tsx +184 -0
  23. package/components/ui/badge.tsx +45 -0
  24. package/components/ui/button.tsx +60 -0
  25. package/components/ui/card.tsx +94 -0
  26. package/components/ui/collapsible.tsx +34 -0
  27. package/components/ui/combobox.tsx +297 -0
  28. package/components/ui/dropdown-menu.tsx +269 -0
  29. package/components/ui/field.tsx +227 -0
  30. package/components/ui/input-group.tsx +147 -0
  31. package/components/ui/input.tsx +19 -0
  32. package/components/ui/label.tsx +24 -0
  33. package/components/ui/progress.tsx +31 -0
  34. package/components/ui/scroll-area.tsx +58 -0
  35. package/components/ui/select.tsx +191 -0
  36. package/components/ui/separator.tsx +28 -0
  37. package/components/ui/table.tsx +116 -0
  38. package/components/ui/tabs.tsx +91 -0
  39. package/components/ui/textarea.tsx +18 -0
  40. package/components/ui/tooltip.tsx +57 -0
  41. package/components.json +25 -0
  42. package/lib/data.ts +297 -0
  43. package/lib/types.ts +113 -0
  44. package/lib/utils.ts +6 -0
  45. package/next.config.ts +5 -0
  46. package/package.json +51 -0
  47. package/postcss.config.mjs +7 -0
  48. package/public/vercel.svg +1 -0
  49. package/tsconfig.json +42 -0
package/bin.mjs ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { spawn } from "child_process";
4
+ import { resolve, dirname } from "path";
5
+ import { fileURLToPath } from "url";
6
+ import { createRequire } from "module";
7
+
8
// CLI entry point for the playground: parses flags, exposes the chosen
// directories to the Next.js app through environment variables, and runs
// `next dev` from this package's own directory.

const __dirname = dirname(fileURLToPath(import.meta.url));
const require = createRequire(import.meta.url);

/**
 * Print an error to stderr and terminate the process with exit code 1.
 * @param {string} message Human-readable description of the problem.
 */
function fail(message) {
  console.error(`Error: ${message}`);
  process.exit(1);
}

/**
 * Return the value that follows the flag at `index`.
 * Aborts with a usage error when the flag is the last argument, instead of
 * letting `undefined` leak into `resolve()` / `spawn()` further down.
 * @param {string[]} argv  Full argument list.
 * @param {number} index   Position of the flag whose value is wanted.
 * @returns {string} The argument immediately after the flag.
 */
function readValue(argv, index) {
  const value = argv[index + 1];
  if (value === undefined) {
    fail(`Option "${argv[index]}" requires a value. Run with --help for usage.`);
  }
  return value;
}

// Parse CLI arguments
const args = process.argv.slice(2);
let resultsDir = "./results";
let evalsDir = "./evals";
let port = "3000";
let watch = false;

for (let i = 0; i < args.length; i++) {
  switch (args[i]) {
    case "--results-dir":
      resultsDir = readValue(args, i);
      i += 1; // skip the consumed value
      break;
    case "--evals-dir":
      evalsDir = readValue(args, i);
      i += 1;
      break;
    case "--port":
    case "-p":
      port = readValue(args, i);
      i += 1;
      break;
    case "--watch":
      watch = true;
      break;
    case "--help":
    case "-h":
      console.log(`
Usage: agent-eval-playground [options]

Options:
  --results-dir <dir>  Path to results directory (default: ./results)
  --evals-dir <dir>    Path to evals directory (default: ./evals)
  --port, -p <port>    HTTP server port (default: 3000)
  --watch              Enable live mode — watch results for changes
  --help, -h           Show this help message
`);
      process.exit(0);
      break;
    default:
      // Unknown arguments are still ignored, but no longer silently.
      console.warn(`Warning: ignoring unknown argument "${args[i]}"`);
  }
}

// Reject anything that is not a plain TCP port number before handing it on.
if (!/^\d+$/.test(port) || Number(port) > 65535) {
  fail(`Invalid port "${port}". Expected an integer between 0 and 65535.`);
}

// Set environment variables for the Next.js app
process.env.RESULTS_DIR = resolve(resultsDir);
process.env.EVALS_DIR = resolve(evalsDir);
if (watch) {
  process.env.WATCH = "true";
}

// Find the next binary from this package's dependencies
let nextBin;
try {
  const nextPkgPath = require.resolve("next/package.json");
  nextBin = resolve(dirname(nextPkgPath), "dist", "bin", "next");
} catch {
  fail('"next" package not found. Make sure dependencies are installed.');
}

console.log(`Agent Eval Playground`);
console.log(`  Results: ${process.env.RESULTS_DIR}`);
console.log(`  Evals:   ${process.env.EVALS_DIR}`);
console.log(`  Port:    ${port}`);
if (watch) console.log(`  Watch:   enabled`);
console.log();

// Run next dev from the package directory
const child = spawn(process.execPath, [nextBin, "dev", "-p", port], {
  cwd: __dirname,
  stdio: "inherit",
  env: process.env,
});

// Without an "error" listener a failed spawn would surface as an unhandled
// error event.
child.on("error", (err) => {
  fail(`Failed to start the Next.js dev server: ${err.message}`);
});

child.on("exit", (code, signal) => {
  if (code !== null) {
    process.exit(code);
  }
  // code === null means the child was terminated by a signal. Treat the two
  // signals this script forwards as a normal shutdown; anything else is a
  // failure and must not be reported as success.
  process.exit(signal === "SIGINT" || signal === "SIGTERM" ? 0 : 1);
});

// Forward signals
process.on("SIGINT", () => child.kill("SIGINT"));
process.on("SIGTERM", () => child.kill("SIGTERM"));
@@ -0,0 +1,312 @@
1
+ "use client";
2
+
3
+ import { useState } from "react";
4
+ import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
5
+ import { Badge } from "@/components/ui/badge";
6
+ import {
7
+ Select,
8
+ SelectContent,
9
+ SelectItem,
10
+ SelectTrigger,
11
+ SelectValue,
12
+ } from "@/components/ui/select";
13
+ import {
14
+ Table,
15
+ TableBody,
16
+ TableCell,
17
+ TableHead,
18
+ TableHeader,
19
+ TableRow,
20
+ } from "@/components/ui/table";
21
+ import { Separator } from "@/components/ui/separator";
22
+
23
/** Aggregated results for one eval inside a single experiment run. */
interface EvalDetail {
  /** Eval name; used as the key when matching evals between the two compared runs. */
  name: string;
  /** Run count; an eval counts as "passed" in the summary when `passedRuns === totalRuns`. */
  totalRuns: number;
  /** Number of passing runs (see `totalRuns`). */
  passedRuns: number;
  /** Pass rate as a percentage: rendered with a "%" suffix and compared against 100. */
  passRate: number;
  /** Mean duration; rendered with an "s" suffix (presumably seconds — confirm against the data loader). */
  meanDuration: number;
}
30
+
31
/** Detail record for one experiment run, as stored in `ComparePageProps.detailsMap`. */
interface ExperimentDetailData {
  /** Experiment name (not read by the comparison view itself). */
  name: string;
  /** Run timestamp (not read by the comparison view itself; format not visible here). */
  timestamp: string;
  /** Per-eval aggregates for this run. */
  evals: EvalDetail[];
}
36
+
37
/** One entry in the run selectors. */
interface SelectOption {
  /** Selection value; also the key used to look the run up in `detailsMap`. */
  value: string;
  /** Text shown for this entry in the dropdown. */
  label: string;
}
41
+
42
/** Props for {@link ComparePage}. */
interface ComparePageProps {
  /** Selectable runs; the first two entries are pre-selected as left and right. */
  options: SelectOption[];
  /** Pre-loaded run details keyed by `SelectOption.value`; a `null` or missing entry is treated as "no data". */
  detailsMap: Record<string, ExperimentDetailData | null>;
}
46
+
47
/**
 * Side-by-side comparison of two experiment runs.
 *
 * The two runs are chosen with the "Left" and "Right" selectors. Once both
 * resolve to data in `detailsMap`, the page shows three summary stats and a
 * per-eval table whose delta column is `right - left` pass rate.
 *
 * @param options    Selectable runs. The first two are pre-selected (the
 *                   caller presumably orders them most-recent first — confirm).
 * @param detailsMap Pre-loaded run details keyed by option value.
 */
export function ComparePage({ options, detailsMap }: ComparePageProps) {
  // Pre-select the first two options if available
  const defaultLeft = options.length >= 1 ? options[0].value : "";
  const defaultRight = options.length >= 2 ? options[1].value : "";

  const [leftValue, setLeftValue] = useState(defaultLeft);
  const [rightValue, setRightValue] = useState(defaultRight);

  // Look up detail data directly from the pre-loaded map
  const leftData = leftValue ? (detailsMap[leftValue] ?? null) : null;
  const rightData = rightValue ? (detailsMap[rightValue] ?? null) : null;

  // Union of eval names from both sides, sorted for a stable row order
  const allEvalNames = new Set<string>();
  leftData?.evals.forEach((e) => allEvalNames.add(e.name));
  rightData?.evals.forEach((e) => allEvalNames.add(e.name));
  const evalNames = Array.from(allEvalNames).sort();

  const leftMap = new Map(
    leftData?.evals.map((e): [string, EvalDetail] => [e.name, e]) ?? []
  );
  const rightMap = new Map(
    rightData?.evals.map((e): [string, EvalDetail] => [e.name, e]) ?? []
  );

  return (
    <div className="space-y-6">
      <div>
        <h1 className="text-2xl font-bold tracking-tight">Compare</h1>
        <p className="text-muted-foreground mt-1">
          Compare two experiment runs side-by-side.
        </p>
      </div>

      {/* Selection */}
      <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
        <RunSelectCard
          title="Left"
          value={leftValue}
          options={options}
          onValueChange={setLeftValue}
        />
        <RunSelectCard
          title="Right"
          value={rightValue}
          options={options}
          onValueChange={setRightValue}
        />
      </div>

      {/* Comparison table */}
      {leftData && rightData && evalNames.length > 0 && (
        <>
          <Separator />

          {/* Summary comparison */}
          <div className="grid grid-cols-1 md:grid-cols-3 gap-4">
            <ComparisonStat
              label="Overall Pass Rate"
              left={avgPassRate(leftData.evals)}
              right={avgPassRate(rightData.evals)}
              format={(v) => `${v.toFixed(0)}%`}
              higherIsBetter
            />
            <ComparisonStat
              label="Avg Duration"
              left={avgDuration(leftData.evals)}
              right={avgDuration(rightData.evals)}
              format={(v) => `${v.toFixed(1)}s`}
              higherIsBetter={false}
            />
            <ComparisonStat
              label="Evals Passed"
              left={leftData.evals.filter((e) => e.passedRuns === e.totalRuns).length}
              right={rightData.evals.filter((e) => e.passedRuns === e.totalRuns).length}
              format={(v) => `${v}`}
              higherIsBetter
            />
          </div>

          {/* Per-eval comparison table */}
          <Card>
            <CardHeader>
              <CardTitle className="text-base">Per-Eval Comparison</CardTitle>
            </CardHeader>
            <CardContent>
              <Table>
                <TableHeader>
                  <TableRow>
                    <TableHead>Eval</TableHead>
                    <TableHead className="text-center">Left Pass Rate</TableHead>
                    <TableHead className="text-center">Right Pass Rate</TableHead>
                    <TableHead className="text-center">Delta</TableHead>
                    <TableHead className="text-center">Left Duration</TableHead>
                    <TableHead className="text-center">Right Duration</TableHead>
                  </TableRow>
                </TableHeader>
                <TableBody>
                  {evalNames.map((name) => {
                    const left = leftMap.get(name);
                    const right = rightMap.get(name);
                    const delta = (right?.passRate ?? 0) - (left?.passRate ?? 0);
                    // Round once and derive sign, colour and label from the
                    // rounded value, so a sub-percent change can no longer
                    // render as "-0%" or as a green "+0%".
                    const shownDelta = Number(delta.toFixed(0));
                    const deltaClass =
                      shownDelta > 0
                        ? "text-green-500"
                        : shownDelta < 0
                          ? "text-red-500"
                          : "text-muted-foreground";
                    // `=== 0` also matches -0, which would otherwise print as "-0".
                    const deltaLabel = `${shownDelta > 0 ? "+" : ""}${
                      shownDelta === 0 ? "0" : shownDelta
                    }%`;

                    return (
                      <TableRow key={name}>
                        <TableCell className="font-medium">{name}</TableCell>
                        <PassRateCell detail={left} />
                        <PassRateCell detail={right} />
                        <TableCell className="text-center">
                          {left && right ? (
                            <span className={deltaClass}>{deltaLabel}</span>
                          ) : (
                            <span className="text-muted-foreground">--</span>
                          )}
                        </TableCell>
                        <TableCell className="text-center text-sm text-muted-foreground">
                          {left ? `${left.meanDuration.toFixed(1)}s` : "--"}
                        </TableCell>
                        <TableCell className="text-center text-sm text-muted-foreground">
                          {right ? `${right.meanDuration.toFixed(1)}s` : "--"}
                        </TableCell>
                      </TableRow>
                    );
                  })}
                </TableBody>
              </Table>
            </CardContent>
          </Card>
        </>
      )}

      {/* Both runs resolved but neither has evals: say so instead of rendering nothing */}
      {leftData && rightData && evalNames.length === 0 && (
        <Card>
          <CardContent className="py-12 text-center">
            <p className="text-muted-foreground">
              The selected runs contain no eval results to compare.
            </p>
          </CardContent>
        </Card>
      )}

      {/* Prompt to select both */}
      {(!leftData || !rightData) && (
        <Card>
          <CardContent className="py-12 text-center">
            <p className="text-muted-foreground">
              Select two experiment runs above to compare them.
            </p>
          </CardContent>
        </Card>
      )}
    </div>
  );
}

/** Titled card holding one run selector; used for both the left and right side. */
function RunSelectCard({
  title,
  value,
  options,
  onValueChange,
}: {
  title: string;
  value: string;
  options: SelectOption[];
  onValueChange: (value: string) => void;
}) {
  return (
    <Card>
      <CardHeader className="pb-3">
        <CardTitle className="text-sm text-muted-foreground">{title}</CardTitle>
      </CardHeader>
      <CardContent>
        {/* An empty string means "nothing selected"; pass undefined so the placeholder shows */}
        <Select value={value || undefined} onValueChange={onValueChange}>
          <SelectTrigger>
            <SelectValue placeholder="Select experiment run..." />
          </SelectTrigger>
          <SelectContent>
            {options.map((opt) => (
              <SelectItem key={opt.value} value={opt.value}>
                {opt.label}
              </SelectItem>
            ))}
          </SelectContent>
        </Select>
      </CardContent>
    </Card>
  );
}

/** Table cell with a pass-rate badge, or "--" when the eval is absent from that run. */
function PassRateCell({ detail }: { detail: EvalDetail | undefined }) {
  return (
    <TableCell className="text-center">
      {detail ? (
        <Badge variant={detail.passRate === 100 ? "default" : "destructive"}>
          {detail.passRate.toFixed(0)}%
        </Badge>
      ) : (
        <span className="text-muted-foreground">--</span>
      )}
    </TableCell>
  );
}
262
+
263
/**
 * Summary card showing a metric as `left → right` plus a signed delta badge.
 *
 * @param label          Caption shown above the values.
 * @param left           Metric value for the left run.
 * @param right          Metric value for the right run.
 * @param format         Renders a metric value (also used for the delta magnitude).
 * @param higherIsBetter Whether an increase from left to right is an improvement.
 */
function ComparisonStat({
  label,
  left,
  right,
  format,
  higherIsBetter,
}: {
  label: string;
  left: number;
  right: number;
  format: (v: number) => string;
  higherIsBetter: boolean;
}) {
  const delta = right - left;
  const improved = higherIsBetter ? delta > 0 : delta < 0;
  const regressed = higherIsBetter ? delta < 0 : delta > 0;
  // format() receives the magnitude, so the sign must be written out for both
  // directions; previously decreases were shown without a minus sign.
  const sign = delta > 0 ? "+" : delta < 0 ? "-" : "";

  return (
    <Card>
      <CardContent className="py-4 px-4">
        <div className="text-xs text-muted-foreground">{label}</div>
        <div className="flex items-baseline gap-4 mt-1">
          <span className="text-lg font-medium">{format(left)}</span>
          <span className="text-muted-foreground">→</span>
          <span className="text-lg font-medium">{format(right)}</span>
          {delta !== 0 && (
            <Badge
              // "secondary" is only reached when delta is NaN (neither improved nor regressed)
              variant={improved ? "default" : regressed ? "destructive" : "secondary"}
              className="text-xs"
            >
              {`${sign}${format(Math.abs(delta))}`}
            </Badge>
          )}
        </div>
      </CardContent>
    </Card>
  );
}
302
+
303
/**
 * Unweighted mean of the per-eval pass rates.
 * @returns The mean `passRate`, or 0 for an empty list.
 */
function avgPassRate(evals: EvalDetail[]): number {
  const count = evals.length;
  if (count === 0) {
    return 0;
  }

  let total = 0;
  for (const { passRate } of evals) {
    total += passRate;
  }
  return total / count;
}
307
+
308
/**
 * Unweighted mean of the per-eval mean durations.
 * @returns The mean `meanDuration`, or 0 for an empty list.
 */
function avgDuration(evals: EvalDetail[]): number {
  const count = evals.length;
  if (count === 0) {
    return 0;
  }

  let total = 0;
  for (const { meanDuration } of evals) {
    total += meanDuration;
  }
  return total / count;
}
312
+
@@ -0,0 +1,114 @@
1
+ import Link from "next/link";
2
+ import { Badge } from "@/components/ui/badge";
3
+ import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
4
+ import { ScrollArea } from "@/components/ui/scroll-area";
5
+
6
/** Data shown on the detail page for a single eval. */
interface EvalDetailData {
  /** Eval name; shown in the breadcrumb and the page heading. */
  name: string;
  /** Prompt text shown in the "PROMPT.md" tab; a fallback message is shown when empty. */
  prompt: string;
  /** File names listed in the "Files" tab. */
  files: string[];
  /** Optional file contents keyed by file name; "EVAL.ts", "EVAL.tsx" and "package.json" are read. */
  fileContents?: Record<string, string>;
}
12
+
13
/** Props for {@link EvalDetail}. */
interface EvalDetailProps {
  /** The eval to display. */
  data: EvalDetailData;
}
16
+
17
/**
 * Detail view for one eval: breadcrumb, heading, and a tab set with the
 * prompt, the eval source (EVAL.ts, falling back to EVAL.tsx), package.json,
 * and the list of files. The eval and package.json tabs only appear when the
 * corresponding content is present and non-empty.
 */
export function EvalDetail({ data }: EvalDetailProps) {
  const contents = data.fileContents;
  const tsSource = contents?.["EVAL.ts"];
  const evalFile = tsSource || contents?.["EVAL.tsx"];
  const evalFileName = tsSource ? "EVAL.ts" : "EVAL.tsx";
  const packageJson = contents?.["package.json"];

  // Shared markup for the three text panes; only the tab id, the text and the
  // <pre> classes differ between them.
  const textPane = (tab: string, text: string, preClassName: string) => (
    <TabsContent value={tab} className="mt-4">
      <ScrollArea className="h-[calc(100vh-300px)]">
        <pre className={preClassName}>{text}</pre>
      </ScrollArea>
    </TabsContent>
  );

  const fileRows = data.files.map((file) => (
    <div
      key={file}
      className="flex items-center gap-2 text-sm font-mono bg-muted rounded-lg px-3 py-1.5"
    >
      <FileIcon filename={file} />
      <span>{file}</span>
    </div>
  ));

  return (
    <div className="space-y-6">
      {/* Breadcrumb */}
      <div className="flex items-center gap-2 text-sm text-muted-foreground">
        <Link href="/evals" className="cursor-pointer hover:underline underline-offset-4">
          Evals
        </Link>
        <span>/</span>
        <span>{data.name}</span>
      </div>

      <div>
        <h1 className="text-2xl font-bold tracking-tight">{data.name}</h1>
      </div>

      <Tabs defaultValue="prompt" className="w-full">
        <TabsList>
          <TabsTrigger value="prompt">PROMPT.md</TabsTrigger>
          {evalFile && <TabsTrigger value="eval">{evalFileName}</TabsTrigger>}
          {packageJson && <TabsTrigger value="package">package.json</TabsTrigger>}
          <TabsTrigger value="files">Files</TabsTrigger>
        </TabsList>

        {textPane(
          "prompt",
          data.prompt || "No PROMPT.md found.",
          "text-sm font-mono whitespace-pre-wrap rounded-lg bg-muted p-4"
        )}

        {evalFile &&
          textPane(
            "eval",
            evalFile,
            "text-sm font-mono whitespace-pre-wrap rounded-lg bg-muted p-4 overflow-x-auto"
          )}

        {packageJson &&
          textPane(
            "package",
            packageJson,
            "text-sm font-mono whitespace-pre-wrap rounded-lg bg-muted p-4 overflow-x-auto"
          )}

        <TabsContent value="files" className="mt-4">
          <div className="text-sm text-muted-foreground mb-3">
            {data.files.length} files
          </div>
          <div className="space-y-1">{fileRows}</div>
        </TabsContent>
      </Tabs>
    </div>
  );
}
93
+
94
/**
 * Badge labels keyed by lowercase file extension. A Map is used instead of an
 * object literal so that inherited keys such as "constructor" or "__proto__"
 * can never be returned as a label.
 */
const FILE_ICON_LABELS: ReadonlyMap<string, string> = new Map([
  ["ts", "TS"],
  ["tsx", "TX"],
  ["js", "JS"],
  ["json", "{}"],
  ["md", "MD"],
  ["css", "CS"],
  ["html", "HT"],
]);

/**
 * Small outline badge with a short label derived from the file extension.
 * Unknown extensions, names without an extension, and dotfiles whose only dot
 * is the leading one all get the generic "F" label.
 *
 * @param filename File name whose extension selects the label.
 */
function FileIcon({ filename }: { filename: string }) {
  const dot = filename.lastIndexOf(".");
  // dot <= 0 means no dot at all, or only a leading one (".gitignore"): no extension.
  const ext = dot > 0 ? filename.slice(dot + 1).toLowerCase() : "";
  const label = FILE_ICON_LABELS.get(ext) ?? "F";

  return (
    <Badge variant="outline" className="text-[10px] font-mono px-1 py-0 h-5 w-6 flex items-center justify-center">
      {label}
    </Badge>
  );
}
@@ -0,0 +1,80 @@
1
+ import Link from "next/link";
2
+ import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
3
+ import { Badge } from "@/components/ui/badge";
4
+ import { ShowMore } from "@/components/ShowMore";
5
+
6
/** Summary of one eval as shown on a card in the evals list. */
interface EvalInfo {
  /** Eval name; used as the card title, the React key and the (URL-encoded) link target. */
  name: string;
  /** Prompt text; the first 200 characters are shown as a preview when non-empty. */
  prompt: string;
  /** File names; the first five are shown as badges, the rest as a "+N more" count. */
  files: string[];
}
11
+
12
/** Props for {@link EvalsPage}. */
interface EvalsPageProps {
  /** Evals to render as cards. */
  evals: EvalInfo[];
  /** Passed through to `ShowMore` as `total` (presumably the full eval count — confirm against ShowMore). */
  total: number;
  /** When true, no "show all" link ("/evals?all") is handed to `ShowMore`. */
  showAll: boolean;
}
17
+
18
/**
 * Evals index page: a heading plus either an empty-state card or a grid of
 * linked cards, one per eval, each with a prompt preview and file badges.
 */
export function EvalsPage({ evals, total, showAll }: EvalsPageProps) {
  const showAllHref = showAll ? undefined : "/evals?all";

  // One linked card per eval.
  const renderCard = ({ name, prompt, files }: EvalInfo) => {
    const preview = prompt.slice(0, 200);
    const ellipsis = prompt.length > 200 ? "..." : "";
    const visibleFiles = files.slice(0, 5);

    return (
      <Link key={name} href={`/evals/${encodeURIComponent(name)}`} className="block cursor-pointer">
        <Card className="transition-colors hover:bg-muted">
          <CardHeader>
            <CardTitle className="text-base">{name}</CardTitle>
          </CardHeader>
          <CardContent>
            {prompt && (
              <p className="text-sm text-muted-foreground line-clamp-3 mb-3">
                {preview}
                {ellipsis}
              </p>
            )}
            <div className="flex flex-wrap gap-1">
              {visibleFiles.map((file) => (
                <Badge key={file} variant="secondary" className="text-xs font-mono">
                  {file}
                </Badge>
              ))}
              {files.length > 5 && (
                <Badge variant="outline" className="text-xs">
                  +{files.length - 5} more
                </Badge>
              )}
            </div>
          </CardContent>
        </Card>
      </Link>
    );
  };

  return (
    <div className="space-y-6">
      <div>
        <h1 className="text-2xl font-bold tracking-tight">Evals</h1>
        <p className="text-muted-foreground mt-1">
          Browse all eval fixtures in your project.
        </p>
      </div>

      {evals.length > 0 ? (
        <ShowMore
          total={total}
          showAllHref={showAllHref}
          className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4"
        >
          {evals.map(renderCard)}
        </ShowMore>
      ) : (
        <Card>
          <CardContent className="py-12 text-center">
            <p className="text-muted-foreground text-lg">No evals found</p>
            <p className="text-muted-foreground text-sm mt-2">
              Create evals in your{" "}
              <code className="text-foreground bg-muted px-1.5 py-0.5 rounded text-xs">
                evals/
              </code>{" "}
              directory.
            </p>
          </CardContent>
        </Card>
      )}
    </div>
  );
}