@vercel/agent-eval-playground 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49):
  1. package/README.md +50 -0
  2. package/app/compare/page.tsx +40 -0
  3. package/app/evals/[name]/page.tsx +22 -0
  4. package/app/evals/page.tsx +18 -0
  5. package/app/experiments/[name]/[timestamp]/page.tsx +23 -0
  6. package/app/experiments/page.tsx +28 -0
  7. package/app/globals.css +126 -0
  8. package/app/layout.tsx +102 -0
  9. package/app/page.tsx +179 -0
  10. package/app/transcript/[experiment]/[timestamp]/[evalName]/[run]/page.tsx +43 -0
  11. package/bin.mjs +86 -0
  12. package/components/ComparePage.tsx +312 -0
  13. package/components/EvalDetail.tsx +114 -0
  14. package/components/EvalsPage.tsx +80 -0
  15. package/components/ExperimentDetail.tsx +162 -0
  16. package/components/ExperimentList.tsx +103 -0
  17. package/components/O11ySummary.tsx +114 -0
  18. package/components/RunResultCard.tsx +72 -0
  19. package/components/ShowMore.tsx +60 -0
  20. package/components/TranscriptPage.tsx +46 -0
  21. package/components/TranscriptViewer.tsx +201 -0
  22. package/components/ui/alert-dialog.tsx +184 -0
  23. package/components/ui/badge.tsx +45 -0
  24. package/components/ui/button.tsx +60 -0
  25. package/components/ui/card.tsx +94 -0
  26. package/components/ui/collapsible.tsx +34 -0
  27. package/components/ui/combobox.tsx +297 -0
  28. package/components/ui/dropdown-menu.tsx +269 -0
  29. package/components/ui/field.tsx +227 -0
  30. package/components/ui/input-group.tsx +147 -0
  31. package/components/ui/input.tsx +19 -0
  32. package/components/ui/label.tsx +24 -0
  33. package/components/ui/progress.tsx +31 -0
  34. package/components/ui/scroll-area.tsx +58 -0
  35. package/components/ui/select.tsx +191 -0
  36. package/components/ui/separator.tsx +28 -0
  37. package/components/ui/table.tsx +116 -0
  38. package/components/ui/tabs.tsx +91 -0
  39. package/components/ui/textarea.tsx +18 -0
  40. package/components/ui/tooltip.tsx +57 -0
  41. package/components.json +25 -0
  42. package/lib/data.ts +297 -0
  43. package/lib/types.ts +113 -0
  44. package/lib/utils.ts +6 -0
  45. package/next.config.ts +5 -0
  46. package/package.json +51 -0
  47. package/postcss.config.mjs +7 -0
  48. package/public/vercel.svg +1 -0
  49. package/tsconfig.json +42 -0
@@ -0,0 +1,162 @@
1
+ import Link from "next/link";
2
+ import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
3
+ import { Badge } from "@/components/ui/badge";
4
+ import { Progress } from "@/components/ui/progress";
5
+ import { Separator } from "@/components/ui/separator";
6
+ import { RunResultCard } from "@/components/RunResultCard";
7
+
8
/** Aggregated results for one eval within an experiment run. */
interface EvalDetail {
  /** Eval name; used as the card title and as the React key for the eval card. */
  name: string;
  /** Number of runs counted for this eval. */
  totalRuns: number;
  /** Number of those runs that passed. */
  passedRuns: number;
  /** Pass rate fed to the progress bar and averaged into the "%" summary figure. */
  passRate: number;
  /** Mean run duration; rendered with one decimal and an "s" suffix. */
  meanDuration: number;
  /** Individual runs; `result` is `null` when a run has no result to show. */
  runs: Array<{ name: string; result: RunResult | null }>;
}

/** Outcome of a single run, in the shape handed on to `RunResultCard`. */
interface RunResult {
  /** Final status of the run. */
  status: "passed" | "failed";
  /** Error text attached to the run, when present. */
  error?: string;
  /** Run duration. NOTE(review): presumably seconds, like `meanDuration` — confirm. */
  duration: number;
  /** Optional observability counters recorded for the run. */
  o11y?: {
    totalToolCalls: number;
    thinkingBlocks: number;
    errors: string[];
  };
}

/** Everything the experiment detail view needs for one experiment run. */
interface ExperimentDetailData {
  /** Experiment name; shown in the breadcrumb and as the page heading. */
  name: string;
  /** Raw run timestamp; passed through `formatTimestamp` before display. */
  timestamp: string;
  /** Per-eval results, rendered one card each. */
  evals: EvalDetail[];
}

/** Props accepted by `ExperimentDetail`. */
interface ExperimentDetailProps {
  data: ExperimentDetailData;
}
37
+
38
/**
 * Detail view for a single experiment run.
 *
 * Renders a breadcrumb and heading, four summary cards (overall pass rate,
 * passed/total evals, average duration, timestamp) and then one card per eval
 * that lists its runs through `RunResultCard`.
 *
 * @param data - Experiment name, raw timestamp and per-eval results to render.
 */
export function ExperimentDetail({ data }: ExperimentDetailProps) {
  const totalEvals = data.evals.length;
  // Divide by 1 for an experiment without evals so the averages show 0, not NaN.
  const evalDivisor = totalEvals || 1;

  // An eval is "fully passed" only when it has at least one run and every run
  // passed. Without the totalRuns check, a 0/0 eval satisfies
  // `passedRuns === totalRuns` and would be shown and counted as passing.
  const isFullyPassed = (e: EvalDetail): boolean =>
    e.totalRuns > 0 && e.passedRuns === e.totalRuns;

  const passedEvals = data.evals.filter(isFullyPassed).length;
  const overallPassRate =
    data.evals.reduce((sum, e) => sum + e.passRate, 0) / evalDivisor;
  const avgDuration =
    data.evals.reduce((sum, e) => sum + e.meanDuration, 0) / evalDivisor;
  // Shown in both the breadcrumb and the timestamp card.
  const formattedTimestamp = formatTimestamp(data.timestamp);

  return (
    <div className="space-y-6">
      {/* Header */}
      <div>
        <div className="flex items-center gap-2 text-sm text-muted-foreground mb-1">
          <Link href="/experiments" className="cursor-pointer hover:underline underline-offset-4">
            Experiments
          </Link>
          <span>/</span>
          <span>{data.name}</span>
          <span>/</span>
          <span>{formattedTimestamp}</span>
        </div>
        <h1 className="text-2xl font-bold tracking-tight">{data.name}</h1>
      </div>

      {/* Summary cards */}
      <div className="grid grid-cols-1 md:grid-cols-4 gap-4">
        <Card>
          <CardContent className="py-4 px-4">
            <div className="text-xs text-muted-foreground">
              Overall Pass Rate
            </div>
            <div className="text-2xl font-bold mt-1">
              {overallPassRate.toFixed(0)}%
            </div>
            <Progress value={overallPassRate} className="mt-2 h-1.5" />
          </CardContent>
        </Card>
        <Card>
          <CardContent className="py-4 px-4">
            <div className="text-xs text-muted-foreground">Evals</div>
            <div className="text-2xl font-bold mt-1">
              {passedEvals}/{totalEvals}
            </div>
            <div className="text-xs text-muted-foreground mt-1">passed</div>
          </CardContent>
        </Card>
        <Card>
          <CardContent className="py-4 px-4">
            <div className="text-xs text-muted-foreground">Avg Duration</div>
            <div className="text-2xl font-bold mt-1">
              {avgDuration.toFixed(1)}s
            </div>
          </CardContent>
        </Card>
        <Card>
          <CardContent className="py-4 px-4">
            <div className="text-xs text-muted-foreground">Timestamp</div>
            <div className="text-sm font-medium mt-1">
              {formattedTimestamp}
            </div>
          </CardContent>
        </Card>
      </div>

      <Separator />

      {/* Per-eval breakdown */}
      <div className="space-y-6">
        {data.evals.map((evalDetail) => (
          <Card key={evalDetail.name}>
            <CardHeader>
              <div className="flex items-center justify-between">
                <div className="flex items-center gap-3">
                  <CardTitle className="text-lg">{evalDetail.name}</CardTitle>
                  <Badge
                    variant={isFullyPassed(evalDetail) ? "default" : "destructive"}
                  >
                    {evalDetail.passedRuns}/{evalDetail.totalRuns} passed
                  </Badge>
                </div>
                <div className="text-sm text-muted-foreground">
                  avg {evalDetail.meanDuration.toFixed(1)}s
                </div>
              </div>
              <Progress value={evalDetail.passRate} className="h-1.5 mt-2" />
            </CardHeader>
            <CardContent>
              <div className="space-y-2">
                {evalDetail.runs.map((run) => (
                  <RunResultCard
                    key={run.name}
                    runName={run.name}
                    result={run.result}
                    experiment={data.name}
                    timestamp={data.timestamp}
                    evalName={evalDetail.name}
                  />
                ))}
              </div>
            </CardContent>
          </Card>
        ))}
      </div>
    </div>
  );
}
152
+
153
/**
 * Formats a run timestamp for display.
 *
 * A dash-separated time part (`T10-30-45`) is rewritten to colon-separated
 * form (`T10:30:45`) so that `Date` can parse it; the parsed date is then
 * rendered with `toLocaleString()`.
 *
 * @param ts - Raw timestamp string.
 * @returns The localized date string, or `ts` unchanged when it does not parse
 *   to a valid date or when formatting throws.
 */
function formatTimestamp(ts: string): string {
  try {
    const parsed = new Date(
      ts.replace(/T(\d{2})-(\d{2})-(\d{2})/, "T$1:$2:$3")
    );
    // An unparseable string yields an Invalid Date whose time value is NaN.
    return Number.isNaN(parsed.getTime()) ? ts : parsed.toLocaleString();
  } catch {
    return ts;
  }
}
@@ -0,0 +1,103 @@
1
+ import Link from "next/link";
2
+ import { Badge } from "@/components/ui/badge";
3
+ import { Card, CardContent } from "@/components/ui/card";
4
+ import { ShowMore } from "@/components/ShowMore";
5
+
6
/** One row of the experiment list. */
interface ExperimentInfo {
  /** Experiment name; shown in the "Name" column and used as the row key. */
  name: string;
  /** Timestamps recorded for this experiment; only the count is shown ("Runs"). */
  timestamps: string[];
  /** Timestamp the row links to and shows under "Latest Run"; `null` renders "--". */
  latestTimestamp: string | null;
  /** Pass rate of the latest run, rendered as a percentage; omitted renders "--". */
  latestPassRate?: number;
  /** Total run count of the latest run (denominator of the passed/total figure). */
  latestTotalRuns?: number;
  /** Passed run count of the latest run (numerator of the passed/total figure). */
  latestPassedRuns?: number;
}

/** Props accepted by `ExperimentList`. */
interface ExperimentListProps {
  /** Rows to render. */
  experiments: ExperimentInfo[];
  /** Total forwarded to `ShowMore` as its `total` prop. */
  total: number;
  /** When `true`, no "show all" link target is passed to `ShowMore`. */
  showAll: boolean;
}
20
+
21
/**
 * Table-like list of experiments, one linked row per experiment.
 *
 * Shows an empty-state card when `experiments` is empty. Otherwise renders a
 * header row followed by the rows wrapped in `ShowMore`, which receives
 * `total` and, unless `showAll` is set, a link target for loading every row.
 */
export function ExperimentList({ experiments, total, showAll }: ExperimentListProps) {
  // Empty state: nothing to list yet.
  if (experiments.length === 0) {
    return (
      <Card>
        <CardContent className="py-12 text-center">
          <p className="text-muted-foreground text-lg">No experiments found</p>
          <p className="text-muted-foreground text-sm mt-2">
            Run an experiment with <code className="text-foreground bg-muted px-1.5 py-0.5 rounded text-xs">agent-eval &lt;config&gt;</code> to see results here.
          </p>
        </CardContent>
      </Card>
    );
  }

  const rows = experiments.map((exp) => {
    const { name, latestTimestamp, latestPassRate } = exp;

    // Rows without a recorded run have nowhere to navigate to.
    const rowHref = latestTimestamp
      ? `/experiments/${encodeURIComponent(name)}/${encodeURIComponent(latestTimestamp)}`
      : "#";

    let passRateCell;
    if (latestPassRate !== undefined) {
      // 100% -> default, 50% and above -> secondary, anything lower -> destructive.
      let badgeVariant: "default" | "secondary" | "destructive";
      if (latestPassRate === 100) {
        badgeVariant = "default";
      } else if (latestPassRate >= 50) {
        badgeVariant = "secondary";
      } else {
        badgeVariant = "destructive";
      }

      passRateCell = (
        <span className="flex items-center gap-2">
          <Badge variant={badgeVariant}>
            {latestPassRate.toFixed(0)}%
          </Badge>
          <span className="text-xs text-muted-foreground">
            {exp.latestPassedRuns}/{exp.latestTotalRuns}
          </span>
        </span>
      );
    } else {
      passRateCell = <span className="text-muted-foreground">--</span>;
    }

    return (
      <Link
        key={name}
        href={rowHref}
        className="grid grid-cols-[1fr_auto_auto_auto] gap-4 items-center px-3 py-2.5 cursor-pointer transition-colors hover:bg-muted rounded-md"
      >
        <span className="font-medium truncate">{name}</span>
        <span className="w-12 text-right text-muted-foreground">{exp.timestamps.length}</span>
        <span className="w-24">{passRateCell}</span>
        <span className="w-44 text-xs text-muted-foreground">
          {latestTimestamp ? formatTimestamp(latestTimestamp) : "--"}
        </span>
      </Link>
    );
  });

  return (
    <Card>
      <CardContent className="pt-0">
        <div>
          {/* Header */}
          <div className="grid grid-cols-[1fr_auto_auto_auto] gap-4 px-3 py-2 text-xs text-muted-foreground font-medium border-b border-border">
            <span>Name</span>
            <span className="w-12 text-right">Runs</span>
            <span className="w-24">Pass Rate</span>
            <span className="w-44">Latest Run</span>
          </div>
          {/* Rows */}
          <ShowMore total={total} showAllHref={showAll ? undefined : "/experiments?all"}>
            {rows}
          </ShowMore>
        </div>
      </CardContent>
    </Card>
  );
}
93
+
94
/**
 * Turns a raw run timestamp into a localized, human-readable string.
 *
 * The time part may use dashes instead of colons (`T10-30-45`); it is
 * normalized to `T10:30:45` before being handed to `Date`.
 *
 * @param ts - Raw timestamp string.
 * @returns `toLocaleString()` of the parsed date, or `ts` itself when the
 *   value cannot be parsed or an error is thrown along the way.
 */
function formatTimestamp(ts: string): string {
  try {
    const normalized = ts.replace(/T(\d{2})-(\d{2})-(\d{2})/, "T$1:$2:$3");
    const millis = new Date(normalized).getTime();
    if (Number.isNaN(millis)) {
      // Invalid Date: fall back to showing the raw value.
      return ts;
    }
    return new Date(millis).toLocaleString();
  } catch {
    return ts;
  }
}
@@ -0,0 +1,114 @@
1
+ import { Badge } from "@/components/ui/badge";
2
+ import type { TranscriptSummary, ToolName } from "@/lib/types";
3
+
4
/** Props accepted by `O11ySummary`. */
interface O11ySummaryProps {
  /** Transcript summary whose counters, files, shell commands and errors are rendered. */
  summary: TranscriptSummary;
}
7
+
8
/**
 * Display label for each `ToolName`, used for the tool-call badges.
 * The renderer falls back to the raw tool key when a key has no entry here.
 */
const TOOL_LABELS: Record<ToolName, string> = {
  // File operations
  file_read: "File Read",
  file_write: "File Write",
  file_edit: "File Edit",
  // Command execution
  shell: "Shell",
  // Web access
  web_fetch: "Web Fetch",
  web_search: "Web Search",
  // File-system search and listing
  glob: "Glob",
  grep: "Grep",
  list_dir: "List Dir",
  // Everything else
  agent_task: "Agent Task",
  unknown: "Unknown",
};
21
+
22
/** Maximum number of file or shell-command badges shown inline before the rest collapse into "+N more". */
const MAX_INLINE_ITEMS = 10;

/**
 * One labelled row of file-path badges, capped at `MAX_INLINE_ITEMS` with a
 * "+N more" note for the remainder. Renders nothing for an empty list.
 */
function FileBadgeRow({ label, files }: { label: string; files: readonly string[] }) {
  if (files.length === 0) return null;

  const hiddenCount = files.length - MAX_INLINE_ITEMS;
  return (
    <div className="flex flex-wrap gap-1 items-center">
      <span className="text-muted-foreground shrink-0">{label} ({files.length})</span>
      {files.slice(0, MAX_INLINE_ITEMS).map((f) => (
        <Badge key={f} variant="outline" className="text-xs font-mono font-normal">
          {f}
        </Badge>
      ))}
      {hiddenCount > 0 && (
        <span className="text-muted-foreground">+{hiddenCount} more</span>
      )}
    </div>
  );
}

/**
 * Compact observability summary for one transcript.
 *
 * Renders, in order: inline counters (turns, tool calls, thinking blocks,
 * errors), per-tool call counts sorted by descending count, the files read
 * and modified, the shell commands that were run, and the error messages.
 * Sections with no data are omitted.
 *
 * File lists and shell commands each show at most `MAX_INLINE_ITEMS` entries
 * followed by a "+N more" note, so a truncated list is never mistaken for the
 * full one.
 *
 * @param summary - Transcript summary to render.
 */
export function O11ySummary({ summary }: O11ySummaryProps) {
  // Only tools that were actually called, most-used first.
  const toolEntries = Object.entries(summary.toolCalls)
    .filter(([, count]) => count > 0)
    .sort(([, a], [, b]) => b - a);

  const hasFiles =
    summary.filesRead.length > 0 || summary.filesModified.length > 0;
  const hiddenShellCount = summary.shellCommands.length - MAX_INLINE_ITEMS;

  return (
    <div className="space-y-3 rounded-lg bg-muted/50 p-4">
      {/* Inline stats */}
      <div className="flex flex-wrap gap-x-5 gap-y-1 text-xs">
        <span><span className="text-muted-foreground">Turns</span> <span className="font-medium">{summary.totalTurns}</span></span>
        <span><span className="text-muted-foreground">Tool Calls</span> <span className="font-medium">{summary.totalToolCalls}</span></span>
        <span><span className="text-muted-foreground">Thinking</span> <span className="font-medium">{summary.thinkingBlocks}</span></span>
        <span>
          <span className="text-muted-foreground">Errors</span>{" "}
          <span className={`font-medium ${summary.errors.length > 0 ? "text-destructive" : ""}`}>{summary.errors.length}</span>
        </span>
      </div>

      {/* Tool breakdown - inline */}
      {toolEntries.length > 0 && (
        <div className="flex flex-wrap gap-1.5">
          {toolEntries.map(([tool, count]) => (
            <Badge key={tool} variant="secondary" className="text-xs font-normal">
              {TOOL_LABELS[tool as ToolName] ?? tool} <span className="font-mono ml-1">{count}</span>
            </Badge>
          ))}
        </div>
      )}

      {/* Files - inline */}
      {hasFiles && (
        <div className="space-y-1.5 text-xs">
          <FileBadgeRow label="Read" files={summary.filesRead} />
          <FileBadgeRow label="Modified" files={summary.filesModified} />
        </div>
      )}

      {/* Shell commands - compact */}
      {summary.shellCommands.length > 0 && (
        <div className="space-y-1 text-xs">
          <span className="text-muted-foreground">Shell ({summary.shellCommands.length})</span>
          <div className="flex flex-wrap gap-1">
            {summary.shellCommands.slice(0, MAX_INLINE_ITEMS).map((cmd, i) => (
              <Badge
                key={i}
                variant={cmd.exitCode === 0 ? "outline" : "destructive"}
                className="text-xs font-mono font-normal max-w-64 truncate"
              >
                {cmd.command}
              </Badge>
            ))}
            {/* Same overflow note as the file rows; previously extra commands were dropped silently. */}
            {hiddenShellCount > 0 && (
              <span className="text-muted-foreground">+{hiddenShellCount} more</span>
            )}
          </div>
        </div>
      )}

      {/* Errors - compact */}
      {summary.errors.length > 0 && (
        <div className="space-y-1 text-xs">
          <span className="text-destructive font-medium">Errors ({summary.errors.length})</span>
          {summary.errors.map((err, i) => (
            <div key={i} className="text-xs font-mono text-destructive truncate">
              {err}
            </div>
          ))}
        </div>
      )}
    </div>
  );
}
@@ -0,0 +1,72 @@
1
+ import Link from "next/link";
2
+ import { Badge } from "@/components/ui/badge";
3
+ import { Card, CardContent } from "@/components/ui/card";
4
+
5
+
6
/** Outcome of a single run as rendered by `RunResultCard`. */
interface RunResult {
  /** Final status; shown as the badge text and used to pick the badge variant. */
  status: "passed" | "failed";
  /** Error text shown (truncated) next to the badge when present. */
  error?: string;
  /** Run duration; rendered with one decimal and an "s" suffix. */
  duration: number;
  /** Optional observability counters; only `totalToolCalls` is displayed by the card. */
  o11y?: {
    totalToolCalls: number;
    thinkingBlocks: number;
    errors: string[];
  };
}

/** Props accepted by `RunResultCard`. */
interface RunResultCardProps {
  /** Run label; also the last path segment of the transcript link. */
  runName: string;
  /** Result to display, or `null` to render the dimmed "No result" card. */
  result: RunResult | null;
  /** Experiment name; first path segment of the transcript link. */
  experiment: string;
  /** Run timestamp; second path segment of the transcript link. */
  timestamp: string;
  /** Eval name; third path segment of the transcript link. */
  evalName: string;
}
24
+
25
/**
 * Card summarizing one run of an eval.
 *
 * Without a result, a dimmed, non-interactive "No result" card is rendered.
 * Otherwise the whole card links to the run's transcript page and shows the
 * run name, a status badge, the error text (if any), the tool-call count
 * (when observability data is present) and the duration.
 */
export function RunResultCard(props: RunResultCardProps) {
  const { runName, result, experiment, timestamp, evalName } = props;

  // Placeholder for runs that produced nothing to show.
  if (!result) {
    return (
      <Card className="opacity-50">
        <CardContent className="py-3 px-4">
          <span className="text-sm text-muted-foreground">{runName}: No result</span>
        </CardContent>
      </Card>
    );
  }

  // Each segment is encoded on its own so names containing "/" or spaces stay intact.
  const transcriptPath = [experiment, timestamp, evalName, runName]
    .map((segment) => encodeURIComponent(segment))
    .join("/");
  const statusVariant = result.status === "passed" ? "default" : "destructive";

  return (
    <Link href={`/transcript/${transcriptPath}`} className="block cursor-pointer">
      <Card className="transition-colors hover:bg-muted">
        <CardContent className="py-3 px-4 flex items-center justify-between">
          <div className="flex items-center gap-3">
            <span className="text-sm font-medium">{runName}</span>
            <Badge variant={statusVariant}>{result.status}</Badge>
            {result.error && (
              <span className="text-xs text-destructive truncate max-w-64">
                {result.error}
              </span>
            )}
          </div>
          <div className="flex items-center gap-4 text-xs text-muted-foreground">
            {result.o11y && (
              <span>{result.o11y.totalToolCalls} tool calls</span>
            )}
            <span>{result.duration.toFixed(1)}s</span>
          </div>
        </CardContent>
      </Card>
    </Link>
  );
}
@@ -0,0 +1,60 @@
1
+ "use client";
2
+
3
+ import { useState } from "react";
4
+ import Link from "next/link";
5
+
6
/** Number of children shown initially, and added per click, when no `limit` prop is given. */
const DEFAULT_LIMIT = 50;

/** Props accepted by `ShowMore`. */
interface ShowMoreProps {
  /** Items to render; the component slices this array to limit what is visible. */
  children: React.ReactNode[];
  /** Initial number of visible items and the increment applied by each "Show more" click. */
  limit?: number;
  /** Total count of items (when server limits the fetch). Shows a Link instead of client-side toggle. */
  total?: number;
  /** URL to navigate to when showing all items (server-driven mode). */
  showAllHref?: string;
  /** Class name applied to the wrapping `div`. */
  className?: string;
}
17
+
18
/**
 * Renders a list of children with a "Show more" control.
 *
 * Two modes:
 * - Server-driven: `total` exceeds the number of children passed in. All
 *   children are rendered and, when `showAllHref` is set, a link to that URL
 *   is shown so the full list can be loaded.
 * - Client-driven: every item was passed in. Only the first `limit` children
 *   are rendered, and each click on the button reveals `limit` more.
 *
 * @param children - Items to render.
 * @param limit - Initial visible count and per-click increment (default `DEFAULT_LIMIT`).
 * @param total - Total number of items when the caller passed only a subset.
 * @param showAllHref - Link target used in server-driven mode.
 * @param className - Class name for the wrapping `div`.
 */
export function ShowMore({
  children,
  limit = DEFAULT_LIMIT,
  total,
  showAllHref,
  className,
}: ShowMoreProps) {
  const [visibleCount, setVisibleCount] = useState(limit);

  // Server-driven: fewer children than total, link to load all from server
  const serverLimited = total !== undefined && total > children.length;

  // Client-driven: all children passed, show in increments
  const clientLimited = !serverLimited && children.length > visibleCount;

  const visible = clientLimited ? children.slice(0, visibleCount) : children;

  const totalCount = total ?? children.length;
  const shownCount = visible.length;
  const remainingCount = totalCount - shownCount;

  return (
    <div className={className}>
      {visible}
      {serverLimited && showAllHref && (
        <Link
          href={showAllHref}
          className="block w-full py-2 text-center text-sm text-muted-foreground hover:text-foreground transition-colors cursor-pointer"
        >
          Show more ({remainingCount} remaining)
        </Link>
      )}
      {clientLimited && (
        // Explicit type: a bare <button> defaults to "submit" and would submit
        // an enclosing form when the list is rendered inside one.
        <button
          type="button"
          onClick={() => setVisibleCount((c) => c + limit)}
          className="w-full py-2 text-sm text-muted-foreground hover:text-foreground transition-colors cursor-pointer"
        >
          Show more ({remainingCount} remaining)
        </button>
      )}
    </div>
  );
}
@@ -0,0 +1,46 @@
1
+ import Link from "next/link";
2
+ import { Card, CardContent } from "@/components/ui/card";
3
+ import { TranscriptViewer } from "@/components/TranscriptViewer";
4
+ import type { Transcript } from "@/lib/types";
5
+
6
/** Props accepted by `TranscriptPage`. */
interface TranscriptPageProps {
  /** Experiment name; shown in the breadcrumb and used in the experiment link. */
  experiment: string;
  /** Run timestamp; used in the link back to the experiment run. */
  timestamp: string;
  /** Eval name; shown in the breadcrumb. */
  evalName: string;
  /** Run name; shown as the last breadcrumb entry. */
  run: string;
  /** Transcript handed to `TranscriptViewer`. */
  transcript: Transcript;
}
13
+
14
/**
 * Page body for a single run transcript.
 *
 * Renders a breadcrumb (Experiments / experiment / eval / run, where the first
 * two entries are links), a "Transcript" heading and the transcript itself via
 * `TranscriptViewer`.
 */
export function TranscriptPage(props: TranscriptPageProps) {
  const { experiment, timestamp, evalName, run, transcript } = props;

  // Link back to the experiment run this transcript belongs to.
  const experimentHref = `/experiments/${encodeURIComponent(experiment)}/${encodeURIComponent(timestamp)}`;

  return (
    <div className="space-y-4">
      {/* Breadcrumb */}
      <div className="flex items-center gap-2 text-sm text-muted-foreground">
        <Link href="/experiments" className="cursor-pointer hover:underline underline-offset-4">
          Experiments
        </Link>
        <span>/</span>
        <Link href={experimentHref} className="cursor-pointer hover:underline underline-offset-4">
          {experiment}
        </Link>
        <span>/</span>
        <span>{evalName}</span>
        <span>/</span>
        <span>{run}</span>
      </div>

      <h1 className="text-2xl font-bold tracking-tight">Transcript</h1>

      <TranscriptViewer transcript={transcript} />
    </div>
  );
}