@vercel/agent-eval-playground 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -0
- package/app/compare/page.tsx +40 -0
- package/app/evals/[name]/page.tsx +22 -0
- package/app/evals/page.tsx +18 -0
- package/app/experiments/[name]/[timestamp]/page.tsx +23 -0
- package/app/experiments/page.tsx +28 -0
- package/app/globals.css +126 -0
- package/app/layout.tsx +102 -0
- package/app/page.tsx +179 -0
- package/app/transcript/[experiment]/[timestamp]/[evalName]/[run]/page.tsx +43 -0
- package/bin.mjs +86 -0
- package/components/ComparePage.tsx +312 -0
- package/components/EvalDetail.tsx +114 -0
- package/components/EvalsPage.tsx +80 -0
- package/components/ExperimentDetail.tsx +162 -0
- package/components/ExperimentList.tsx +103 -0
- package/components/O11ySummary.tsx +114 -0
- package/components/RunResultCard.tsx +72 -0
- package/components/ShowMore.tsx +60 -0
- package/components/TranscriptPage.tsx +46 -0
- package/components/TranscriptViewer.tsx +201 -0
- package/components/ui/alert-dialog.tsx +184 -0
- package/components/ui/badge.tsx +45 -0
- package/components/ui/button.tsx +60 -0
- package/components/ui/card.tsx +94 -0
- package/components/ui/collapsible.tsx +34 -0
- package/components/ui/combobox.tsx +297 -0
- package/components/ui/dropdown-menu.tsx +269 -0
- package/components/ui/field.tsx +227 -0
- package/components/ui/input-group.tsx +147 -0
- package/components/ui/input.tsx +19 -0
- package/components/ui/label.tsx +24 -0
- package/components/ui/progress.tsx +31 -0
- package/components/ui/scroll-area.tsx +58 -0
- package/components/ui/select.tsx +191 -0
- package/components/ui/separator.tsx +28 -0
- package/components/ui/table.tsx +116 -0
- package/components/ui/tabs.tsx +91 -0
- package/components/ui/textarea.tsx +18 -0
- package/components/ui/tooltip.tsx +57 -0
- package/components.json +25 -0
- package/lib/data.ts +297 -0
- package/lib/types.ts +113 -0
- package/lib/utils.ts +6 -0
- package/next.config.ts +5 -0
- package/package.json +51 -0
- package/postcss.config.mjs +7 -0
- package/public/vercel.svg +1 -0
- package/tsconfig.json +42 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"use client"
|
|
2
|
+
|
|
3
|
+
import * as React from "react"
|
|
4
|
+
import { Separator as SeparatorPrimitive } from "radix-ui"
|
|
5
|
+
|
|
6
|
+
import { cn } from "@/lib/utils"
|
|
7
|
+
|
|
8
|
+
/**
 * Wrapper around the Radix separator root that applies the shared border
 * styling and tags the element with `data-slot="separator"`.
 *
 * `orientation` defaults to "horizontal" and `decorative` to true; all
 * remaining props are forwarded to the primitive.
 */
function Separator(
  props: React.ComponentProps<typeof SeparatorPrimitive.Root>
) {
  const {
    className,
    orientation = "horizontal",
    decorative = true,
    ...rest
  } = props
  const classes = cn(
    "bg-border shrink-0 data-horizontal:h-px data-horizontal:w-full data-vertical:w-px data-vertical:self-stretch",
    className
  )

  return (
    <SeparatorPrimitive.Root
      data-slot="separator"
      decorative={decorative}
      orientation={orientation}
      className={classes}
      {...rest}
    />
  )
}
|
|
27
|
+
|
|
28
|
+
export { Separator }
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"use client"
|
|
2
|
+
|
|
3
|
+
import * as React from "react"
|
|
4
|
+
|
|
5
|
+
import { cn } from "@/lib/utils"
|
|
6
|
+
|
|
7
|
+
/**
 * Renders a `<table>` inside a horizontally scrollable container.
 * `className` is merged into the table's classes; every other prop is
 * forwarded to the `<table>` element (not to the wrapping `<div>`).
 */
function Table(props: React.ComponentProps<"table">) {
  const { className, ...rest } = props
  const tableClasses = cn("w-full caption-bottom text-sm", className)

  return (
    <div
      data-slot="table-container"
      className="relative w-full overflow-x-auto"
    >
      <table data-slot="table" className={tableClasses} {...rest} />
    </div>
  )
}
|
|
21
|
+
|
|
22
|
+
/**
 * `<thead>` wrapper tagged with `data-slot="table-header"`; adds a bottom
 * border to the rows it contains and forwards the remaining props.
 */
function TableHeader(props: React.ComponentProps<"thead">) {
  const { className, ...rest } = props
  const classes = cn("[&_tr]:border-b", className)

  return <thead data-slot="table-header" className={classes} {...rest} />
}
|
|
31
|
+
|
|
32
|
+
/**
 * `<tbody>` wrapper tagged with `data-slot="table-body"`; removes the border
 * from its last row and forwards the remaining props.
 */
function TableBody(props: React.ComponentProps<"tbody">) {
  const { className, ...rest } = props
  const classes = cn("[&_tr:last-child]:border-0", className)

  return <tbody data-slot="table-body" className={classes} {...rest} />
}
|
|
41
|
+
|
|
42
|
+
/**
 * `<tfoot>` wrapper tagged with `data-slot="table-footer"`; applies the muted
 * footer styling and forwards the remaining props.
 */
function TableFooter(props: React.ComponentProps<"tfoot">) {
  const { className, ...rest } = props
  const classes = cn(
    "bg-muted/50 border-t font-medium [&>tr]:last:border-b-0",
    className
  )

  return <tfoot data-slot="table-footer" className={classes} {...rest} />
}
|
|
54
|
+
|
|
55
|
+
/**
 * `<tr>` wrapper tagged with `data-slot="table-row"`; applies hover and
 * `data-state="selected"` background styling and forwards the remaining props.
 */
function TableRow(props: React.ComponentProps<"tr">) {
  const { className, ...rest } = props
  const classes = cn(
    "hover:bg-muted/50 data-[state=selected]:bg-muted border-b transition-colors",
    className
  )

  return <tr data-slot="table-row" className={classes} {...rest} />
}
|
|
67
|
+
|
|
68
|
+
/**
 * `<th>` wrapper tagged with `data-slot="table-head"`; applies the header
 * cell styling and forwards the remaining props.
 */
function TableHead(props: React.ComponentProps<"th">) {
  const { className, ...rest } = props
  const classes = cn(
    "text-foreground h-10 px-2 text-left align-middle font-medium whitespace-nowrap [&:has([role=checkbox])]:pr-0 [&>[role=checkbox]]:translate-y-[2px]",
    className
  )

  return <th data-slot="table-head" className={classes} {...rest} />
}
|
|
80
|
+
|
|
81
|
+
/**
 * `<td>` wrapper tagged with `data-slot="table-cell"`; applies the body cell
 * styling and forwards the remaining props.
 */
function TableCell(props: React.ComponentProps<"td">) {
  const { className, ...rest } = props
  const classes = cn(
    "p-2 align-middle whitespace-nowrap [&:has([role=checkbox])]:pr-0 [&>[role=checkbox]]:translate-y-[2px]",
    className
  )

  return <td data-slot="table-cell" className={classes} {...rest} />
}
|
|
93
|
+
|
|
94
|
+
/**
 * `<caption>` wrapper tagged with `data-slot="table-caption"`; applies muted
 * caption styling and forwards the remaining props.
 */
function TableCaption(props: React.ComponentProps<"caption">) {
  const { className, ...rest } = props
  const classes = cn("text-muted-foreground mt-4 text-sm", className)

  return <caption data-slot="table-caption" className={classes} {...rest} />
}
|
|
106
|
+
|
|
107
|
+
export {
|
|
108
|
+
Table,
|
|
109
|
+
TableHeader,
|
|
110
|
+
TableBody,
|
|
111
|
+
TableFooter,
|
|
112
|
+
TableHead,
|
|
113
|
+
TableRow,
|
|
114
|
+
TableCell,
|
|
115
|
+
TableCaption,
|
|
116
|
+
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"use client"
|
|
2
|
+
|
|
3
|
+
import * as React from "react"
|
|
4
|
+
import { cva, type VariantProps } from "class-variance-authority"
|
|
5
|
+
import { Tabs as TabsPrimitive } from "radix-ui"
|
|
6
|
+
|
|
7
|
+
import { cn } from "@/lib/utils"
|
|
8
|
+
|
|
9
|
+
/**
 * Wrapper around the Radix tabs root.
 *
 * `orientation` defaults to "horizontal" and is passed both as the primitive's
 * `orientation` prop and as a `data-orientation` attribute, which the
 * `group/tabs` class selectors in this file key off. Remaining props are
 * forwarded to the primitive.
 */
function Tabs(props: React.ComponentProps<typeof TabsPrimitive.Root>) {
  const { className, orientation = "horizontal", ...rest } = props
  const classes = cn(
    "group/tabs flex gap-2 data-[orientation=horizontal]:flex-col",
    className
  )

  return (
    <TabsPrimitive.Root
      data-slot="tabs"
      data-orientation={orientation}
      orientation={orientation}
      className={classes}
      {...rest}
    />
  )
}
|
|
27
|
+
|
|
28
|
+
/**
 * Class-name builder for the tabs list.
 *
 * Variants:
 * - `default` — list rendered on a muted background.
 * - `line`    — transparent background with a small gap between triggers.
 *
 * When no variant is given, `default` is used.
 */
const tabsListVariants = cva(
  "rounded-lg p-[3px] group-data-[orientation=horizontal]/tabs:h-9 data-[variant=line]:rounded-none group/tabs-list text-muted-foreground inline-flex w-fit items-center justify-center group-data-[orientation=vertical]/tabs:h-fit group-data-[orientation=vertical]/tabs:flex-col",
  {
    variants: {
      variant: {
        default: "bg-muted",
        line: "gap-1 bg-transparent",
      },
    },
    defaultVariants: {
      variant: "default",
    },
  }
)
|
|
42
|
+
|
|
43
|
+
/**
 * Wrapper around the Radix tabs list.
 *
 * `variant` defaults to "default"; it selects the classes produced by
 * `tabsListVariants` and is also exposed as a `data-variant` attribute.
 * Remaining props are forwarded to the primitive.
 */
function TabsList(
  props: React.ComponentProps<typeof TabsPrimitive.List> &
    VariantProps<typeof tabsListVariants>
) {
  const { className, variant = "default", ...rest } = props
  const classes = cn(tabsListVariants({ variant }), className)

  return (
    <TabsPrimitive.List
      data-slot="tabs-list"
      data-variant={variant}
      className={classes}
      {...rest}
    />
  )
}
|
|
58
|
+
|
|
59
|
+
/**
 * Wrapper around the Radix tabs trigger, tagged with
 * `data-slot="tabs-trigger"`.
 *
 * The class list is assembled from four groups (base layout and focus
 * styling, `line` variant overrides, active-state colors, and the `::after`
 * indicator) followed by the caller's `className`. Remaining props are
 * forwarded to the primitive.
 */
function TabsTrigger(
  props: React.ComponentProps<typeof TabsPrimitive.Trigger>
) {
  const { className, ...rest } = props
  const classes = cn(
    "cursor-pointer focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:outline-ring text-foreground/60 hover:text-foreground dark:text-muted-foreground dark:hover:text-foreground relative inline-flex h-[calc(100%-1px)] flex-1 items-center justify-center gap-1.5 rounded-md border border-transparent px-2 py-1 text-sm font-medium whitespace-nowrap transition-all group-data-[orientation=vertical]/tabs:w-full group-data-[orientation=vertical]/tabs:justify-start focus-visible:ring-[3px] focus-visible:outline-1 disabled:pointer-events-none disabled:opacity-50 group-data-[variant=default]/tabs-list:data-[state=active]:shadow-sm group-data-[variant=line]/tabs-list:data-[state=active]:shadow-none [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
    "group-data-[variant=line]/tabs-list:bg-transparent group-data-[variant=line]/tabs-list:data-[state=active]:bg-transparent dark:group-data-[variant=line]/tabs-list:data-[state=active]:border-transparent dark:group-data-[variant=line]/tabs-list:data-[state=active]:bg-transparent",
    "data-[state=active]:bg-background dark:data-[state=active]:text-foreground dark:data-[state=active]:border-input dark:data-[state=active]:bg-input/30 data-[state=active]:text-foreground",
    "after:bg-foreground after:absolute after:opacity-0 after:transition-opacity group-data-[orientation=horizontal]/tabs:after:inset-x-0 group-data-[orientation=horizontal]/tabs:after:bottom-[-5px] group-data-[orientation=horizontal]/tabs:after:h-0.5 group-data-[orientation=vertical]/tabs:after:inset-y-0 group-data-[orientation=vertical]/tabs:after:-right-1 group-data-[orientation=vertical]/tabs:after:w-0.5 group-data-[variant=line]/tabs-list:data-[state=active]:after:opacity-100",
    className
  )

  return (
    <TabsPrimitive.Trigger
      data-slot="tabs-trigger"
      className={classes}
      {...rest}
    />
  )
}
|
|
77
|
+
|
|
78
|
+
/**
 * Wrapper around the Radix tabs content panel, tagged with
 * `data-slot="tabs-content"`; forwards the remaining props.
 */
function TabsContent(
  props: React.ComponentProps<typeof TabsPrimitive.Content>
) {
  const { className, ...rest } = props
  const classes = cn("flex-1 outline-none", className)

  return (
    <TabsPrimitive.Content
      data-slot="tabs-content"
      className={classes}
      {...rest}
    />
  )
}
|
|
90
|
+
|
|
91
|
+
export { Tabs, TabsList, TabsTrigger, TabsContent, tabsListVariants }
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import * as React from "react"
|
|
2
|
+
|
|
3
|
+
import { cn } from "@/lib/utils"
|
|
4
|
+
|
|
5
|
+
/**
 * Styled `<textarea>` tagged with `data-slot="textarea"`.
 * `className` is merged after the default classes; every other prop is
 * forwarded to the element.
 */
function Textarea(props: React.ComponentProps<"textarea">) {
  const { className, ...rest } = props
  const classes = cn(
    "border-input bg-input/20 dark:bg-input/30 focus-visible:border-ring focus-visible:ring-ring/30 aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive dark:aria-invalid:border-destructive/50 resize-none rounded-md border px-2 py-2 text-sm transition-colors focus-visible:ring-2 aria-invalid:ring-2 md:text-xs/relaxed placeholder:text-muted-foreground flex field-sizing-content min-h-16 w-full outline-none disabled:cursor-not-allowed disabled:opacity-50",
    className
  )

  return <textarea data-slot="textarea" className={classes} {...rest} />
}
|
|
17
|
+
|
|
18
|
+
export { Textarea }
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"use client"
|
|
2
|
+
|
|
3
|
+
import * as React from "react"
|
|
4
|
+
import { Tooltip as TooltipPrimitive } from "radix-ui"
|
|
5
|
+
|
|
6
|
+
import { cn } from "@/lib/utils"
|
|
7
|
+
|
|
8
|
+
/**
 * Wrapper around the Radix tooltip provider.
 * `delayDuration` defaults to 0 here; remaining props are forwarded to the
 * primitive.
 */
function TooltipProvider(
  props: React.ComponentProps<typeof TooltipPrimitive.Provider>
) {
  const { delayDuration = 0, ...rest } = props

  return (
    <TooltipPrimitive.Provider
      data-slot="tooltip-provider"
      delayDuration={delayDuration}
      {...rest}
    />
  )
}
|
|
20
|
+
|
|
21
|
+
/**
 * Wrapper around the Radix tooltip root that adds `data-slot="tooltip"` and
 * forwards every prop unchanged.
 */
function Tooltip(props: React.ComponentProps<typeof TooltipPrimitive.Root>) {
  return (
    <TooltipPrimitive.Root
      data-slot="tooltip"
      {...props}
    />
  )
}
|
|
26
|
+
|
|
27
|
+
/**
 * Wrapper around the Radix tooltip trigger that adds
 * `data-slot="tooltip-trigger"` and forwards every prop unchanged.
 */
function TooltipTrigger(
  props: React.ComponentProps<typeof TooltipPrimitive.Trigger>
) {
  return (
    <TooltipPrimitive.Trigger
      data-slot="tooltip-trigger"
      {...props}
    />
  )
}
|
|
32
|
+
|
|
33
|
+
/**
 * Tooltip body rendered through the Radix tooltip portal.
 *
 * `sideOffset` defaults to 0. The caller's children are rendered first,
 * followed by the arrow element. Remaining props are forwarded to the
 * content primitive.
 */
function TooltipContent(
  props: React.ComponentProps<typeof TooltipPrimitive.Content>
) {
  const { className, sideOffset = 0, children, ...rest } = props
  const contentClasses = cn(
    "bg-foreground text-background animate-in fade-in-0 zoom-in-95 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 w-fit origin-(--radix-tooltip-content-transform-origin) rounded-md px-3 py-1.5 text-xs text-balance",
    className
  )

  return (
    <TooltipPrimitive.Portal>
      <TooltipPrimitive.Content
        data-slot="tooltip-content"
        sideOffset={sideOffset}
        className={contentClasses}
        {...rest}
      >
        {children}
        <TooltipPrimitive.Arrow className="bg-foreground fill-foreground z-50 size-2.5 translate-y-[calc(-50%_-_2px)] rotate-45 rounded-[2px]" />
      </TooltipPrimitive.Content>
    </TooltipPrimitive.Portal>
  )
}
|
|
56
|
+
|
|
57
|
+
export { Tooltip, TooltipTrigger, TooltipContent, TooltipProvider }
|
package/components.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://ui.shadcn.com/schema.json",
|
|
3
|
+
"style": "radix-mira",
|
|
4
|
+
"rsc": true,
|
|
5
|
+
"tsx": true,
|
|
6
|
+
"tailwind": {
|
|
7
|
+
"config": "",
|
|
8
|
+
"css": "app/globals.css",
|
|
9
|
+
"baseColor": "zinc",
|
|
10
|
+
"cssVariables": true,
|
|
11
|
+
"prefix": ""
|
|
12
|
+
},
|
|
13
|
+
"iconLibrary": "remixicon",
|
|
14
|
+
"rtl": false,
|
|
15
|
+
"aliases": {
|
|
16
|
+
"components": "@/components",
|
|
17
|
+
"utils": "@/lib/utils",
|
|
18
|
+
"ui": "@/components/ui",
|
|
19
|
+
"lib": "@/lib",
|
|
20
|
+
"hooks": "@/hooks"
|
|
21
|
+
},
|
|
22
|
+
"menuColor": "inverted",
|
|
23
|
+
"menuAccent": "subtle",
|
|
24
|
+
"registries": {}
|
|
25
|
+
}
|
package/lib/data.ts
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Server-side data access for the playground.
|
|
3
|
+
* Reads JSON files from the results/ and evals/ directories.
|
|
4
|
+
* Directory paths are provided via RESULTS_DIR and EVALS_DIR env vars.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { existsSync, readdirSync, readFileSync } from "fs";
import { isAbsolute, join, relative, resolve, sep } from "path";
|
|
9
|
+
|
|
10
|
+
/**
 * Absolute path of the directory that holds experiment results.
 *
 * Uses the RESULTS_DIR environment variable when it is set to a non-empty
 * value and falls back to "./results" otherwise; the chosen value is passed
 * through `path.resolve`.
 */
function getResultsDir(): string {
  const configured = process.env.RESULTS_DIR;
  const target = configured ? configured : "./results";
  return resolve(target);
}
|
|
13
|
+
|
|
14
|
+
/**
 * Absolute path of the directory that holds eval definitions.
 *
 * Uses the EVALS_DIR environment variable when it is set to a non-empty
 * value and falls back to "./evals" otherwise; the chosen value is passed
 * through `path.resolve`.
 */
function getEvalsDir(): string {
  const configured = process.env.EVALS_DIR;
  const target = configured ? configured : "./evals";
  return resolve(target);
}
|
|
17
|
+
|
|
18
|
+
/**
 * List experiments found in the results directory.
 *
 * Every sub-directory of the results directory is treated as one experiment,
 * and every sub-directory of an experiment as one timestamped run. Timestamps
 * are sorted in descending string order, so index 0 is the latest. For the
 * latest timestamp, the `totalRuns` / `passedRuns` counters of each
 * `<eval>/summary.json` are summed and turned into a pass rate (0–100).
 *
 * Directories that cannot be read and summary files that are invalid are
 * skipped rather than aborting the whole listing.
 *
 * @param limit Optional cap on how many experiments get the expensive
 *   per-item reads. A falsy value (omitted or 0) processes every experiment.
 * @returns `items` for the processed experiments and `total`, the number of
 *   experiment directories found before `limit` was applied.
 */
export function listExperiments(limit?: number) {
  const resultsDir = getResultsDir();

  if (!existsSync(resultsDir)) {
    return { items: [], total: 0 };
  }

  // Names of the sub-directories of `dir`; an unreadable directory counts as empty.
  const subdirNames = (dir: string): string[] => {
    try {
      return readdirSync(dir, { withFileTypes: true })
        .filter((e) => e.isDirectory())
        .map((e) => e.name);
    } catch {
      return [];
    }
  };

  // Summary files are untyped JSON: only finite numbers are counted, so a
  // string or null value cannot corrupt the running totals.
  const toCount = (value: unknown): number =>
    typeof value === "number" && Number.isFinite(value) ? value : 0;

  const entries = subdirNames(resultsDir);
  const total = entries.length;
  const toProcess = limit ? entries.slice(0, limit) : entries;

  const items = toProcess.map((name) => {
    const expDir = join(resultsDir, name);
    const timestamps = subdirNames(expDir).sort().reverse();

    let latestPassRate: number | undefined;
    let latestTotalRuns = 0;
    let latestPassedRuns = 0;

    if (timestamps.length > 0) {
      const latestDir = join(expDir, timestamps[0]);

      for (const evalName of subdirNames(latestDir)) {
        const summaryPath = join(latestDir, evalName, "summary.json");
        if (!existsSync(summaryPath)) continue;

        try {
          const summary: unknown = JSON.parse(
            readFileSync(summaryPath, "utf-8")
          );
          if (summary !== null && typeof summary === "object") {
            const { totalRuns, passedRuns } = summary as {
              totalRuns?: unknown;
              passedRuns?: unknown;
            };
            latestTotalRuns += toCount(totalRuns);
            latestPassedRuns += toCount(passedRuns);
          }
        } catch {
          // Skip invalid summary files
        }
      }

      // Leave the pass rate undefined when nothing was run (avoids 0/0).
      if (latestTotalRuns > 0) {
        latestPassRate = (latestPassedRuns / latestTotalRuns) * 100;
      }
    }

    return {
      name,
      timestamps,
      latestTimestamp: timestamps[0] ?? null,
      latestPassRate,
      latestTotalRuns,
      latestPassedRuns,
    };
  });

  return { items, total };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Get the timestamps recorded for a specific experiment.
 *
 * @param name Experiment directory name. It must resolve to a location
 *   strictly inside the results directory.
 * @returns The experiment name, its timestamp directories sorted in
 *   descending string order, and the first of them as `latestTimestamp`
 *   (`null` when there are none). Returns `null` when the experiment does not
 *   exist, cannot be read as a directory, or `name` points outside the
 *   results directory.
 */
export function getExperiment(name: string) {
  const resultsDir = getResultsDir();
  const expDir = join(resultsDir, name);

  // NOTE(review): `name` presumably originates from a route parameter —
  // confirm. Refuse anything that escapes the results directory (e.g. "..").
  const rel = relative(resultsDir, expDir);
  if (rel === "" || rel.split(sep)[0] === ".." || isAbsolute(rel)) {
    return null;
  }

  if (!existsSync(expDir)) {
    return null;
  }

  let timestamps: string[];
  try {
    timestamps = readdirSync(expDir, { withFileTypes: true })
      .filter((e) => e.isDirectory())
      .map((e) => e.name)
      .sort()
      .reverse();
  } catch {
    // The path exists but is not a readable directory.
    return null;
  }

  return { name, timestamps, latestTimestamp: timestamps[0] ?? null };
}
|
|
98
|
+
|
|
99
|
+
/**
 * Get the full detail of one experiment run (experiment + timestamp).
 *
 * For every eval directory under `<results>/<name>/<timestamp>` this reads
 * `summary.json` (falling back to zeroed defaults for a missing, invalid or
 * partial file) and the `result.json` of every `run-*` sub-directory.
 *
 * @param name Experiment directory name.
 * @param timestamp Timestamp directory name inside the experiment.
 * @returns `{ name, timestamp, evals }`, or `null` when the run directory
 *   does not exist or the path points outside the results directory. Each
 *   eval carries `passRate` as a number (string values are parsed with
 *   `parseFloat`) and its runs, whose `result` is `null` when `result.json`
 *   is missing or invalid.
 */
export function getExperimentDetail(name: string, timestamp: string) {
  const resultsDir = getResultsDir();
  const runDir = join(resultsDir, name, timestamp);

  // NOTE(review): the path segments presumably originate from route
  // parameters — confirm. Refuse anything that escapes the results directory.
  const rel = relative(resultsDir, runDir);
  if (rel === "" || rel.split(sep)[0] === ".." || isAbsolute(rel)) {
    return null;
  }

  if (!existsSync(runDir)) {
    return null;
  }

  // Names of the sub-directories of `dir`; an unreadable directory counts as empty.
  const subdirNames = (dir: string): string[] => {
    try {
      return readdirSync(dir, { withFileTypes: true })
        .filter((e) => e.isDirectory())
        .map((e) => e.name);
    } catch {
      return [];
    }
  };

  const evals = subdirNames(runDir).map((evalName) => {
    const evalDir = join(runDir, evalName);
    const summaryPath = join(evalDir, "summary.json");

    let summary: {
      totalRuns: number;
      passedRuns: number;
      passRate: string | number;
      meanDuration: number;
    } = {
      totalRuns: 0,
      passedRuns: 0,
      passRate: "0%",
      meanDuration: 0,
    };
    if (existsSync(summaryPath)) {
      try {
        const parsed: unknown = JSON.parse(readFileSync(summaryPath, "utf-8"));
        // Merge over the defaults so a partial summary keeps sane values, and
        // ignore non-object JSON (e.g. `null`), which would otherwise crash
        // the property reads below.
        if (
          parsed !== null &&
          typeof parsed === "object" &&
          !Array.isArray(parsed)
        ) {
          summary = { ...summary, ...(parsed as Partial<typeof summary>) };
        }
      } catch {
        // Use defaults
      }
    }

    // List run directories
    const runDirs = subdirNames(evalDir)
      .filter((dirName) => dirName.startsWith("run-"))
      .sort();

    // Read each run's result.json
    const runs = runDirs.map((runDirName) => {
      const resultPath = join(evalDir, runDirName, "result.json");
      let result = null;
      if (existsSync(resultPath)) {
        try {
          result = JSON.parse(readFileSync(resultPath, "utf-8"));
        } catch {
          // Skip
        }
      }
      return { name: runDirName, result };
    });

    return {
      name: evalName,
      totalRuns: summary.totalRuns,
      passedRuns: summary.passedRuns,
      passRate:
        typeof summary.passRate === "string"
          ? parseFloat(summary.passRate)
          : summary.passRate,
      meanDuration: summary.meanDuration,
      runs,
    };
  });

  return { name, timestamp, evals };
}
|
|
164
|
+
|
|
165
|
+
/**
 * Get the result of a specific run.
 *
 * Reads `<results>/<experiment>/<timestamp>/<evalName>/<run>/result.json`.
 *
 * @returns `{ result }` holding the parsed JSON, or `null` when the file is
 *   missing, is not valid JSON, or the path points outside the results
 *   directory.
 */
export function getRunResult(
  experiment: string,
  timestamp: string,
  evalName: string,
  run: string
) {
  const resultsDir = getResultsDir();
  const resultPath = join(
    resultsDir,
    experiment,
    timestamp,
    evalName,
    run,
    "result.json"
  );

  // NOTE(review): the path segments presumably originate from route
  // parameters — confirm. Refuse anything that escapes the results directory.
  const rel = relative(resultsDir, resultPath);
  if (rel.split(sep)[0] === ".." || isAbsolute(rel)) {
    return null;
  }

  if (!existsSync(resultPath)) {
    return null;
  }

  try {
    return { result: JSON.parse(readFileSync(resultPath, "utf-8")) };
  } catch {
    return null;
  }
}
|
|
191
|
+
|
|
192
|
+
/**
 * Get the parsed transcript of a specific run.
 *
 * Reads `<results>/<experiment>/<timestamp>/<evalName>/<run>/transcript.json`.
 *
 * @returns The parsed JSON content, or `null` when the file is missing, is
 *   not valid JSON, or the path points outside the results directory.
 */
export function getTranscript(
  experiment: string,
  timestamp: string,
  evalName: string,
  run: string
) {
  const resultsDir = getResultsDir();
  const transcriptPath = join(
    resultsDir,
    experiment,
    timestamp,
    evalName,
    run,
    "transcript.json"
  );

  // NOTE(review): the path segments presumably originate from route
  // parameters — confirm. Refuse anything that escapes the results directory.
  const rel = relative(resultsDir, transcriptPath);
  if (rel.split(sep)[0] === ".." || isAbsolute(rel)) {
    return null;
  }

  if (!existsSync(transcriptPath)) {
    return null;
  }

  try {
    return JSON.parse(readFileSync(transcriptPath, "utf-8"));
  } catch {
    return null;
  }
}
|
|
218
|
+
|
|
219
|
+
/**
 * List evals found in the evals directory.
 *
 * Every sub-directory of the evals directory is one eval. For each processed
 * eval this returns the content of its `PROMPT.md` (empty string when missing
 * or unreadable) and the names of the files directly inside it (not
 * recursive). An eval directory that cannot be read yields an empty file list
 * instead of aborting the whole listing.
 *
 * @param limit Optional cap on how many evals get the per-item reads. A
 *   falsy value (omitted or 0) processes every eval.
 * @returns `items` for the processed evals and `total`, the number of eval
 *   directories found before `limit` was applied.
 */
export function listEvals(limit?: number) {
  const evalsDir = getEvalsDir();

  if (!existsSync(evalsDir)) {
    return { items: [], total: 0 };
  }

  const entries = readdirSync(evalsDir, { withFileTypes: true })
    .filter((e) => e.isDirectory())
    .map((e) => e.name);

  const total = entries.length;
  const toProcess = limit ? entries.slice(0, limit) : entries;

  const items = toProcess.map((name) => {
    const evalDir = join(evalsDir, name);
    const promptPath = join(evalDir, "PROMPT.md");

    let prompt = "";
    if (existsSync(promptPath)) {
      try {
        prompt = readFileSync(promptPath, "utf-8");
      } catch {
        // Leave the prompt empty when PROMPT.md cannot be read
      }
    }

    let files: string[] = [];
    try {
      files = readdirSync(evalDir, { withFileTypes: true })
        .filter((e) => e.isFile())
        .map((e) => e.name);
    } catch {
      // Leave the file list empty when the directory cannot be read
    }

    return { name, prompt, files };
  });

  return { items, total };
}
|
|
251
|
+
|
|
252
|
+
/**
 * Get the detail of a specific eval.
 *
 * @param name Eval directory name. It must resolve to a location strictly
 *   inside the evals directory.
 * @returns `null` when the eval does not exist or `name` points outside the
 *   evals directory. Otherwise:
 *   - `prompt`: content of `PROMPT.md`, or "" when missing or unreadable;
 *   - `files`: "/"-separated relative paths of every file below the eval
 *     directory, skipping `node_modules` and `.git`;
 *   - `fileContents`: content of the key files (`PROMPT.md`, `EVAL.ts`,
 *     `EVAL.tsx`, `package.json`) that exist and are readable.
 */
export function getEvalDetail(name: string) {
  const evalsDir = getEvalsDir();
  const evalDir = join(evalsDir, name);

  // NOTE(review): `name` presumably originates from a route parameter —
  // confirm. Refuse anything that escapes the evals directory (e.g. "..").
  const rel = relative(evalsDir, evalDir);
  if (rel === "" || rel.split(sep)[0] === ".." || isAbsolute(rel)) {
    return null;
  }

  if (!existsSync(evalDir)) {
    return null;
  }

  // Recursively list files
  const files: string[] = [];
  function walk(dir: string, prefix: string) {
    let entries;
    try {
      entries = readdirSync(dir, { withFileTypes: true });
    } catch {
      // Skip directories that cannot be read
      return;
    }
    for (const entry of entries) {
      if (entry.name === "node_modules" || entry.name === ".git") continue;
      const relativePath = prefix ? `${prefix}/${entry.name}` : entry.name;
      if (entry.isDirectory()) {
        walk(join(dir, entry.name), relativePath);
      } else {
        files.push(relativePath);
      }
    }
  }
  walk(evalDir, "");

  // Read file contents for key files
  const fileContents: Record<string, string> = {};
  const keyFiles = ["PROMPT.md", "EVAL.ts", "EVAL.tsx", "package.json"];
  for (const file of keyFiles) {
    const filePath = join(evalDir, file);
    if (existsSync(filePath)) {
      try {
        fileContents[file] = readFileSync(filePath, "utf-8");
      } catch {
        // Skip unreadable files
      }
    }
  }

  // PROMPT.md is one of the key files, so reuse its content instead of
  // reading the file a second time.
  const prompt = fileContents["PROMPT.md"] ?? "";

  return { name, prompt, files, fileContents };
}
|