@vercel/agent-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/README.md +370 -0
  2. package/dist/cli.d.ts +6 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +166 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/index.d.ts +21 -0
  7. package/dist/index.d.ts.map +1 -0
  8. package/dist/index.js +17 -0
  9. package/dist/index.js.map +1 -0
  10. package/dist/lib/agents/claude-code.d.ts +12 -0
  11. package/dist/lib/agents/claude-code.d.ts.map +1 -0
  12. package/dist/lib/agents/claude-code.js +203 -0
  13. package/dist/lib/agents/claude-code.js.map +1 -0
  14. package/dist/lib/agents/codex.d.ts +12 -0
  15. package/dist/lib/agents/codex.d.ts.map +1 -0
  16. package/dist/lib/agents/codex.js +247 -0
  17. package/dist/lib/agents/codex.js.map +1 -0
  18. package/dist/lib/agents/index.d.ts +7 -0
  19. package/dist/lib/agents/index.d.ts.map +1 -0
  20. package/dist/lib/agents/index.js +14 -0
  21. package/dist/lib/agents/index.js.map +1 -0
  22. package/dist/lib/agents/registry.d.ts +23 -0
  23. package/dist/lib/agents/registry.d.ts.map +1 -0
  24. package/dist/lib/agents/registry.js +35 -0
  25. package/dist/lib/agents/registry.js.map +1 -0
  26. package/dist/lib/agents/shared.d.ts +47 -0
  27. package/dist/lib/agents/shared.d.ts.map +1 -0
  28. package/dist/lib/agents/shared.js +99 -0
  29. package/dist/lib/agents/shared.js.map +1 -0
  30. package/dist/lib/agents/types.d.ts +69 -0
  31. package/dist/lib/agents/types.d.ts.map +1 -0
  32. package/dist/lib/agents/types.js +5 -0
  33. package/dist/lib/agents/types.js.map +1 -0
  34. package/dist/lib/config.d.ts +34 -0
  35. package/dist/lib/config.d.ts.map +1 -0
  36. package/dist/lib/config.js +117 -0
  37. package/dist/lib/config.js.map +1 -0
  38. package/dist/lib/fixture.d.ts +52 -0
  39. package/dist/lib/fixture.d.ts.map +1 -0
  40. package/dist/lib/fixture.js +175 -0
  41. package/dist/lib/fixture.js.map +1 -0
  42. package/dist/lib/init.d.ts +21 -0
  43. package/dist/lib/init.d.ts.map +1 -0
  44. package/dist/lib/init.js +250 -0
  45. package/dist/lib/init.js.map +1 -0
  46. package/dist/lib/results.d.ts +54 -0
  47. package/dist/lib/results.d.ts.map +1 -0
  48. package/dist/lib/results.js +186 -0
  49. package/dist/lib/results.js.map +1 -0
  50. package/dist/lib/runner.d.ts +43 -0
  51. package/dist/lib/runner.d.ts.map +1 -0
  52. package/dist/lib/runner.js +142 -0
  53. package/dist/lib/runner.js.map +1 -0
  54. package/dist/lib/sandbox.d.ts +117 -0
  55. package/dist/lib/sandbox.d.ts.map +1 -0
  56. package/dist/lib/sandbox.js +248 -0
  57. package/dist/lib/sandbox.js.map +1 -0
  58. package/dist/lib/types.d.ts +166 -0
  59. package/dist/lib/types.d.ts.map +1 -0
  60. package/dist/lib/types.js +14 -0
  61. package/dist/lib/types.js.map +1 -0
  62. package/dist/test-setup.d.ts +2 -0
  63. package/dist/test-setup.d.ts.map +1 -0
  64. package/dist/test-setup.js +6 -0
  65. package/dist/test-setup.js.map +1 -0
  66. package/package.json +58 -0
@@ -0,0 +1,166 @@
1
+ /**
2
+ * Core types for the eval framework.
3
+ */
4
+ /**
5
+ * Supported AI agent types: Claude Code and Codex, each accepted either bare or with a 'vercel-ai-gateway/' prefix.
6
+ */
7
+ export type AgentType = 'vercel-ai-gateway/claude-code' | 'claude-code' | 'vercel-ai-gateway/codex' | 'codex';
8
+ /**
9
+ * Model identifier - an alias for plain string, so any value is accepted at compile time.
10
+ * Each agent validates its own models at runtime.
11
+ */
12
+ export type ModelTier = string;
13
+ /**
14
+ * Predicate type for filtering evals: takes a name string and returns a boolean (presumably true selects the eval - confirm against the caller).
15
+ */
16
+ export type EvalFilter = (name: string) => boolean;
17
+ /**
18
+ * Sandbox interface handed to setup functions.
19
+ * Provides methods to run commands, read and write files, and query the working directory of the isolated VM.
20
+ */
21
+ export interface Sandbox {
22
+ /** Run a command in the sandbox with optional arguments and environment variables; resolves with stdout, stderr and the exit code */
23
+ runCommand(command: string, args?: string[], options?: {
24
+ env?: Record<string, string>;
25
+ }): Promise<{
26
+ stdout: string;
27
+ stderr: string;
28
+ exitCode: number;
29
+ }>;
30
+ /** Read a file from the sandbox; resolves with a string */
31
+ readFile(path: string): Promise<string>;
32
+ /** Write files to the sandbox from a string-to-string record (presumably path -> content; confirm against the implementation) */
33
+ writeFiles(files: Record<string, string>): Promise<void>;
34
+ /** Get the sandbox working directory (returned synchronously, unlike the other methods) */
35
+ getWorkingDirectory(): string;
36
+ }
37
+ /**
38
+ * Async setup function that runs before the agent starts.
39
+ * Receives a Sandbox instance for pre-configuration and resolves with no value.
40
+ */
41
+ export type SetupFunction = (sandbox: Sandbox) => Promise<void>;
42
+ /**
43
+ * Experiment configuration.
44
+ * Defines what to test and how; only `agent` is required, every other field is optional.
45
+ */
46
+ export interface ExperimentConfig {
47
+ /** Which AI agent to use (required) */
48
+ agent: AgentType;
49
+ /** Which AI model the agent should use. The default is agent-specific: 'opus' for claude-code, 'openai/gpt-5.2-codex' for codex */
50
+ model?: ModelTier;
51
+ /** Which evals to run: a single string, an array of strings, or an EvalFilter function. @default '*' (all evals) */
52
+ evals?: string | string[] | EvalFilter;
53
+ /** How many times to run each eval. @default 1 */
54
+ runs?: number;
55
+ /** Whether to stop after the first successful run. @default true */
56
+ earlyExit?: boolean;
57
+ /** npm scripts that must pass after the agent finishes. @default [] */
58
+ scripts?: string[];
59
+ /** Maximum time, in seconds, for the agent to complete. @default 300 (5 minutes) */
60
+ timeout?: number;
61
+ /** Setup function that runs before the agent starts. @default undefined */
62
+ setup?: SetupFunction;
63
+ }
64
+ /**
65
+ * Resolved experiment config with all defaults applied: every field is required except `setup`, which stays optional.
66
+ */
67
+ export interface ResolvedExperimentConfig {
68
+ agent: AgentType;
69
+ model: ModelTier;
70
+ evals: string | string[] | EvalFilter;
71
+ runs: number;
72
+ earlyExit: boolean;
73
+ scripts: string[];
74
+ timeout: number;
75
+ setup?: SetupFunction;
76
+ }
77
+ /**
78
+ * Files required for a valid eval fixture, declared as a readonly tuple of three file names.
79
+ */
80
+ export declare const REQUIRED_EVAL_FILES: readonly ["PROMPT.md", "EVAL.ts", "package.json"];
81
+ /**
82
+ * Entries excluded when listing fixture files (used by getFixtureFiles in fixture.ts).
83
+ * This list is for local fixture introspection, NOT for sandbox uploads.
84
+ * For sandbox file filtering, see TEST_FILE_PATTERNS in sandbox.ts.
85
+ */
86
+ export declare const EXCLUDED_FILES: readonly ["PROMPT.md", "EVAL.ts", "node_modules", ".git"];
87
+ /**
88
+ * A discovered eval fixture: its name, location, prompt text and module flag.
89
+ */
90
+ export interface EvalFixture {
91
+ /** Name of the eval (its folder name) */
92
+ name: string;
93
+ /** Absolute path to the eval folder */
94
+ path: string;
95
+ /** Contents of the fixture's PROMPT.md */
96
+ prompt: string;
97
+ /** Whether the fixture's package.json has "type": "module" */
98
+ isModule: boolean;
99
+ }
100
+ /**
101
+ * Result of a single eval run: status, timing, and optional paths to saved artifacts.
102
+ */
103
+ export interface EvalRunResult {
104
+ /** Pass or fail status */
105
+ status: 'passed' | 'failed';
106
+ /** Error message if the run failed (optional) */
107
+ error?: string;
108
+ /** Duration of the run in seconds */
109
+ duration: number;
110
+ /** Path to the transcript file (relative to the run directory) */
111
+ transcriptPath?: string;
112
+ /** Paths to output files (relative to the run directory) */
113
+ outputPaths?: {
114
+ /** Path to the EVAL.ts test output */
115
+ eval?: string;
116
+ /** Paths to npm script outputs, nested under their own key to avoid collision (presumably keyed by script name - confirm) */
117
+ scripts?: Record<string, string>;
118
+ };
119
+ }
120
+ /**
121
+ * Internal run data including transcript and outputs (content, not paths).
122
+ */
123
+ export interface EvalRunData {
124
+ /** The eval result (will have paths added when saving) */
125
+ result: EvalRunResult;
126
+ /** Structured transcript from Claude Code (saved to transcript.jsonl). NOTE(review): AgentType also lists codex - confirm whether this applies to every agent */
127
+ transcript?: string;
128
+ /** Script/test output content (saved to outputs/) */
129
+ outputContent?: {
130
+ /** Content of the EVAL.ts test output */
131
+ eval?: string;
132
+ /** Content of npm script outputs, nested under their own key to avoid collision (presumably keyed by script name - confirm) */
133
+ scripts?: Record<string, string>;
134
+ };
135
+ }
136
+ /**
137
+ * Summary of multiple runs for a single eval.
138
+ */
139
+ export interface EvalSummary {
140
+ /** Name of the eval */
141
+ name: string;
142
+ /** Total number of runs */
143
+ totalRuns: number;
144
+ /** Number of runs that passed */
145
+ passedRuns: number;
146
+ /** Pass rate as a percentage */
147
+ passRate: number;
148
+ /** Mean duration across all runs (presumably seconds, matching EvalRunResult.duration - confirm) */
149
+ meanDuration: number;
150
+ /** Individual run data (internal, not all fields saved to summary.json) */
151
+ runs: EvalRunData[];
152
+ }
153
+ /**
154
+ * Complete experiment results: start/end timestamps, the resolved config, and one summary per eval.
155
+ */
156
+ export interface ExperimentResults {
157
+ /** Timestamp when the experiment started (a string; the format is not specified by this type) */
158
+ startedAt: string;
159
+ /** Timestamp when the experiment completed (a string; the format is not specified by this type) */
160
+ completedAt: string;
161
+ /** Experiment configuration used, with defaults applied */
162
+ config: ResolvedExperimentConfig;
163
+ /** Results for each eval */
164
+ evals: EvalSummary[];
165
+ }
166
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,MAAM,SAAS,GACjB,+BAA+B,GAC/B,aAAa,GACb,yBAAyB,GACzB,OAAO,CAAC;AAEZ;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;AAEnD;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,mCAAmC;IACnC,UAAU,CACR,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,MAAM,EAAE,EACf,OAAO,CAAC,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;KAAE,GACzC,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,mCAAmC;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,wCAAwC;IACxC,mBAAmB,IAAI,MAAM,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;AAEhE;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB,+HAA+H;IAC/H,KAAK,CAAC,EAAE,SAAS,CAAC;IAElB,+FAA+F;IAC/F,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IAEvC,kDAAkD;IAClD,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd,qDAAqD;IACrD,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,mEAAmE;IACnE,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IAEnB,8EAA8E;IAC9E,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,uEAAuE;IACvE,KAAK,CAAC,EAAE,aAAa,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;CACvB;AAED;;GAEG;AACH,eAAO,MAAM,mBAAmB,mDAAoD,CAAC;AAErF;;;;GAIG;AACH,eAAO,MAAM,cAAc,2DAA4D,CAAC;AAExF;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,qCAAqC;IACrC,IAAI,EAAE,MAAM,CAAC;IACb,uCAAuC;IACvC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gDAAgD;IAChD,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,0BAA0B;IAC1B,MAAM,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC5B,8BAA8B;IAC
9B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,0DAA0D;IAC1D,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,wDAAwD;IACxD,WAAW,CAAC,EAAE;QACZ,kCAAkC;QAClC,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,8DAA8D;QAC9D,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,0DAA0D;IAC1D,MAAM,EAAE,aAAa,CAAC;IACtB,yEAAyE;IACzE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qDAAqD;IACrD,aAAa,CAAC,EAAE;QACd,0BAA0B;QAC1B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,qDAAqD;QACrD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,4BAA4B;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,YAAY,EAAE,MAAM,CAAC;IACrB,2EAA2E;IAC3E,IAAI,EAAE,WAAW,EAAE,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB,0CAA0C;IAC1C,WAAW,EAAE,MAAM,CAAC;IACpB,oCAAoC;IACpC,MAAM,EAAE,wBAAwB,CAAC;IACjC,4BAA4B;IAC5B,KAAK,EAAE,WAAW,EAAE,CAAC;CACtB"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Core types for the eval framework.
3
+ */
4
+ /**
5
+ * Files required for a valid eval fixture. Emitted as a plain array literal, so nothing freezes it at runtime.
6
+ */
7
+ export const REQUIRED_EVAL_FILES = ['PROMPT.md', 'EVAL.ts', 'package.json'];
8
+ /**
9
+ * Entries excluded when listing fixture files (used by getFixtureFiles in fixture.ts).
10
+ * This list is for local fixture introspection, NOT for sandbox uploads.
11
+ * For sandbox file filtering, see TEST_FILE_PATTERNS in sandbox.ts.
12
+ */
13
+ export const EXCLUDED_FILES = ['PROMPT.md', 'EVAL.ts', 'node_modules', '.git'];
14
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AA2FH;;GAEG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,cAAc,CAAU,CAAC;AAErF;;;;GAIG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,CAAU,CAAC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=test-setup.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"test-setup.d.ts","sourceRoot":"","sources":["../src/test-setup.ts"],"names":[],"mappings":""}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Test setup file - loads environment variables for tests by calling dotenv's config() as an import-time side effect.
3
+ */
4
+ import { config as dotenvConfig } from 'dotenv';
5
+ dotenvConfig();
6
+ //# sourceMappingURL=test-setup.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"test-setup.js","sourceRoot":"","sources":["../src/test-setup.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,EAAE,MAAM,IAAI,YAAY,EAAE,MAAM,QAAQ,CAAC;AAEhD,YAAY,EAAE,CAAC"}
package/package.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "name": "@vercel/agent-eval",
3
+ "version": "0.0.1",
4
+ "description": "Framework for testing AI coding agents in isolated sandboxes",
5
+ "repository": {
6
+ "type": "git",
7
+ "url": "https://github.com/vercel-labs/agent-eval.git"
8
+ },
9
+ "type": "module",
10
+ "main": "dist/index.js",
11
+ "types": "dist/index.d.ts",
12
+ "bin": {
13
+ "agent-eval": "dist/cli.js"
14
+ },
15
+ "files": [
16
+ "dist"
17
+ ],
18
+ "scripts": {
19
+ "build": "tsc",
20
+ "test": "vitest run",
21
+ "test:watch": "vitest",
22
+ "lint": "eslint src/",
23
+ "prepublishOnly": "npm run build"
24
+ },
25
+ "dependencies": {
26
+ "@ai-sdk/anthropic": "^1.2.12",
27
+ "@vercel/sandbox": "^1.2.0",
28
+ "ai": "^5.0.11",
29
+ "chalk": "^5.3.0",
30
+ "commander": "^12.1.0",
31
+ "dotenv": "^16.4.5",
32
+ "glob": "^11.0.0",
33
+ "zod": "^3.23.8"
34
+ },
35
+ "devDependencies": {
36
+ "@types/node": "^22.0.0",
37
+ "eslint": "^9.0.0",
38
+ "tsx": "^4.21.0",
39
+ "typescript": "^5.6.0",
40
+ "typescript-eslint": "^8.54.0",
41
+ "vitest": "^2.1.0"
42
+ },
43
+ "engines": {
44
+ "node": ">=18.0.0"
45
+ },
46
+ "keywords": [
47
+ "ai",
48
+ "eval",
49
+ "testing",
50
+ "claude",
51
+ "agent",
52
+ "sandbox"
53
+ ],
54
+ "license": "MIT",
55
+ "publishConfig": {
56
+ "access": "public"
57
+ }
58
+ }