@vercel/agent-eval 0.0.8 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -24
- package/dist/cli.js +40 -13
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/lib/agents/codex.d.ts.map +1 -1
- package/dist/lib/agents/codex.js +13 -25
- package/dist/lib/agents/codex.js.map +1 -1
- package/dist/lib/agents/index.d.ts.map +1 -1
- package/dist/lib/agents/index.js +0 -2
- package/dist/lib/agents/index.js.map +1 -1
- package/dist/lib/config.js +1 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/o11y/index.d.ts +11 -0
- package/dist/lib/o11y/index.d.ts.map +1 -0
- package/dist/lib/o11y/index.js +11 -0
- package/dist/lib/o11y/index.js.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts +18 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.js +343 -0
- package/dist/lib/o11y/parsers/claude-code.js.map +1 -0
- package/dist/lib/o11y/parsers/codex.d.ts +17 -0
- package/dist/lib/o11y/parsers/codex.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/codex.js +296 -0
- package/dist/lib/o11y/parsers/codex.js.map +1 -0
- package/dist/lib/o11y/parsers/index.d.ts +51 -0
- package/dist/lib/o11y/parsers/index.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/index.js +280 -0
- package/dist/lib/o11y/parsers/index.js.map +1 -0
- package/dist/lib/o11y/parsers/opencode.d.ts +17 -0
- package/dist/lib/o11y/parsers/opencode.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/opencode.js +313 -0
- package/dist/lib/o11y/parsers/opencode.js.map +1 -0
- package/dist/lib/o11y/types.d.ts +113 -0
- package/dist/lib/o11y/types.d.ts.map +1 -0
- package/dist/lib/o11y/types.js +6 -0
- package/dist/lib/o11y/types.js.map +1 -0
- package/dist/lib/results.d.ts +2 -1
- package/dist/lib/results.d.ts.map +1 -1
- package/dist/lib/results.js +23 -7
- package/dist/lib/results.js.map +1 -1
- package/dist/lib/runner.d.ts +5 -5
- package/dist/lib/runner.d.ts.map +1 -1
- package/dist/lib/runner.js +19 -10
- package/dist/lib/runner.js.map +1 -1
- package/dist/lib/types.d.ts +21 -3
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/package.json +9 -3
- package/dist/lib/agents/ai-sdk-agent.d.ts +0 -10
- package/dist/lib/agents/ai-sdk-agent.d.ts.map +0 -1
- package/dist/lib/agents/ai-sdk-agent.js +0 -427
- package/dist/lib/agents/ai-sdk-agent.js.map +0 -1
package/dist/lib/runner.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/lib/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAUH,OAAO,EAAE,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAC7C,OAAO,EACL,wBAAwB,EACxB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,EACX,kBAAkB,EAClB,eAAe,EACf,qBAAqB,GACtB,MAAM,cAAc,CAAC;AAwCtB;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAA6B;IAE7B,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;IAC9F,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC;IAE7B,8BAA8B;IAC9B,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAErC,MAAM,GAAG,GAAG,CAAC,GAAW,EAAE,EAAE;QAC1B,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC,GAAG,CAAC,CAAC;QAClB,CAAC;aAAM,IAAI,OAAO,EAAE,CAAC;YACnB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC,CAAC;IAEF,mDAAmD;IACnD,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,eAAe,EAAE,CAAC,CAAC;IAC5D,CAAC;IAED,oCAAoC;IACpC,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,QAAQ,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAC,CAAC;QAC1C,CAAC;IACH,CAAC;IAED,GAAG,CAAC,YAAY,QAAQ,CAAC,MAAM,gCAAgC,QAAQ,CAAC,MAAM,YAAY,MAAM,CAAC,IAAI,QAAQ,CAAC,CAAC;IAE/G,uBAAuB;IACvB,MAAM,UAAU,GAAG,KAAK,EAAE,OAAoB,EAA0B,EAAE;QACxE,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;QACtC,MAAM,UAAU,GAAG,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;QAEvD,2CAA2C;QAC3C,IAAI,UAAU,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YAC9B,OAAO;gBACL,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,QAAQ;gBACR,OAAO,EAAE;oBACP,MAAM,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC,EAAE;iBAC5D;gBACD,OAAO,EAAE,IAAI;aACd,CAAC;QACJ,CAAC;QAED,GAAG,CAAC,qBAAqB,CAAC,OAAO,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;QAEpE,MAAM,SAAS,GAAG,MAAM,CAAC,OAAO,GAAG,IAAI,CAAC;QACxC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,oDAAoD;QACpD,MAAM,iBAAiB,GAAG,IAAI,eAAe,EAAE,CAAC;QAEhD,yDAAyD;QACzD,IAAI,MAAM,
CAAC,SAAS,EAAE,CAAC;YACrB,UAAU,CAAC,MAAM,CAAC,gBAAgB,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,iBAAiB,CAAC,KAAK,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/F,CAAC;QAED,IAAI,SAAoD,CAAC;QAEzD,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC;YACrC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE;gBACtB,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,OAAO,EAAE,SAAS;gBAClB,MAAM;gBACN,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,MAAM,EAAE,iBAAiB,CAAC,MAAM;gBAChC,OAAO,EAAE,MAAM,CAAC,OAAO;aACxB,CAAC;YACF,IAAI,OAAO,CAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE;gBAC/B,SAAS,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC1B,iBAAiB,CAAC,KAAK,EAAE,CAAC,CAAC,mCAAmC;oBAC9D,MAAM,CAAC,IAAI,KAAK,CAAC,wBAAwB,MAAM,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC;gBAC/D,CAAC,EAAE,SAAS,CAAC,CAAC;YAChB,CAAC,CAAC;SACH,CAAC,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;YACjB,8CAA8C;YAC9C,IAAI,KAAK,YAAY,KAAK,IAAI,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;gBAClE,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,MAAM,EAAE,EAAE;oBACV,KAAK,EAAE,KAAK,CAAC,OAAO;oBACpB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;iBACjC,CAAC;YACJ,CAAC;YACD,MAAM,KAAK,CAAC;QACd,CAAC,CAAC,CAAC;QAEH,kDAAkD;QAClD,IAAI,SAAS;YAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QAEvC,4BAA4B;QAC5B,IAAI,WAAW,CAAC,KAAK,KAAK,SAAS,IAAI,WAAW,CAAC,KAAK,KAAK,sBAAsB,EAAE,CAAC;YACpF,OAAO;gBACL,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,QAAQ;gBACR,OAAO,EAAE;oBACP,MAAM,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,QAAQ,EAAE,WAAW,CAAC,QAAQ,GAAG,IAAI,EAAE;iBACtF;gBACD,OAAO,EAAE,IAAI;aACd,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,wBAAwB,CAAC,WAAW,CAAC,CAAC;QAEtD,GAAG,CAAC,eAAe,CAAC,OAAO,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC,EAAE,MAAM,CAAC,IAAI,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC;QAE9E,4EAA4E;QAC5E,IAAI,MAAM,CAAC,SAAS,IAAI,OAAO,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YAC3D,GAAG,CAAC,eAAe,OAAO,CAAC,IAAI,kBAAkB,QAAQ,GAAG,CAAC,+BAA+B,CAAC,CAAC;YAC9F,UAAU,CAAC,KAAK,EAAE,CAAC;QACrB,CAAC;QAED,OAAO;YACL,WAAW,EAAE,OAAO,CAAC,IAAI;YACzB,QAAQ;YACR,OAAO;SACR,CAAC;IACJ,CAAC,CAAC;IAEF,gCAAgC;IAChC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU
,CAAC,CAAC,CAAC;IAE5D,sDAAsD;IACtD,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA2B,CAAC;IAC5D,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;YACpB,gBAAgB,CAAC,GAAG,CAAC,MAAM,CAAC,WAAW,CAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,6CAA6C;IAC7C,MAAM,aAAa,GAAkB,EAAE,CAAC;IACxC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,cAAc,GAAG,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAE,CAAC;QAE3D,wCAAwC;QACxC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC;QAEvD,MAAM,WAAW,GAAkB,EAAE,CAAC;QACtC,KAAK,MAAM,MAAM,IAAI,cAAc,EAAE,CAAC;YACpC,WAAW,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YAEjC,iDAAiD;YACjD,IAAI,MAAM,CAAC,SAAS,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;gBAClE,MAAM;YACR,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,iBAAiB,CAAC,OAAO,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;QAC7D,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC9B,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,IAAI,EAAE,CAAC;IAC/B,MAAM,iBAAiB,GAAG,uBAAuB,CAAC,MAAM,EAAE,aAAa,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;IAEjG,uBAAuB;IACvB,MAAM,SAAS,GAAG,WAAW,CAAC,iBAAiB,EAAE;QAC/C,UAAU;QACV,cAAc;KACf,CAAC,CAAC;IAEH,GAAG,CAAC,uBAAuB,SAAS,EAAE,CAAC,CAAC;IACxC,GAAG,CAAC,kBAAkB,CAAC,iBAAiB,CAAC,CAAC,CAAC;IAE3C,OAAO,iBAAiB,CAAC;AAC3B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAoB,EACpB,OASC;IAED,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,IAAI,+BAA+B,CAAC,CAAC;IAEzE,MAAM,MAAM,GAAa,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;IAExF,MAAM,OAAO,GAAkB,EAAE,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAE9B,MAAM,WAAW,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE;YACjD,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK;YACL,OAAO,EAAE,OAAO,CAAC,OAAO,GAAG,IAAI;YAC/B,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,OAAO,EAAE,OAAO,CAAC,OAAO;SACxB,CAAC,CAAC;QAEA,OAAO,CAAC,IAAI,
CAAC,wBAAwB,CAAC,WAAW,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,gHAAgH;IAChH,IAAG,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,OAAO,OAAO,CAAC,CAAC,CAA2D,CAAC;IAC3E,CAAC;IAED,OAAO,OAAiE,CAAC;AAC3E,CAAC"}
|
package/dist/lib/types.d.ts
CHANGED
|
@@ -50,8 +50,10 @@ export type SandboxBackend = 'vercel' | 'docker';
|
|
|
50
50
|
export interface ExperimentConfig {
|
|
51
51
|
/** Which AI agent to use */
|
|
52
52
|
agent: AgentType;
|
|
53
|
-
/** Which AI model the agent should use.
|
|
54
|
-
|
|
53
|
+
/** Which AI model the agent should use. Can be a single model or array of models to test.
|
|
54
|
+
* If an array is provided, the experiment will run on each model.
|
|
55
|
+
* Default is agent-specific: 'opus' for claude-code, 'openai/gpt-5.2-codex' for codex */
|
|
56
|
+
model?: ModelTier | ModelTier[];
|
|
55
57
|
/** Which evals to run. Can be a string, array, or filter function. @default '*' (all evals) */
|
|
56
58
|
evals?: string | string[] | EvalFilter;
|
|
57
59
|
/** How many times to run each eval. @default 1 */
|
|
@@ -71,6 +73,20 @@ export interface ExperimentConfig {
|
|
|
71
73
|
* Resolved experiment config with all defaults applied.
|
|
72
74
|
*/
|
|
73
75
|
export interface ResolvedExperimentConfig {
|
|
76
|
+
agent: AgentType;
|
|
77
|
+
model: ModelTier | ModelTier[];
|
|
78
|
+
evals: string | string[] | EvalFilter;
|
|
79
|
+
runs: number;
|
|
80
|
+
earlyExit: boolean;
|
|
81
|
+
scripts: string[];
|
|
82
|
+
timeout: number;
|
|
83
|
+
setup?: SetupFunction;
|
|
84
|
+
sandbox: SandboxBackend | 'auto';
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Resolved experiment config with all defaults applied.
|
|
88
|
+
*/
|
|
89
|
+
export interface RunnableExperimentConfig {
|
|
74
90
|
agent: AgentType;
|
|
75
91
|
model: ModelTier;
|
|
76
92
|
evals: string | string[] | EvalFilter;
|
|
@@ -115,8 +131,10 @@ export interface EvalRunResult {
|
|
|
115
131
|
error?: string;
|
|
116
132
|
/** Duration in seconds */
|
|
117
133
|
duration: number;
|
|
118
|
-
/** Path to transcript file (relative to run directory) */
|
|
134
|
+
/** Path to parsed transcript file (relative to run directory) */
|
|
119
135
|
transcriptPath?: string;
|
|
136
|
+
/** Path to raw transcript file (relative to run directory) */
|
|
137
|
+
transcriptRawPath?: string;
|
|
120
138
|
/** Paths to output files (relative to run directory) */
|
|
121
139
|
outputPaths?: {
|
|
122
140
|
/** Path to EVAL.ts test output */
|
package/dist/lib/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,MAAM,SAAS,GACjB,+BAA+B,GAC/B,aAAa,GACb,yBAAyB,GACzB,OAAO,GACP,4BAA4B,CAAC;AAEjC;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;AAEnD;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,mCAAmC;IACnC,UAAU,CACR,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,MAAM,EAAE,EACf,OAAO,CAAC,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;KAAE,GACzC,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,mCAAmC;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,wCAAwC;IACxC,mBAAmB,IAAI,MAAM,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;AAEhE;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,QAAQ,GAAG,QAAQ,CAAC;AAEjD;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,MAAM,SAAS,GACjB,+BAA+B,GAC/B,aAAa,GACb,yBAAyB,GACzB,OAAO,GACP,4BAA4B,CAAC;AAEjC;;;GAGG;AACH,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC;AAEnD;;;GAGG;AACH,MAAM,WAAW,OAAO;IACtB,mCAAmC;IACnC,UAAU,CACR,OAAO,EAAE,MAAM,EACf,IAAI,CAAC,EAAE,MAAM,EAAE,EACf,OAAO,CAAC,EAAE;QAAE,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;KAAE,GACzC,OAAO,CAAC;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACjE,mCAAmC;IACnC,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,UAAU,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACzD,wCAAwC;IACxC,mBAAmB,IAAI,MAAM,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC;AAEhE;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,QAAQ,GAAG,QAAQ,CAAC;AAEjD;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB;;6FAEyF;IACzF,KAAK,CAAC,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAEhC,+FAA+F;IAC/F,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IAEvC,kDAAkD;IAClD,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd,qDAAqD;IACrD,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB,mEAAmE;IACnE,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IAEnB,8EAA8E;IAC9E,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,uEAAuE;IACvE,KAAK,CAAC,EAAE,aAAa,CAAC;IAEtB,qFAAqF;IACrF,OAAO,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;CACnC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,SAAS,GAAG,SAAS,EAAE,CAAC;IAC/B,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IACnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,OAAO,EAAE,cAAc,GAAG,MAAM,CAAC;CAClC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,SAAS,CAAC;IACjB,KAAK,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,CAAC;IACtC,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,OAAO,CAAC;IA
CnB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,OAAO,EAAE,cAAc,GAAG,MAAM,CAAC;CAClC;AAED;;;GAGG;AACH,eAAO,MAAM,mBAAmB,mDAAoD,CAAC;AAErF;;;;GAIG;AACH,eAAO,MAAM,cAAc,uEAAwE,CAAC;AAEpG;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,qCAAqC;IACrC,IAAI,EAAE,MAAM,CAAC;IACb,uCAAuC;IACvC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,gDAAgD;IAChD,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,0BAA0B;IAC1B,MAAM,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC5B,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,iEAAiE;IACjE,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,8DAA8D;IAC9D,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,wDAAwD;IACxD,WAAW,CAAC,EAAE;QACZ,kCAAkC;QAClC,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,8DAA8D;QAC9D,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,0DAA0D;IAC1D,MAAM,EAAE,aAAa,CAAC;IACtB,yEAAyE;IACzE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,qDAAqD;IACrD,aAAa,CAAC,EAAE;QACd,0BAA0B;QAC1B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,qDAAqD;QACrD,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,2BAA2B;IAC3B,SAAS,EAAE,MAAM,CAAC;IAClB,4BAA4B;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,YAAY,EAAE,MAAM,CAAC;IACrB,2EAA2E;IAC3E,IAAI,EAAE,WAAW,EAAE,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB,0CAA0C;IAC1C,WAAW,EAAE,MAAM,CAAC;IACpB,oCAAoC;IACpC,MAAM,EAAE,wBAAwB,CAAC;IACjC,4BAA4B;IAC5B,KAAK,EAAE,WAAW,EAAE,CAAC;CACtB"}
|
package/dist/lib/types.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/lib/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAsHH;;;GAGG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,cAAc,CAAU,CAAC;AAErF;;;;GAIG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,WAAW,EAAE,SAAS,EAAE,UAAU,EAAE,cAAc,EAAE,MAAM,CAAU,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vercel/agent-eval",
|
|
3
|
-
"version": "0.0.8",
|
|
3
|
+
"version": "0.0.11",
|
|
4
4
|
"description": "Framework for testing AI coding agents in isolated sandboxes",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
7
|
-
"url": "https://github.com/vercel-labs/agent-eval.git"
|
|
7
|
+
"url": "git+https://github.com/vercel-labs/agent-eval.git"
|
|
8
8
|
},
|
|
9
9
|
"type": "module",
|
|
10
10
|
"main": "dist/index.js",
|
|
@@ -24,6 +24,9 @@
|
|
|
24
24
|
"test:integration:docker": "SANDBOX_BACKEND=docker INTEGRATION_TEST=1 vitest run src/integration.test.ts",
|
|
25
25
|
"test:integration:vercel": "SANDBOX_BACKEND=vercel INTEGRATION_TEST=1 vitest run src/integration.test.ts",
|
|
26
26
|
"lint": "eslint src/",
|
|
27
|
+
"changeset": "changeset",
|
|
28
|
+
"version": "changeset version",
|
|
29
|
+
"release": "npm run build && changeset publish",
|
|
27
30
|
"prepublishOnly": "npm run build"
|
|
28
31
|
},
|
|
29
32
|
"dependencies": {
|
|
@@ -39,6 +42,8 @@
|
|
|
39
42
|
"zod": "^3.23.8"
|
|
40
43
|
},
|
|
41
44
|
"devDependencies": {
|
|
45
|
+
"@changesets/changelog-github": "^0.5.2",
|
|
46
|
+
"@changesets/cli": "^2.29.8",
|
|
42
47
|
"@types/dockerode": "^4.0.1",
|
|
43
48
|
"@types/node": "^22.0.0",
|
|
44
49
|
"@types/tar-stream": "^3.1.4",
|
|
@@ -61,6 +66,7 @@
|
|
|
61
66
|
],
|
|
62
67
|
"license": "MIT",
|
|
63
68
|
"publishConfig": {
|
|
64
|
-
"access": "public"
|
|
69
|
+
"access": "public",
|
|
70
|
+
"provenance": true
|
|
65
71
|
}
|
|
66
72
|
}
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* AI SDK Agent - A simple coding agent using the Vercel AI SDK.
|
|
3
|
-
* Works with any model available on Vercel AI Gateway.
|
|
4
|
-
*/
|
|
5
|
-
import type { Agent } from './types.js';
|
|
6
|
-
/**
|
|
7
|
-
* Create AI SDK agent with Vercel AI Gateway authentication.
|
|
8
|
-
*/
|
|
9
|
-
export declare function createAiSdkAgent(): Agent;
|
|
10
|
-
//# sourceMappingURL=ai-sdk-agent.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"ai-sdk-agent.d.ts","sourceRoot":"","sources":["../../../src/lib/agents/ai-sdk-agent.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,KAAK,EAAmC,MAAM,YAAY,CAAC;AA8QzE;;GAEG;AACH,wBAAgB,gBAAgB,IAAI,KAAK,CAiMxC"}
|
|
@@ -1,427 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* AI SDK Agent - A simple coding agent using the Vercel AI SDK.
|
|
3
|
-
* Works with any model available on Vercel AI Gateway.
|
|
4
|
-
*/
|
|
5
|
-
import { createSandbox, collectLocalFiles, splitTestFiles, verifyNoTestFiles, } from '../sandbox.js';
|
|
6
|
-
import { runValidation, captureGeneratedFiles, createVitestConfig, AI_GATEWAY, } from './shared.js';
|
|
7
|
-
/**
|
|
8
|
-
* The CLI script source code that runs inside the sandbox.
|
|
9
|
-
* This is a self-contained script that uses the AI SDK.
|
|
10
|
-
*/
|
|
11
|
-
const CLI_SCRIPT = `
|
|
12
|
-
import { generateText, tool, stepCountIs } from 'ai';
|
|
13
|
-
import { createGateway } from '@ai-sdk/gateway';
|
|
14
|
-
import { z } from 'zod';
|
|
15
|
-
import { readFileSync, writeFileSync, readdirSync, existsSync, statSync } from 'fs';
|
|
16
|
-
import { execSync } from 'child_process';
|
|
17
|
-
import { join, dirname } from 'path';
|
|
18
|
-
import { mkdirSync } from 'fs';
|
|
19
|
-
|
|
20
|
-
// Parse command line arguments
|
|
21
|
-
const args = process.argv.slice(2);
|
|
22
|
-
let prompt = '';
|
|
23
|
-
let model = '';
|
|
24
|
-
|
|
25
|
-
for (let i = 0; i < args.length; i++) {
|
|
26
|
-
if (args[i] === '--prompt' && args[i + 1]) {
|
|
27
|
-
prompt = args[++i];
|
|
28
|
-
} else if (args[i] === '--model' && args[i + 1]) {
|
|
29
|
-
model = args[++i];
|
|
30
|
-
}
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
if (!prompt || !model) {
|
|
34
|
-
console.error('Usage: ai-sdk-agent --prompt "..." --model "provider/model"');
|
|
35
|
-
process.exit(1);
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
// Create AI Gateway client
|
|
39
|
-
const gateway = createGateway({
|
|
40
|
-
apiKey: process.env.AI_GATEWAY_API_KEY,
|
|
41
|
-
});
|
|
42
|
-
|
|
43
|
-
// Transcript events
|
|
44
|
-
const events = [];
|
|
45
|
-
|
|
46
|
-
function logEvent(type, data) {
|
|
47
|
-
const event = { type, timestamp: Date.now(), ...data };
|
|
48
|
-
events.push(event);
|
|
49
|
-
console.log(JSON.stringify(event));
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
// Define coding tools
|
|
53
|
-
const tools = {
|
|
54
|
-
readFile: tool({
|
|
55
|
-
description: 'Read the contents of a file at the given path',
|
|
56
|
-
inputSchema: z.object({
|
|
57
|
-
path: z.string().describe('The file path to read'),
|
|
58
|
-
}),
|
|
59
|
-
execute: async ({ path }) => {
|
|
60
|
-
try {
|
|
61
|
-
const content = readFileSync(path, 'utf-8');
|
|
62
|
-
logEvent('tool_result', { tool: 'readFile', path, success: true });
|
|
63
|
-
return content;
|
|
64
|
-
} catch (error) {
|
|
65
|
-
logEvent('tool_result', { tool: 'readFile', path, success: false, error: error.message });
|
|
66
|
-
return \`Error reading file: \${error.message}\`;
|
|
67
|
-
}
|
|
68
|
-
},
|
|
69
|
-
}),
|
|
70
|
-
|
|
71
|
-
writeFile: tool({
|
|
72
|
-
description: 'Write content to a file at the given path. Creates directories if needed.',
|
|
73
|
-
inputSchema: z.object({
|
|
74
|
-
path: z.string().describe('The file path to write'),
|
|
75
|
-
content: z.string().describe('The content to write'),
|
|
76
|
-
}),
|
|
77
|
-
execute: async ({ path, content }) => {
|
|
78
|
-
try {
|
|
79
|
-
const dir = dirname(path);
|
|
80
|
-
if (dir && dir !== '.') {
|
|
81
|
-
mkdirSync(dir, { recursive: true });
|
|
82
|
-
}
|
|
83
|
-
writeFileSync(path, content);
|
|
84
|
-
logEvent('tool_result', { tool: 'writeFile', path, success: true });
|
|
85
|
-
return 'File written successfully';
|
|
86
|
-
} catch (error) {
|
|
87
|
-
logEvent('tool_result', { tool: 'writeFile', path, success: false, error: error.message });
|
|
88
|
-
return \`Error writing file: \${error.message}\`;
|
|
89
|
-
}
|
|
90
|
-
},
|
|
91
|
-
}),
|
|
92
|
-
|
|
93
|
-
editFile: tool({
|
|
94
|
-
description: 'Edit a file by replacing a specific string with new content',
|
|
95
|
-
inputSchema: z.object({
|
|
96
|
-
path: z.string().describe('The file path to edit'),
|
|
97
|
-
oldString: z.string().describe('The exact string to find and replace'),
|
|
98
|
-
newString: z.string().describe('The replacement string'),
|
|
99
|
-
}),
|
|
100
|
-
execute: async ({ path, oldString, newString }) => {
|
|
101
|
-
try {
|
|
102
|
-
const content = readFileSync(path, 'utf-8');
|
|
103
|
-
if (!content.includes(oldString)) {
|
|
104
|
-
logEvent('tool_result', { tool: 'editFile', path, success: false, error: 'String not found' });
|
|
105
|
-
return 'Error: The specified string was not found in the file';
|
|
106
|
-
}
|
|
107
|
-
const newContent = content.replace(oldString, newString);
|
|
108
|
-
writeFileSync(path, newContent);
|
|
109
|
-
logEvent('tool_result', { tool: 'editFile', path, success: true });
|
|
110
|
-
return 'File edited successfully';
|
|
111
|
-
} catch (error) {
|
|
112
|
-
logEvent('tool_result', { tool: 'editFile', path, success: false, error: error.message });
|
|
113
|
-
return \`Error editing file: \${error.message}\`;
|
|
114
|
-
}
|
|
115
|
-
},
|
|
116
|
-
}),
|
|
117
|
-
|
|
118
|
-
listFiles: tool({
|
|
119
|
-
description: 'List files in a directory. Call with path="." to list current directory.',
|
|
120
|
-
inputSchema: z.object({
|
|
121
|
-
path: z.string().describe('The directory path to list (use "." for current directory)'),
|
|
122
|
-
recursive: z.boolean().describe('Whether to list recursively').optional(),
|
|
123
|
-
}),
|
|
124
|
-
execute: async ({ path, recursive }) => {
|
|
125
|
-
const targetPath = path || '.';
|
|
126
|
-
const isRecursive = recursive || false;
|
|
127
|
-
try {
|
|
128
|
-
if (isRecursive) {
|
|
129
|
-
const result = execSync(\`find \${targetPath} -type f | head -100\`, { encoding: 'utf-8' });
|
|
130
|
-
logEvent('tool_result', { tool: 'listFiles', path: targetPath, recursive: isRecursive, success: true });
|
|
131
|
-
return result;
|
|
132
|
-
}
|
|
133
|
-
const files = readdirSync(targetPath);
|
|
134
|
-
logEvent('tool_result', { tool: 'listFiles', path: targetPath, recursive: isRecursive, success: true });
|
|
135
|
-
return files.join('\\n');
|
|
136
|
-
} catch (error) {
|
|
137
|
-
logEvent('tool_result', { tool: 'listFiles', path: targetPath, success: false, error: error.message });
|
|
138
|
-
return \`Error listing files: \${error.message}\`;
|
|
139
|
-
}
|
|
140
|
-
},
|
|
141
|
-
}),
|
|
142
|
-
|
|
143
|
-
glob: tool({
|
|
144
|
-
description: 'Find files matching a pattern (e.g., "*.ts" for TypeScript files)',
|
|
145
|
-
inputSchema: z.object({
|
|
146
|
-
pattern: z.string().describe('The file pattern (e.g., "*.ts", "*.js")'),
|
|
147
|
-
}),
|
|
148
|
-
execute: async ({ pattern }) => {
|
|
149
|
-
try {
|
|
150
|
-
// Extract just the file pattern, remove any path prefix
|
|
151
|
-
const filePattern = pattern.replace(/^\\*\\*\\//, '').replace(/^\\.\\//, '');
|
|
152
|
-
const result = execSync(\`find . -name "\${filePattern}" -type f 2>/dev/null | grep -v node_modules | head -50\`, { encoding: 'utf-8' });
|
|
153
|
-
logEvent('tool_result', { tool: 'glob', pattern, success: true });
|
|
154
|
-
return result.trim() || 'No files found';
|
|
155
|
-
} catch (error) {
|
|
156
|
-
logEvent('tool_result', { tool: 'glob', pattern, success: false, error: error.message });
|
|
157
|
-
return 'No files found';
|
|
158
|
-
}
|
|
159
|
-
},
|
|
160
|
-
}),
|
|
161
|
-
|
|
162
|
-
grep: tool({
|
|
163
|
-
description: 'Search for a text pattern in files',
|
|
164
|
-
inputSchema: z.object({
|
|
165
|
-
pattern: z.string().describe('The search pattern'),
|
|
166
|
-
path: z.string().describe('The file or directory to search in').optional(),
|
|
167
|
-
}),
|
|
168
|
-
execute: async ({ pattern, path }) => {
|
|
169
|
-
const targetPath = path || '.';
|
|
170
|
-
try {
|
|
171
|
-
const result = execSync(\`grep -rn "\${pattern}" \${targetPath} 2>/dev/null | grep -v node_modules | head -50\`, { encoding: 'utf-8' });
|
|
172
|
-
logEvent('tool_result', { tool: 'grep', pattern, path: targetPath, success: true });
|
|
173
|
-
return result.trim() || 'No matches found';
|
|
174
|
-
} catch (error) {
|
|
175
|
-
logEvent('tool_result', { tool: 'grep', pattern, path: targetPath, success: false });
|
|
176
|
-
return 'No matches found';
|
|
177
|
-
}
|
|
178
|
-
},
|
|
179
|
-
}),
|
|
180
|
-
|
|
181
|
-
bash: tool({
|
|
182
|
-
description: 'Run a bash command',
|
|
183
|
-
inputSchema: z.object({
|
|
184
|
-
command: z.string().describe('The command to run'),
|
|
185
|
-
}),
|
|
186
|
-
execute: async ({ command }) => {
|
|
187
|
-
try {
|
|
188
|
-
const result = execSync(command, { encoding: 'utf-8', timeout: 30000 });
|
|
189
|
-
logEvent('tool_result', { tool: 'bash', command, success: true });
|
|
190
|
-
return result;
|
|
191
|
-
} catch (error) {
|
|
192
|
-
logEvent('tool_result', { tool: 'bash', command, success: false, error: error.message });
|
|
193
|
-
return \`Error: \${error.message}\\n\${error.stdout || ''}\\n\${error.stderr || ''}\`;
|
|
194
|
-
}
|
|
195
|
-
},
|
|
196
|
-
}),
|
|
197
|
-
};
|
|
198
|
-
|
|
199
|
-
// System prompt for the coding agent
|
|
200
|
-
const systemPrompt = \`You are an expert coding agent. Your job is to complete programming tasks by reading, writing, and modifying files.
|
|
201
|
-
|
|
202
|
-
Available tools:
|
|
203
|
-
- readFile(path): Read a file's contents
|
|
204
|
-
- writeFile(path, content): Write/create a file (creates directories if needed)
|
|
205
|
-
- editFile(path, oldString, newString): Replace a specific string in a file
|
|
206
|
-
- listFiles(path): List files in a directory (use path="." for current directory)
|
|
207
|
-
- glob(pattern): Find files by pattern (e.g., "*.ts")
|
|
208
|
-
- grep(pattern, path): Search for text in files
|
|
209
|
-
- bash(command): Run shell commands
|
|
210
|
-
|
|
211
|
-
IMPORTANT WORKFLOW:
|
|
212
|
-
1. First, list files to understand the project structure: listFiles(path=".")
|
|
213
|
-
2. Read any relevant existing files to understand the context
|
|
214
|
-
3. Make the necessary code changes using writeFile or editFile
|
|
215
|
-
4. If needed, run build/test commands with bash to verify
|
|
216
|
-
|
|
217
|
-
RULES:
|
|
218
|
-
- Always check what files exist before modifying them
|
|
219
|
-
- Create complete, working code - not placeholders
|
|
220
|
-
- Put files in the correct directories (e.g., src/ for source files)
|
|
221
|
-
- Be thorough but efficient\`;
|
|
222
|
-
|
|
223
|
-
// Run the agent
|
|
224
|
-
async function main() {
|
|
225
|
-
logEvent('start', { model, prompt });
|
|
226
|
-
|
|
227
|
-
try {
|
|
228
|
-
const result = await generateText({
|
|
229
|
-
model: gateway(model),
|
|
230
|
-
tools,
|
|
231
|
-
stopWhen: stepCountIs(100), // Allow up to 100 steps
|
|
232
|
-
system: systemPrompt,
|
|
233
|
-
prompt,
|
|
234
|
-
onStepFinish: ({ stepType, text, toolCalls, toolResults }) => {
|
|
235
|
-
logEvent('step', { stepType, text, toolCalls: toolCalls?.length, toolResults: toolResults?.length });
|
|
236
|
-
},
|
|
237
|
-
});
|
|
238
|
-
|
|
239
|
-
logEvent('complete', {
|
|
240
|
-
success: true,
|
|
241
|
-
steps: result.steps.length,
|
|
242
|
-
text: result.text,
|
|
243
|
-
});
|
|
244
|
-
} catch (error) {
|
|
245
|
-
logEvent('error', {
|
|
246
|
-
success: false,
|
|
247
|
-
error: error.message,
|
|
248
|
-
name: error.name,
|
|
249
|
-
});
|
|
250
|
-
process.exit(1);
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
main();
|
|
255
|
-
`;
|
|
256
|
-
/**
 * Create AI SDK agent with Vercel AI Gateway authentication.
 *
 * The returned agent uploads a fixture into a sandbox, runs the bundled
 * `ai-sdk-agent.mjs` CLI script there, and then runs the validation scripts
 * against the resulting workspace.
 *
 * @returns {object} Agent exposing `name`, `displayName`, `getApiKeyEnvVar()`,
 *   `getDefaultModel()` and `run(fixturePath, options)`.
 */
export function createAiSdkAgent() {
    return {
        name: 'vercel-ai-gateway/ai-sdk-harness',
        displayName: 'AI SDK Harness (Vercel AI Gateway)',
        /** @returns {string} Name of the environment variable holding the API key. */
        getApiKeyEnvVar() {
            return AI_GATEWAY.apiKeyEnvVar;
        },
        /** @returns {string} Model identifier used when the caller does not pick one. */
        getDefaultModel() {
            return 'anthropic/claude-sonnet-4';
        },
        /**
         * Run the agent against a fixture inside a fresh sandbox.
         *
         * Failures are reported through the returned object (`success: false`
         * plus `error`) rather than by throwing; this includes failures while
         * stopping the sandbox during cleanup.
         *
         * @param {string} fixturePath - Path passed to `collectLocalFiles`.
         * @param {object} options - Run options. Fields read here: `signal`,
         *   `timeout`, `sandbox`, `setup`, `prompt`, `model`, `apiKey`, `scripts`.
         * @returns {Promise<object>} Result with `success`, `output`, `duration`
         *   and, depending on how far the run got, `error`, `sandboxId`,
         *   `transcript`, `testResult`, `scriptsResults`, `generatedFiles`.
         */
        async run(fixturePath, options) {
            const startTime = Date.now();
            let sandbox = null;
            let agentOutput = '';
            let aborted = false;
            // Guards against stopping the sandbox twice (abort handler vs. finally).
            let sandboxStopped = false;
            // Handle abort signal
            const abortHandler = () => {
                aborted = true;
                if (sandbox && !sandboxStopped) {
                    sandboxStopped = true;
                    // Fire-and-forget: the handler is synchronous, and a failed stop
                    // must not surface as an unhandled rejection.
                    sandbox.stop().catch(() => { });
                }
            };
            if (options.signal) {
                if (options.signal.aborted) {
                    return {
                        success: false,
                        output: '',
                        error: 'Aborted before start',
                        duration: 0,
                    };
                }
                options.signal.addEventListener('abort', abortHandler);
            }
            try {
                // Collect files from fixture
                const allFiles = await collectLocalFiles(fixturePath);
                const { workspaceFiles, testFiles } = splitTestFiles(allFiles);
                // Check for abort before expensive operations
                if (aborted) {
                    return {
                        success: false,
                        output: '',
                        error: 'Aborted',
                        duration: Date.now() - startTime,
                    };
                }
                // Create sandbox
                sandbox = await createSandbox({
                    timeout: options.timeout,
                    runtime: 'node24',
                    backend: options.sandbox,
                });
                // Check for abort after sandbox creation. The handler could not stop
                // the sandbox because it did not exist yet; `finally` stops it.
                if (aborted) {
                    return {
                        success: false,
                        output: '',
                        error: 'Aborted',
                        duration: Date.now() - startTime,
                        sandboxId: sandbox.sandboxId,
                    };
                }
                // Upload workspace files (excluding tests)
                await sandbox.uploadFiles(workspaceFiles);
                // Run setup function if provided
                if (options.setup) {
                    await options.setup(sandbox);
                }
                // Install dependencies
                const installResult = await sandbox.runCommand('npm', ['install']);
                if (installResult.exitCode !== 0) {
                    throw new Error(`npm install failed: ${installResult.stderr}`);
                }
                // Install AI SDK dependencies
                const aiInstall = await sandbox.runCommand('npm', [
                    'install',
                    'ai@^5.0.11',
                    '@ai-sdk/gateway@^1.0.0',
                    'zod@^3.23.8',
                ]);
                if (aiInstall.exitCode !== 0) {
                    throw new Error(`AI SDK install failed: ${aiInstall.stderr}`);
                }
                // Write the CLI script to the sandbox
                await sandbox.writeFiles({
                    'ai-sdk-agent.mjs': CLI_SCRIPT,
                });
                // Verify no test files in sandbox
                await verifyNoTestFiles(sandbox);
                // Run the AI SDK agent
                const agentResult = await sandbox.runCommand('node', [
                    'ai-sdk-agent.mjs',
                    '--prompt',
                    options.prompt,
                    '--model',
                    options.model,
                ], {
                    env: {
                        [AI_GATEWAY.apiKeyEnvVar]: options.apiKey,
                    },
                });
                agentOutput = agentResult.stdout + agentResult.stderr;
                if (agentResult.exitCode !== 0) {
                    // Extract meaningful error from output
                    const errorLines = agentOutput.trim().split('\n').slice(-5).join('\n');
                    return {
                        success: false,
                        output: agentOutput,
                        error: errorLines || `AI SDK agent exited with code ${agentResult.exitCode}`,
                        duration: Date.now() - startTime,
                        sandboxId: sandbox.sandboxId,
                    };
                }
                // Upload test files for validation
                await sandbox.uploadFiles(testFiles);
                // Create vitest config for EVAL.ts/tsx
                await createVitestConfig(sandbox);
                // The agent outputs JSON events, use that as transcript
                const transcript = agentOutput;
                // Run validation scripts
                const validationResults = await runValidation(sandbox, options.scripts ?? []);
                // Capture generated files
                const generatedFiles = await captureGeneratedFiles(sandbox);
                return {
                    success: validationResults.allPassed,
                    output: agentOutput,
                    transcript,
                    duration: Date.now() - startTime,
                    testResult: validationResults.test,
                    scriptsResults: validationResults.scripts,
                    sandboxId: sandbox.sandboxId,
                    generatedFiles,
                };
            }
            catch (error) {
                // Check if this was an abort
                if (aborted) {
                    return {
                        success: false,
                        output: agentOutput,
                        error: 'Aborted',
                        duration: Date.now() - startTime,
                        sandboxId: sandbox?.sandboxId,
                    };
                }
                return {
                    success: false,
                    output: agentOutput,
                    error: error instanceof Error ? error.message : String(error),
                    duration: Date.now() - startTime,
                    sandboxId: sandbox?.sandboxId,
                };
            }
            finally {
                // Clean up abort listener
                if (options.signal) {
                    options.signal.removeEventListener('abort', abortHandler);
                }
                if (sandbox && !sandboxStopped) {
                    sandboxStopped = true;
                    try {
                        await sandbox.stop();
                    }
                    catch {
                        // Best-effort cleanup, matching the abort handler: a rejection
                        // thrown from `finally` would replace the result object that
                        // try/catch already produced.
                    }
                }
            }
        },
    };
}
|
|
427
|
-
//# sourceMappingURL=ai-sdk-agent.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"ai-sdk-agent.js","sourceRoot":"","sources":["../../../src/lib/agents/ai-sdk-agent.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,OAAO,EACL,aAAa,EACb,iBAAiB,EACjB,cAAc,EACd,iBAAiB,GAElB,MAAM,eAAe,CAAC;AAEvB,OAAO,EACL,aAAa,EACb,qBAAqB,EACrB,kBAAkB,EAClB,UAAU,GACX,MAAM,aAAa,CAAC;AAKrB;;;GAGG;AACH,MAAM,UAAU,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAoPlB,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,gBAAgB;IAC9B,OAAO;QACL,IAAI,EAAE,kCAAkC;QACxC,WAAW,EAAE,oCAAoC;QAEjD,eAAe;YACb,OAAO,UAAU,CAAC,YAAY,CAAC;QACjC,CAAC;QAED,eAAe;YACb,OAAO,2BAA2B,CAAC;QACrC,CAAC;QAED,KAAK,CAAC,GAAG,CAAC,WAAmB,EAAE,OAAwB;YACrD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC7B,IAAI,OAAO,GAAsB,IAAI,CAAC;YACtC,IAAI,WAAW,GAAG,EAAE,CAAC;YACrB,IAAI,OAAO,GAAG,KAAK,CAAC;YACpB,IAAI,cAAc,GAAG,KAAK,CAAC;YAE3B,sBAAsB;YACtB,MAAM,YAAY,GAAG,GAAG,EAAE;gBACxB,OAAO,GAAG,IAAI,CAAC;gBACf,IAAI,OAAO,IAAI,CAAC,cAAc,EAAE,CAAC;oBAC/B,cAAc,GAAG,IAAI,CAAC;oBACtB,OAAO,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;gBACjC,CAAC;YACH,CAAC,CAAC;YAEF,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;gBACnB,IAAI,OAAO,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;oBAC3B,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,EAAE;wBACV,KAAK,EAAE,sBAAsB;wBAC7B,QAAQ,EAAE,CAAC;qBACZ,CAAC;gBACJ,CAAC;gBACD,OAAO,CAAC,MAAM,CAAC,gBAAgB,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;YACzD,CAAC;YAED,IAAI,CAAC;gBACH,6BAA6B;gBAC7B,MAAM,QAAQ,GAAG,MAAM,iBAAiB,CAAC,WAAW,CAAC,CAAC;gBACtD,MAAM,EAAE,cAAc,EAAE,SAAS,EAAE,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;gBAE/D,8CAA8C;gBAC9C,IAAI,OAAO,EAAE,CAAC;oBACZ,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,EAAE;wBACV,KAAK,EAAE,SAAS;wBAChB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;qBACjC,CAAC;gBACJ,CAAC;gBAED,iBAAiB;gBACjB,OAAO,GAAG,MAAM,aAAa,CAAC;oBAC5B,OAAO,EAAE,OAAO,CAAC,OAAO;oBACxB,OAAO,EAAE,QAAQ;oBACjB,OAAO,EAAE,OAAO,CAAC,OAAO;iBACzB,CAAC,CAAC;gBAEH,yCAAyC;gBACzC,IA
AI,OAAO,EAAE,CAAC;oBACZ,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,EAAE;wBACV,KAAK,EAAE,SAAS;wBAChB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;wBAChC,SAAS,EAAE,OAAO,CAAC,SAAS;qBAC7B,CAAC;gBACJ,CAAC;gBAED,2CAA2C;gBAC3C,MAAM,OAAO,CAAC,WAAW,CAAC,cAAc,CAAC,CAAC;gBAE1C,iCAAiC;gBACjC,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;oBAClB,MAAM,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;gBAC/B,CAAC;gBAED,uBAAuB;gBACvB,MAAM,aAAa,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,KAAK,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC;gBACnE,IAAI,aAAa,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBACjC,MAAM,IAAI,KAAK,CAAC,uBAAuB,aAAa,CAAC,MAAM,EAAE,CAAC,CAAC;gBACjE,CAAC;gBAED,8BAA8B;gBAC9B,MAAM,SAAS,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,KAAK,EAAE;oBAChD,SAAS;oBACT,YAAY;oBACZ,wBAAwB;oBACxB,aAAa;iBACd,CAAC,CAAC;gBACH,IAAI,SAAS,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBAC7B,MAAM,IAAI,KAAK,CAAC,0BAA0B,SAAS,CAAC,MAAM,EAAE,CAAC,CAAC;gBAChE,CAAC;gBAED,sCAAsC;gBACtC,MAAM,OAAO,CAAC,UAAU,CAAC;oBACvB,kBAAkB,EAAE,UAAU;iBAC/B,CAAC,CAAC;gBAEH,kCAAkC;gBAClC,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;gBAEjC,uBAAuB;gBACvB,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,UAAU,CAC1C,MAAM,EACN;oBACE,kBAAkB;oBAClB,UAAU;oBACV,OAAO,CAAC,MAAM;oBACd,SAAS;oBACT,OAAO,CAAC,KAAK;iBACd,EACD;oBACE,GAAG,EAAE;wBACH,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,OAAO,CAAC,MAAM;qBAC1C;iBACF,CACF,CAAC;gBAEF,WAAW,GAAG,WAAW,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC;gBAEtD,IAAI,WAAW,CAAC,QAAQ,KAAK,CAAC,EAAE,CAAC;oBAC/B,uCAAuC;oBACvC,MAAM,UAAU,GAAG,WAAW,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;oBACvE,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,WAAW;wBACnB,KAAK,EAAE,UAAU,IAAI,iCAAiC,WAAW,CAAC,QAAQ,EAAE;wBAC5E,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;wBAChC,SAAS,EAAE,OAAO,CAAC,SAAS;qBAC7B,CAAC;gBACJ,CAAC;gBAED,mCAAmC;gBACnC,MAAM,OAAO,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;gBAErC,uCAAuC;gBACvC,MAAM,kBAAkB,CAAC,OAAO,CAAC,CAAC;gBAElC,wDAAwD;gBACxD,MAAM,UAAU,GAAG,WAAW,CAAC;gBAE/B,yBAAyB;gBACzB,MAAM,iBAAiB,GAAG,MAAM,aAAa,CAAC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC;gBAE9E,0BAA0B;gBAC1B,MAAM,cAAc,GAAG,MAAM,qBAAqB,CAAC,OAAO,CAAC
,CAAC;gBAE5D,OAAO;oBACL,OAAO,EAAE,iBAAiB,CAAC,SAAS;oBACpC,MAAM,EAAE,WAAW;oBACnB,UAAU;oBACV,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;oBAChC,UAAU,EAAE,iBAAiB,CAAC,IAAI;oBAClC,cAAc,EAAE,iBAAiB,CAAC,OAAO;oBACzC,SAAS,EAAE,OAAO,CAAC,SAAS;oBAC5B,cAAc;iBACf,CAAC;YACJ,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,6BAA6B;gBAC7B,IAAI,OAAO,EAAE,CAAC;oBACZ,OAAO;wBACL,OAAO,EAAE,KAAK;wBACd,MAAM,EAAE,WAAW;wBACnB,KAAK,EAAE,SAAS;wBAChB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;wBAChC,SAAS,EAAE,OAAO,EAAE,SAAS;qBAC9B,CAAC;gBACJ,CAAC;gBACD,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,MAAM,EAAE,WAAW;oBACnB,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;oBAC7D,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;oBAChC,SAAS,EAAE,OAAO,EAAE,SAAS;iBAC9B,CAAC;YACJ,CAAC;oBAAS,CAAC;gBACT,0BAA0B;gBAC1B,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;oBACnB,OAAO,CAAC,MAAM,CAAC,mBAAmB,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;gBAC5D,CAAC;gBACD,IAAI,OAAO,IAAI,CAAC,cAAc,EAAE,CAAC;oBAC/B,cAAc,GAAG,IAAI,CAAC;oBACtB,MAAM,OAAO,CAAC,IAAI,EAAE,CAAC;gBACvB,CAAC;YACH,CAAC;QACH,CAAC;KACF,CAAC;AACJ,CAAC"}
|