agent-eval-opencode 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +6 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +590 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +30 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/agents/claude-code.d.ts +12 -0
- package/dist/lib/agents/claude-code.d.ts.map +1 -0
- package/dist/lib/agents/claude-code.js +231 -0
- package/dist/lib/agents/claude-code.js.map +1 -0
- package/dist/lib/agents/codex.d.ts +12 -0
- package/dist/lib/agents/codex.d.ts.map +1 -0
- package/dist/lib/agents/codex.js +267 -0
- package/dist/lib/agents/codex.js.map +1 -0
- package/dist/lib/agents/cursor.d.ts +10 -0
- package/dist/lib/agents/cursor.d.ts.map +1 -0
- package/dist/lib/agents/cursor.js +204 -0
- package/dist/lib/agents/cursor.js.map +1 -0
- package/dist/lib/agents/gemini.d.ts +10 -0
- package/dist/lib/agents/gemini.d.ts.map +1 -0
- package/dist/lib/agents/gemini.js +207 -0
- package/dist/lib/agents/gemini.js.map +1 -0
- package/dist/lib/agents/index.d.ts +7 -0
- package/dist/lib/agents/index.d.ts.map +1 -0
- package/dist/lib/agents/index.js +20 -0
- package/dist/lib/agents/index.js.map +1 -0
- package/dist/lib/agents/opencode.d.ts +11 -0
- package/dist/lib/agents/opencode.d.ts.map +1 -0
- package/dist/lib/agents/opencode.js +245 -0
- package/dist/lib/agents/opencode.js.map +1 -0
- package/dist/lib/agents/registry.d.ts +23 -0
- package/dist/lib/agents/registry.d.ts.map +1 -0
- package/dist/lib/agents/registry.js +35 -0
- package/dist/lib/agents/registry.js.map +1 -0
- package/dist/lib/agents/shared.d.ts +83 -0
- package/dist/lib/agents/shared.d.ts.map +1 -0
- package/dist/lib/agents/shared.js +192 -0
- package/dist/lib/agents/shared.js.map +1 -0
- package/dist/lib/agents/types.d.ts +73 -0
- package/dist/lib/agents/types.d.ts.map +1 -0
- package/dist/lib/agents/types.js +5 -0
- package/dist/lib/agents/types.js.map +1 -0
- package/dist/lib/classifier.d.ts +89 -0
- package/dist/lib/classifier.d.ts.map +1 -0
- package/dist/lib/classifier.js +285 -0
- package/dist/lib/classifier.js.map +1 -0
- package/dist/lib/config.d.ts +37 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +187 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/dashboard.d.ts +65 -0
- package/dist/lib/dashboard.d.ts.map +1 -0
- package/dist/lib/dashboard.js +237 -0
- package/dist/lib/dashboard.js.map +1 -0
- package/dist/lib/docker-sandbox.d.ts +92 -0
- package/dist/lib/docker-sandbox.d.ts.map +1 -0
- package/dist/lib/docker-sandbox.js +375 -0
- package/dist/lib/docker-sandbox.js.map +1 -0
- package/dist/lib/fingerprint.d.ts +15 -0
- package/dist/lib/fingerprint.d.ts.map +1 -0
- package/dist/lib/fingerprint.js +59 -0
- package/dist/lib/fingerprint.js.map +1 -0
- package/dist/lib/fixture.d.ts +55 -0
- package/dist/lib/fixture.d.ts.map +1 -0
- package/dist/lib/fixture.js +215 -0
- package/dist/lib/fixture.js.map +1 -0
- package/dist/lib/housekeeping.d.ts +26 -0
- package/dist/lib/housekeeping.d.ts.map +1 -0
- package/dist/lib/housekeeping.js +170 -0
- package/dist/lib/housekeeping.js.map +1 -0
- package/dist/lib/init.d.ts +21 -0
- package/dist/lib/init.d.ts.map +1 -0
- package/dist/lib/init.js +275 -0
- package/dist/lib/init.js.map +1 -0
- package/dist/lib/o11y/index.d.ts +13 -0
- package/dist/lib/o11y/index.d.ts.map +1 -0
- package/dist/lib/o11y/index.js +13 -0
- package/dist/lib/o11y/index.js.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts +18 -0
- package/dist/lib/o11y/parsers/claude-code.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/claude-code.js +343 -0
- package/dist/lib/o11y/parsers/claude-code.js.map +1 -0
- package/dist/lib/o11y/parsers/codex.d.ts +17 -0
- package/dist/lib/o11y/parsers/codex.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/codex.js +364 -0
- package/dist/lib/o11y/parsers/codex.js.map +1 -0
- package/dist/lib/o11y/parsers/cursor.d.ts +21 -0
- package/dist/lib/o11y/parsers/cursor.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/cursor.js +226 -0
- package/dist/lib/o11y/parsers/cursor.js.map +1 -0
- package/dist/lib/o11y/parsers/gemini.d.ts +21 -0
- package/dist/lib/o11y/parsers/gemini.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/gemini.js +241 -0
- package/dist/lib/o11y/parsers/gemini.js.map +1 -0
- package/dist/lib/o11y/parsers/index.d.ts +55 -0
- package/dist/lib/o11y/parsers/index.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/index.js +284 -0
- package/dist/lib/o11y/parsers/index.js.map +1 -0
- package/dist/lib/o11y/parsers/opencode.d.ts +17 -0
- package/dist/lib/o11y/parsers/opencode.d.ts.map +1 -0
- package/dist/lib/o11y/parsers/opencode.js +320 -0
- package/dist/lib/o11y/parsers/opencode.js.map +1 -0
- package/dist/lib/o11y/types.d.ts +113 -0
- package/dist/lib/o11y/types.d.ts.map +1 -0
- package/dist/lib/o11y/types.js +6 -0
- package/dist/lib/o11y/types.js.map +1 -0
- package/dist/lib/results.d.ts +91 -0
- package/dist/lib/results.d.ts.map +1 -0
- package/dist/lib/results.js +361 -0
- package/dist/lib/results.js.map +1 -0
- package/dist/lib/runner.d.ts +71 -0
- package/dist/lib/runner.d.ts.map +1 -0
- package/dist/lib/runner.js +267 -0
- package/dist/lib/runner.js.map +1 -0
- package/dist/lib/sandbox.d.ts +173 -0
- package/dist/lib/sandbox.d.ts.map +1 -0
- package/dist/lib/sandbox.js +337 -0
- package/dist/lib/sandbox.js.map +1 -0
- package/dist/lib/types.d.ts +258 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +15 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/test-setup.d.ts +2 -0
- package/dist/test-setup.d.ts.map +1 -0
- package/dist/test-setup.js +6 -0
- package/dist/test-setup.js.map +1 -0
- package/package.json +72 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent interface and common types for all agents.
|
|
3
|
+
*/
|
|
4
|
+
import type { ModelTier, SetupFunction, SandboxBackend } from '../types.js';
|
|
5
|
+
/**
 * Options shared by every agent's `run` call.
 */
export interface AgentRunOptions {
    /** Prompt (task text) handed to the agent. */
    prompt: string;
    /** Model to run with; how it is interpreted is agent-specific. */
    model: ModelTier;
    /** Time limit for the run, in milliseconds. */
    timeout: number;
    /** API key the agent authenticates with. */
    apiKey: string;
    /** Optional setup function executed before the agent starts. */
    setup?: SetupFunction;
    /** Names of npm scripts to run once the agent has completed. */
    scripts?: Array<string>;
    /** Abort signal that cancels the run. */
    signal?: AbortSignal;
    /** Sandbox backend to use. */
    sandbox?: 'auto' | SandboxBackend;
}
|
|
26
|
+
/**
 * Outcome of running a single script.
 */
export interface ScriptResult {
    /** Whether the script succeeded. */
    success: boolean;
    /** Output of the script (presumably its captured stdout/stderr; confirm against the runner). */
    output: string;
}
|
|
33
|
+
/**
 * What an agent run produces.
 */
export interface AgentRunResult {
    /** True when the run was successful. */
    success: boolean;
    /** Raw agent output (stdout/stderr). */
    output: string;
    /** Structured transcript (JSONL format for Claude, JSON for Codex). */
    transcript?: string;
    /** Error message, present when the run failed. */
    error?: string;
    /** How long the run took, in milliseconds. */
    duration: number;
    /** Result of the EVAL.ts test run (EVAL.ts always runs). */
    testResult?: ScriptResult;
    /** Results of the configured scripts, keyed by string (presumably the script name; confirm). */
    scriptsResults?: Record<string, ScriptResult>;
    /** Sandbox ID, kept for debugging. */
    sandboxId?: string;
    /** Files generated by the agent. */
    generatedFiles?: Record<string, string>;
    /** Files deleted by the agent. */
    deletedFiles?: Array<string>;
}
|
|
58
|
+
/**
 * Contract every agent implementation must fulfil.
 */
export interface Agent {
    /** Unique identifier of the agent. */
    name: string;
    /** Human-readable name for display. */
    displayName: string;
    /**
     * Runs the agent on a fixture.
     *
     * @param fixturePath - Path of the fixture to run against.
     * @param options - Run options (prompt, model, timeout, ...).
     * @returns A promise of the run result.
     */
    run(fixturePath: string, options: AgentRunOptions): Promise<AgentRunResult>;
    /** Returns the name of the environment variable holding this agent's API key. */
    getApiKeyEnvVar(): string;
    /** Returns the model this agent uses by default. */
    getDefaultModel(): ModelTier;
}
|
|
73
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/lib/agents/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAE5E;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,oCAAoC;IACpC,MAAM,EAAE,MAAM,CAAC;IACf,oCAAoC;IACpC,KAAK,EAAE,SAAS,CAAC;IACjB,8BAA8B;IAC9B,OAAO,EAAE,MAAM,CAAC;IAChB,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,kDAAkD;IAClD,KAAK,CAAC,EAAE,aAAa,CAAC;IACtB,+CAA+C;IAC/C,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,qCAAqC;IACrC,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,6BAA6B;IAC7B,OAAO,CAAC,EAAE,cAAc,GAAG,MAAM,CAAC;CACnC;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,qCAAqC;IACrC,OAAO,EAAE,OAAO,CAAC;IACjB,gDAAgD;IAChD,MAAM,EAAE,MAAM,CAAC;IACf,sEAAsE;IACtE,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,+BAA+B;IAC/B,QAAQ,EAAE,MAAM,CAAC;IACjB,wCAAwC;IACxC,UAAU,CAAC,EAAE,YAAY,CAAC;IAC1B,sCAAsC;IACtC,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;IAC9C,+BAA+B;IAC/B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,mCAAmC;IACnC,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxC,iCAAiC;IACjC,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,sCAAsC;IACtC,IAAI,EAAE,MAAM,CAAC;IAEb,kCAAkC;IAClC,WAAW,EAAE,MAAM,CAAC;IAEpB,iCAAiC;IACjC,GAAG,CAAC,WAAW,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IAE5E,+DAA+D;IAC/D,eAAe,IAAI,MAAM,CAAC;IAE1B,2CAA2C;IAC3C,eAAe,IAAI,SAAS,CAAC;CAC9B"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/agents/types.ts"],"names":[],"mappings":"AAAA;;GAEG"}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Failure classification for eval results.
|
|
3
|
+
*
|
|
4
|
+
* Classifies failed eval runs as:
|
|
5
|
+
* - "model" — the model tried but wrote incorrect code
|
|
6
|
+
* - "infra" — infrastructure broke (API errors, rate limits, crashes)
|
|
7
|
+
* - "timeout" — the run hit its time limit
|
|
8
|
+
*
|
|
9
|
+
* Uses AI classification via the Vercel AI Gateway. Requires AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN.
|
|
10
|
+
*/
|
|
11
|
+
import type { Classification } from './types.js';
|
|
12
|
+
/**
 * Reports whether the classifier feature is enabled.
 *
 * The classifier needs AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN to be set. When
 * neither is available, classification is disabled and housekeeping won't
 * clean up non-model failures.
 *
 * @returns `true` when the classifier can be used.
 */
export declare function isClassifierEnabled(): boolean;
|
|
18
|
+
/**
 * Creates the sandboxed, read-only tools used by the AI classifier.
 *
 * @param evalResultDir - Eval result directory the tools operate on.
 * @returns The `list_files`, `read_file` and `grep` tools. Each tool result is
 *   either an `{ error }` object or the tool's success payload.
 */
export declare function createClassifierTools(evalResultDir: string): {
    list_files: import("ai").Tool<{
        path: string;
    }, {
        error: string;
        entries?: undefined;
    } | {
        entries: Array<{
            name: string;
            type: "file" | "dir";
        }>;
        error?: undefined;
    }>;
    read_file: import("ai").Tool<{
        path: string;
        offset?: number | undefined;
        limit?: number | undefined;
    }, {
        error: string;
        content?: undefined;
        totalLines?: undefined;
        showing?: undefined;
    } | {
        content: string;
        totalLines: number;
        showing: string;
        error?: undefined;
    }>;
    grep: import("ai").Tool<{
        path: string;
        pattern: string;
        maxResults?: number | undefined;
    }, {
        error: string;
        matches?: undefined;
        totalFound?: undefined;
        truncated?: undefined;
    } | {
        matches: Array<{
            file: string;
            line: number;
            text: string;
        }>;
        totalFound: number;
        truncated: boolean;
        error?: undefined;
    }>;
};
|
|
69
|
+
/**
 * Classifies a failure with AI through the Vercel AI Gateway.
 * Needs AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN in the environment.
 *
 * @param evalResultDir - Directory holding the eval result to inspect.
 * @param evalName - Name of the eval being classified.
 * @param experimentName - Name of the experiment the eval belongs to.
 * @returns The classification, or `null` when none was produced.
 */
export declare function classifyWithAI(
    evalResultDir: string,
    evalName: string,
    experimentName: string,
): Promise<Classification | null>;
|
|
74
|
+
/**
 * Classifies a failed eval result with AI.
 * Needs AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN in the environment.
 *
 * Results are cached in classification.json inside the eval result directory.
 *
 * @param evalResultDir - Directory holding the eval result (and the cache file).
 * @param evalName - Name of the eval being classified.
 * @param experimentName - Name of the experiment the eval belongs to.
 * @returns The classification, or `null` when none was produced.
 */
export declare function classifyFailure(
    evalResultDir: string,
    evalName: string,
    experimentName: string,
): Promise<Classification | null>;
|
|
81
|
+
/**
 * Tells whether an eval result was classified as a non-model failure
 * (infra or timeout).
 *
 * The answer comes from classification.json, the single source of truth for
 * classification data. Acknowledged failures (--ack-failures) yield `false`
 * because they are intentionally kept as final results.
 *
 * @param evalResultDir - Directory holding the eval result.
 * @returns `true` for an unacknowledged infra/timeout classification.
 */
export declare function isNonModelFailure(evalResultDir: string): boolean;
|
|
89
|
+
//# sourceMappingURL=classifier.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../../src/lib/classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAMH,OAAO,KAAK,EAAE,cAAc,EAAe,MAAM,YAAY,CAAC;AAE9D;;;;GAIG;AACH,wBAAgB,mBAAmB,IAAI,OAAO,CAE7C;AA8BD;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,aAAa,EAAE,MAAM;;;;;;;;kBAepB,MAAM;kBAAQ,MAAM,GAAG,KAAK;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kBAkE9B,MAAM;kBAAQ,MAAM;kBAAQ,MAAM;;;;;;EA0DtE;AAED;;;GAGG;AACH,wBAAsB,cAAc,CAClC,aAAa,EAAE,MAAM,EACrB,QAAQ,EAAE,MAAM,EAChB,cAAc,EAAE,MAAM,GACrB,OAAO,CAAC,cAAc,GAAG,IAAI,CAAC,CAwChC;AAED;;;;;GAKG;AACH,wBAAsB,eAAe,CACnC,aAAa,EAAE,MAAM,EACrB,QAAQ,EAAE,MAAM,EAChB,cAAc,EAAE,MAAM,GACrB,OAAO,CAAC,cAAc,GAAG,IAAI,CAAC,CAyBhC;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,aAAa,EAAE,MAAM,GAAG,OAAO,CAQhE"}
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Failure classification for eval results.
|
|
3
|
+
*
|
|
4
|
+
* Classifies failed eval runs as:
|
|
5
|
+
* - "model" — the model tried but wrote incorrect code
|
|
6
|
+
* - "infra" — infrastructure broke (API errors, rate limits, crashes)
|
|
7
|
+
* - "timeout" — the run hit its time limit
|
|
8
|
+
*
|
|
9
|
+
* Uses AI classification via the Vercel AI Gateway. Requires AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN.
|
|
10
|
+
*/
|
|
11
|
+
import { readFileSync, readdirSync, statSync, writeFileSync } from 'fs';
|
|
12
|
+
import { join, resolve, sep } from 'path';
|
|
13
|
+
import { tool } from 'ai';
|
|
14
|
+
import { z } from 'zod';
|
|
15
|
+
/**
 * Reports whether the classifier feature is enabled.
 *
 * The classifier is enabled when AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN is
 * set to a non-empty value. When neither is available, classification is
 * disabled and housekeeping won't clean up non-model failures.
 *
 * @returns {boolean} `true` when at least one of the two variables is non-empty.
 */
export function isClassifierEnabled() {
    const { AI_GATEWAY_API_KEY: gatewayKey, VERCEL_OIDC_TOKEN: oidcToken } = process.env;
    return Boolean(gatewayKey || oidcToken);
}
|
|
23
|
+
// System prompt sent with every classification request. It defines the three
// failure categories, describes the layout of the eval result directory, and
// tells the model to treat the transcript as the key evidence.
const CLASSIFIER_SYSTEM_PROMPT = `You are a failure classifier for an AI coding agent benchmark.

Your job: figure out WHY a failed eval run failed. Each eval tests whether an AI model can complete a coding task (e.g. migrate to App Router, add a Next.js feature). You have tools to explore the result files.

Classify into one of:
- "model" — the model tried but wrote incorrect code
- "infra" — infrastructure broke (API errors, rate limits, crashes) and the model never got to do real work
- "timeout" — the run hit its time limit

The eval result directory contains run-1/ through run-N/ subdirectories (one per attempt, N depends on config), plus a summary.json. Each run directory has:
- result.json — status, error, duration
- transcript.json or transcript-raw.jsonl (or older results may have transcript.jsonl) — the agent's conversation log
- outputs/eval.txt — EVAL.ts test output
- outputs/scripts/*.txt — npm script outputs (e.g. build.txt), if the experiment configured scripts

IMPORTANT: The eval harness always runs EVAL.ts tests after the agent finishes, plus any npm scripts configured in the experiment's \`scripts\` array (e.g. \`["build"]\`). These run even if the model produced nothing — tests just run against unmodified scaffold code (TODO placeholders). So test/script failures alone do NOT mean the model wrote code.

The transcript is the key evidence. It records every action the model took. If there is no transcript file, or the transcript only shows errors (no tool calls or text output from the model), the model never actually ran — that's "infra". Only classify as "model" if you see evidence in the transcript that the model actually generated code.`;
|
|
41
|
+
/**
 * Resolves `relativePath` against `root` and confines the result to `root`.
 *
 * The check is purely lexical: symlinks are not resolved, so a symlink inside
 * `root` that points elsewhere is not detected here.
 *
 * @param {string} root - Directory the caller is allowed to access.
 * @param {string} relativePath - Path to resolve relative to `root`.
 * @returns {string | null} The absolute resolved path when it is `root` itself
 *   or lies beneath it, otherwise `null`.
 */
function safePath(root, relativePath) {
    // Normalise the root first: resolve() always returns an absolute, normalised
    // path, so comparing against a relative or trailing-slash root would fail.
    const base = resolve(root);
    const resolved = resolve(base, relativePath);
    if (resolved === base)
        return resolved;
    // Compare against "root + separator" so that a sibling directory sharing the
    // same prefix (e.g. "/results/eval-secret" vs "/results/eval") is rejected.
    const prefix = base.endsWith(sep) ? base : base + sep;
    if (!resolved.startsWith(prefix))
        return null;
    return resolved;
}
|
|
50
|
+
/**
 * Creates the sandboxed, read-only tools used by the AI classifier.
 *
 * Each tool takes a `path` relative to `evalResultDir` and resolves it with
 * `safePath`; a rejected path yields `{ error: 'Path outside allowed directory' }`.
 * Problems (unreadable paths, invalid search patterns) are reported as
 * `{ error }` results instead of being thrown.
 *
 * @param {string} evalResultDir - Root of the eval result directory.
 * @returns An object holding the `list_files`, `read_file` and `grep` tools.
 */
export function createClassifierTools(evalResultDir) {
    return {
        list_files: tool({
            description: 'List files and directories at a path relative to the eval result root. Use "." for the root.',
            inputSchema: z.object({
                path: z
                    .string()
                    .describe('Relative path to list, e.g. "." or "run-1" or "run-1/outputs"'),
            }),
            execute: async ({ path: relPath }) => {
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                try {
                    const entries = readdirSync(target);
                    const results = [];
                    for (const entry of entries.sort()) {
                        const info = statSync(join(target, entry));
                        results.push({ name: entry, type: info.isDirectory() ? 'dir' : 'file' });
                    }
                    return { entries: results };
                }
                catch {
                    return { error: `Cannot list: ${relPath}` };
                }
            },
        }),
        read_file: tool({
            description: 'Read a file relative to the eval result root. For large files, use offset/limit to paginate.',
            inputSchema: z.object({
                path: z
                    .string()
                    .describe('Relative path to the file, e.g. "run-1/result.json"'),
                offset: z
                    .number()
                    .describe('Line offset to start reading from (0-based)')
                    .optional(),
                limit: z
                    .number()
                    .describe('Max number of lines to return')
                    .optional(),
            }),
            execute: async ({ path: relPath, offset: rawOffset, limit: rawLimit }) => {
                // Clamp to non-negative integers: a negative offset would make
                // slice() count from the end of the file and produce a misleading
                // "showing" label.
                const offset = Math.max(0, Math.trunc(rawOffset ?? 0));
                const limit = Math.max(0, Math.trunc(rawLimit ?? 200));
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                try {
                    const content = readFileSync(target, 'utf-8');
                    const lines = content.split('\n');
                    const sliced = lines.slice(offset, offset + limit);
                    return {
                        content: sliced.join('\n'),
                        totalLines: lines.length,
                        showing: `lines ${offset}-${Math.min(offset + limit, lines.length)} of ${lines.length}`,
                    };
                }
                catch {
                    return { error: `Cannot read: ${relPath}` };
                }
            },
        }),
        grep: tool({
            description: 'Search for a pattern in files under a directory. Returns matching lines with context.',
            inputSchema: z.object({
                pattern: z.string().describe('Text or regex pattern to search for'),
                path: z
                    .string()
                    .describe('Relative directory or file to search in, e.g. "." or "run-1"'),
                maxResults: z
                    .number()
                    .describe('Max number of matches to return')
                    .optional(),
            }),
            execute: async ({ pattern, path: relPath, maxResults: rawMax }) => {
                const maxResults = rawMax ?? 20;
                const target = safePath(evalResultDir, relPath);
                if (!target)
                    return { error: 'Path outside allowed directory' };
                // The pattern comes from the model and may not be a valid regex;
                // report that as a tool error rather than throwing out of execute().
                let regex;
                try {
                    regex = new RegExp(pattern, 'i');
                }
                catch {
                    return { error: `Invalid pattern: ${pattern}` };
                }
                const matches = [];
                // Collects matching lines of one file (1-based line numbers, text
                // capped at 500 characters) until maxResults is reached.
                async function searchFile(filePath, relName) {
                    try {
                        const content = readFileSync(filePath, 'utf-8');
                        const lines = content.split('\n');
                        for (let i = 0; i < lines.length && matches.length < maxResults; i++) {
                            if (regex.test(lines[i])) {
                                matches.push({
                                    file: relName,
                                    line: i + 1,
                                    text: lines[i].slice(0, 500),
                                });
                            }
                        }
                    }
                    catch {
                        // Skip unreadable files
                    }
                }
                // Walks a directory recursively, searching every file beneath it.
                async function searchDir(dirPath, prefix) {
                    try {
                        const entries = readdirSync(dirPath);
                        for (const entry of entries) {
                            if (matches.length >= maxResults)
                                break;
                            const full = join(dirPath, entry);
                            const rel = prefix ? `${prefix}/${entry}` : entry;
                            const info = statSync(full);
                            if (info.isDirectory()) {
                                await searchDir(full, rel);
                            }
                            else {
                                await searchFile(full, rel);
                            }
                        }
                    }
                    catch {
                        // Skip unreadable dirs
                    }
                }
                try {
                    const info = statSync(target);
                    if (info.isDirectory()) {
                        // Searching from the root reports file names without a "./" prefix.
                        await searchDir(target, relPath === '.' ? '' : relPath);
                    }
                    else {
                        await searchFile(target, relPath);
                    }
                }
                catch {
                    return { error: `Path not found: ${relPath}` };
                }
                return {
                    matches,
                    totalFound: matches.length,
                    // True whenever the cap was reached, even if no further match exists.
                    truncated: matches.length >= maxResults,
                };
            },
        }),
    };
}
|
|
195
|
+
/**
 * Classify a failure using AI via the Vercel AI Gateway.
 * Requires AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN in the environment.
 *
 * The model gets the read-only exploration tools plus a `classify` tool; the
 * generation stops once `classify` has been called and the submitted verdict
 * is returned.
 *
 * @param {string} evalResultDir - Eval result directory the tools may explore.
 * @param {string} evalName - Name of the eval, included in the prompt.
 * @param {string} experimentName - Name of the experiment, included in the prompt.
 * @returns `{ failureType, failureReason }` as submitted through `classify`, or
 *   `null` when the model never called it or the request failed.
 */
export async function classifyWithAI(evalResultDir, evalName, experimentName) {
    const { generateText, hasToolCall, createGateway } = await import('ai');
    // Use `||` rather than `??` so an empty AI_GATEWAY_API_KEY falls through to
    // VERCEL_OIDC_TOKEN, matching the truthiness check in isClassifierEnabled().
    const apiKey = process.env.AI_GATEWAY_API_KEY || process.env.VERCEL_OIDC_TOKEN || '';
    const gateway = createGateway({ apiKey });
    let classification = null;
    const explorationTools = createClassifierTools(evalResultDir);
    const allTools = {
        ...explorationTools,
        classify: tool({
            description: 'Submit your final classification. Call this once you have enough evidence.',
            inputSchema: z.object({
                failureType: z
                    .enum(['model', 'infra', 'timeout'])
                    .describe('The failure category'),
                failureReason: z
                    .string()
                    .describe('Brief 1-2 sentence explanation of why'),
            }),
            execute: async ({ failureType, failureReason }) => {
                // Capture the verdict; it is returned once generateText() settles.
                classification = { failureType, failureReason };
                return { ok: true };
            },
        }),
    };
    try {
        await generateText({
            model: gateway('anthropic/claude-haiku-4-5-20251001'),
            system: CLASSIFIER_SYSTEM_PROMPT,
            prompt: `Classify the failure for eval "${evalName}" (experiment: ${experimentName}). Use the exploration tools to investigate, then call classify() with your verdict.`,
            tools: allTools,
            stopWhen: hasToolCall('classify'),
        });
        return classification;
    }
    catch {
        // Best effort: any gateway/model error means "no classification".
        return null;
    }
}
|
|
236
|
+
/**
 * Classifies a failed eval result with AI, going through a file cache.
 * Needs AI_GATEWAY_API_KEY or VERCEL_OIDC_TOKEN in the environment.
 *
 * A classification.json in the eval result directory that carries both
 * `failureType` and `failureReason` is returned as-is. Otherwise the AI
 * classifier runs and a non-null verdict is written back to that file.
 *
 * @param {string} evalResultDir - Eval result directory (also holds the cache file).
 * @param {string} evalName - Name of the eval being classified.
 * @param {string} experimentName - Name of the experiment the eval belongs to.
 * @returns The cached or freshly computed classification, or `null`.
 */
export async function classifyFailure(evalResultDir, evalName, experimentName) {
    const cacheFile = join(evalResultDir, 'classification.json');
    // Cache lookup: a missing, unreadable or malformed file counts as a miss.
    try {
        const { failureType, failureReason } = JSON.parse(readFileSync(cacheFile, 'utf-8'));
        if (failureType && failureReason) {
            return { failureType, failureReason };
        }
    }
    catch {
        // No cache
    }
    const verdict = await classifyWithAI(evalResultDir, evalName, experimentName);
    if (!verdict) {
        // Nothing to cache when the classifier produced no verdict.
        return verdict;
    }
    try {
        writeFileSync(cacheFile, JSON.stringify(verdict, null, 2));
    }
    catch {
        // Non-fatal: caching failed
    }
    return verdict;
}
|
|
267
|
+
/**
 * Tells whether an eval result was classified as a non-model failure
 * (infra or timeout).
 *
 * The answer comes from classification.json, the single source of truth for
 * classification data. Acknowledged failures (--ack-failures) yield `false`
 * because they are intentionally kept as final results. A missing or
 * unparsable file also yields `false`.
 *
 * @param {string} evalResultDir - Eval result directory containing classification.json.
 * @returns {boolean} `true` only for an unacknowledged classification whose
 *   `failureType` is set and is not 'model'.
 */
export function isNonModelFailure(evalResultDir) {
    try {
        const raw = readFileSync(join(evalResultDir, 'classification.json'), 'utf-8');
        const { acknowledged, failureType } = JSON.parse(raw);
        return !acknowledged && failureType != null && failureType !== 'model';
    }
    catch {
        return false;
    }
}
|
|
285
|
+
//# sourceMappingURL=classifier.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../../src/lib/classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AACxE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,EAAE,IAAI,EAAE,MAAM,IAAI,CAAC;AAC1B,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB;;;;GAIG;AACH,MAAM,UAAU,mBAAmB;IACjC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,kBAAkB,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAC;AAC7E,CAAC;AAED,MAAM,wBAAwB,GAAG;;;;;;;;;;;;;;;;;qVAiBoT,CAAC;AAEtV;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,YAAoB;IAClD,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;IAC7C,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5C,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CAAC,aAAqB;IACzD,OAAO;QACL,UAAU,EAAE,IAAI,CAAC;YACf,WAAW,EACT,8FAA8F;YAChG,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,+DAA+D,CAAC;aAC7E,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE;gBACnC,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;oBACpC,MAAM,OAAO,GAAkD,EAAE,CAAC;oBAClE,KAAK,MAAM,KAAK,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;wBACnC,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;wBAC3C,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;oBAC3E,CAAC;oBACD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;gBAC9B,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,gBAAgB,OAAO,EAAE,EAAE,CAAC;gBAC9C,CAAC;YACH,CAAC;SACF,CAAC;QAEF,SAAS,EAAE,IAAI,CAAC;YACd,WAAW,EACT,8FAA8F;YAChG,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,qDAAqD,CAAC;gBAClE,MAAM,EAAE,CAAC;qBACN,MAAM,EAAE;qBACR,QAAQ,CAAC,6CAA6C,CAAC;qBACvD,QAAQ,EAAE;gBACb,KAAK,EAAE,CAAC;qBACL,MAAM,EAAE;qBACR,QAAQ,CAAC,+BAA+B,CAAC;qBACzC,QAAQ,EAAE;aACd,CAAC;YACF,OAAO,EAAE,KAA
K,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE,EAAE;gBACvE,MAAM,MAAM,GAAG,SAAS,IAAI,CAAC,CAAC;gBAC9B,MAAM,KAAK,GAAG,QAAQ,IAAI,GAAG,CAAC;gBAC9B,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;oBAC9C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;oBAClC,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,GAAG,KAAK,CAAC,CAAC;oBACnD,OAAO;wBACL,OAAO,EAAE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC;wBAC1B,UAAU,EAAE,KAAK,CAAC,MAAM;wBACxB,OAAO,EAAE,SAAS,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,OAAO,KAAK,CAAC,MAAM,EAAE;qBACxF,CAAC;gBACJ,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,gBAAgB,OAAO,EAAE,EAAE,CAAC;gBAC9C,CAAC;YACH,CAAC;SACF,CAAC;QAEF,IAAI,EAAE,IAAI,CAAC;YACT,WAAW,EACT,uFAAuF;YACzF,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,qCAAqC,CAAC;gBACnE,IAAI,EAAE,CAAC;qBACJ,MAAM,EAAE;qBACR,QAAQ,CAAC,8DAA8D,CAAC;gBAC3E,UAAU,EAAE,CAAC;qBACV,MAAM,EAAE;qBACR,QAAQ,CAAC,iCAAiC,CAAC;qBAC3C,QAAQ,EAAE;aACd,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,EAAE,EAAE;gBAChE,MAAM,UAAU,GAAG,MAAM,IAAI,EAAE,CAAC;gBAChC,MAAM,MAAM,GAAG,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;gBAChD,IAAI,CAAC,MAAM;oBAAE,OAAO,EAAE,KAAK,EAAE,gCAAgC,EAAE,CAAC;gBAChE,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;gBACvC,MAAM,OAAO,GAAwD,EAAE,CAAC;gBAExE,KAAK,UAAU,UAAU,CAAC,QAAgB,EAAE,OAAe;oBACzD,IAAI,CAAC;wBACH,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;wBAChD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;wBAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;4BACrE,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gCACzB,OAAO,CAAC,IAAI,CAAC;oCACX,IAAI,EAAE,OAAO;oCACb,IAAI,EAAE,CAAC,GAAG,CAAC;oCACX,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC;iCAC7B,CAAC,
CAAC;4BACL,CAAC;wBACH,CAAC;oBACH,CAAC;oBAAC,MAAM,CAAC;wBACP,wBAAwB;oBAC1B,CAAC;gBACH,CAAC;gBAED,KAAK,UAAU,SAAS,CAAC,OAAe,EAAE,MAAc;oBACtD,IAAI,CAAC;wBACH,MAAM,OAAO,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC;wBACrC,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;4BAC5B,IAAI,OAAO,CAAC,MAAM,IAAI,UAAU;gCAAE,MAAM;4BACxC,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;4BAClC,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;4BAClD,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;4BAC5B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gCACvB,MAAM,SAAS,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;4BAC7B,CAAC;iCAAM,CAAC;gCACN,MAAM,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;4BAC9B,CAAC;wBACH,CAAC;oBACH,CAAC;oBAAC,MAAM,CAAC;wBACP,uBAAuB;oBACzB,CAAC;gBACH,CAAC;gBAED,IAAI,CAAC;oBACH,MAAM,IAAI,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC;oBAC9B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;wBACvB,MAAM,SAAS,CAAC,MAAM,EAAE,OAAO,KAAK,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;oBAC1D,CAAC;yBAAM,CAAC;wBACN,MAAM,UAAU,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;oBACpC,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,EAAE,KAAK,EAAE,mBAAmB,OAAO,EAAE,EAAE,CAAC;gBACjD,CAAC;gBAED,OAAO;oBACL,OAAO;oBACP,UAAU,EAAE,OAAO,CAAC,MAAM;oBAC1B,SAAS,EAAE,OAAO,CAAC,MAAM,IAAI,UAAU;iBACxC,CAAC;YACJ,CAAC;SACF,CAAC;KACH,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,aAAqB,EACrB,QAAgB,EAChB,cAAsB;IAEtB,MAAM,EAAE,YAAY,EAAE,WAAW,EAAE,aAAa,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;IAExE,MAAM,OAAO,GAAG,aAAa,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,kBAAkB,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,EAAE,EAAE,CAAC,CAAC;IAEjH,IAAI,cAAc,GAA0B,IAAI,CAAC;IAEjD,MAAM,gBAAgB,GAAG,qBAAqB,CAAC,aAAa,CAAC,CAAC;IAC9D,MAAM,QAAQ,GAAG;QACf,GAAG,gBAAgB;QACnB,QAAQ,EAAE,IAAI,CAAC;YACb,WAAW,EAAE,4EAA4E;YACzF,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;gBACpB,WAAW,EAAE,CAAC;qBACX,IAAI,CAAC,CAAC,OAAO,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;qBACnC,QAAQ,CAAC,sBAAsB,CAAC;gBACnC,aAAa,EAAE,CAAC;qBACb,MAAM,EAAE;qBACR,QAAQ,CAAC,uCAAuC,CAAC;aACrD,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,EAAE,WAAW,EAAE,aAAa,EAAE,EAAE,EAAE;gBAChD,cAAc,GAAG,E
AAE,WAAW,EAAE,WAA0B,EAAE,aAAa,EAAE,CAAC;gBAC5E,OAAO,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC;YACtB,CAAC;SACF,CAAC;KACH,CAAC;IAEF,IAAI,CAAC;QACH,MAAM,YAAY,CAAC;YACjB,KAAK,EAAE,OAAO,CAAC,qCAAqC,CAAC;YACrD,MAAM,EAAE,wBAAwB;YAChC,MAAM,EAAE,kCAAkC,QAAQ,kBAAkB,cAAc,sFAAsF;YACxK,KAAK,EAAE,QAAQ;YACf,QAAQ,EAAE,WAAW,CAAC,UAAU,CAAC;SAClC,CAAC,CAAC;QAEH,OAAO,cAAc,CAAC;IACxB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,aAAqB,EACrB,QAAgB,EAChB,cAAsB;IAEtB,kCAAkC;IAClC,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,EAAE,qBAAqB,CAAC,CAAC;IAC9D,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC;QAC7D,IAAI,MAAM,CAAC,WAAW,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YAC/C,OAAO,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,aAAa,EAAE,MAAM,CAAC,aAAa,EAAE,CAAC;QAClF,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,WAAW;IACb,CAAC;IAED,mBAAmB;IACnB,MAAM,cAAc,GAAG,MAAM,cAAc,CAAC,aAAa,EAAE,QAAQ,EAAE,cAAc,CAAC,CAAC;IAErF,mBAAmB;IACnB,IAAI,cAAc,EAAE,CAAC;QACnB,IAAI,CAAC;YACH,aAAa,CAAC,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,cAAc,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QACrE,CAAC;QAAC,MAAM,CAAC;YACP,4BAA4B;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,cAAc,CAAC;AACxB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,aAAqB;IACrD,IAAI,CAAC;QACH,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,IAAI,CAAC,aAAa,EAAE,qBAAqB,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;QACrG,IAAI,cAAc,CAAC,YAAY;YAAE,OAAO,KAAK,CAAC;QAC9C,OAAO,cAAc,CAAC,WAAW,IAAI,IAAI,IAAI,cAAc,CAAC,WAAW,KAAK,OAAO,CAAC;IACtF,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Experiment configuration validation and defaults.
|
|
3
|
+
*/
|
|
4
|
+
import type { ExperimentConfig, ResolvedExperimentConfig, EvalFilter } from './types.js';
|
|
5
|
+
/**
|
|
6
|
+
* Default configuration values.
|
|
7
|
+
*/
|
|
8
|
+
export declare const CONFIG_DEFAULTS: {
|
|
9
|
+
model: "opus";
|
|
10
|
+
evals: "*";
|
|
11
|
+
runs: number;
|
|
12
|
+
earlyExit: boolean;
|
|
13
|
+
scripts: string[];
|
|
14
|
+
timeout: number;
|
|
15
|
+
sandbox: "auto";
|
|
16
|
+
copyFiles: "none";
|
|
17
|
+
};
|
|
18
|
+
/**
|
|
19
|
+
* Validates an experiment configuration object.
|
|
20
|
+
* Throws a descriptive error if validation fails.
|
|
21
|
+
*/
|
|
22
|
+
export declare function validateConfig(config: unknown): ExperimentConfig;
|
|
23
|
+
/**
|
|
24
|
+
* Resolves an experiment configuration by applying defaults.
|
|
25
|
+
*/
|
|
26
|
+
export declare function resolveConfig(config: ExperimentConfig): ResolvedExperimentConfig;
|
|
27
|
+
/**
|
|
28
|
+
* Loads an experiment configuration from a file path.
|
|
29
|
+
* Supports TypeScript and JavaScript files with default exports.
|
|
30
|
+
*/
|
|
31
|
+
export declare function loadConfig(configPath: string): Promise<ResolvedExperimentConfig>;
|
|
32
|
+
/**
|
|
33
|
+
* Resolves the evals filter to a list of eval names.
|
|
34
|
+
* Supports glob patterns like "vercel-cli/*" for nested directories.
|
|
35
|
+
*/
|
|
36
|
+
export declare function resolveEvalNames(filter: string | string[] | EvalFilter, availableEvals: string[]): string[];
|
|
37
|
+
//# sourceMappingURL=config.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../../src/lib/config.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,KAAK,EACV,gBAAgB,EAChB,wBAAwB,EACxB,UAAU,EACX,MAAM,YAAY,CAAC;AAGpB;;GAEG;AACH,eAAO,MAAM,eAAe;;;;;aAKX,MAAM,EAAE;;;;CAIxB,CAAC;AA6BF;;;GAGG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,OAAO,GAAG,gBAAgB,CAWhE;AAED;;GAEG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,gBAAgB,GAAG,wBAAwB,CAoBhF;AAED;;;GAGG;AACH,wBAAsB,UAAU,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,wBAAwB,CAAC,CA8BtF;AAgBD;;;GAGG;AACH,wBAAgB,gBAAgB,CAC9B,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,UAAU,EACtC,cAAc,EAAE,MAAM,EAAE,GACvB,MAAM,EAAE,CA0DV"}
|