@fastino-ai/pioneer-cli 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +161 -22
- package/bun.lock +82 -0
- package/cache/cache.db +0 -0
- package/cache/cache.db-shm +0 -0
- package/cache/cache.db-wal +0 -0
- package/fastino-ai-pioneer-cli-0.2.0.tgz +0 -0
- package/package.json +6 -3
- package/src/agent/Agent.ts +342 -0
- package/src/agent/BudgetManager.ts +167 -0
- package/src/agent/FileResolver.ts +321 -0
- package/src/agent/LLMClient.ts +435 -0
- package/src/agent/ToolRegistry.ts +97 -0
- package/src/agent/index.ts +15 -0
- package/src/agent/types.ts +84 -0
- package/src/chat/ChatApp.tsx +701 -0
- package/src/chat/index.ts +7 -0
- package/src/config.ts +185 -3
- package/src/evolution/EvalRunner.ts +301 -0
- package/src/evolution/EvolutionEngine.ts +319 -0
- package/src/evolution/FeedbackCollector.ts +197 -0
- package/src/evolution/ModelTrainer.ts +371 -0
- package/src/evolution/index.ts +18 -0
- package/src/evolution/types.ts +110 -0
- package/src/index.tsx +101 -2
- package/src/tools/bash.ts +184 -0
- package/src/tools/filesystem.ts +444 -0
- package/src/tools/index.ts +29 -0
- package/src/tools/modal.ts +269 -0
- package/src/tools/sandbox.ts +310 -0
- package/src/tools/training.ts +443 -0
- package/src/tools/wandb.ts +348 -0
package/src/config.ts
CHANGED
|
@@ -1,15 +1,69 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Configuration management for Pioneer CLI.
|
|
3
|
-
* Stores API key
|
|
3
|
+
* Stores API key, base URL, and agent settings in ~/.pioneer/config.json
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
6
|
import fs from "fs";
|
|
7
7
|
import os from "os";
|
|
8
8
|
import path from "path";
|
|
9
9
|
|
|
10
|
+
/** Selects the LLM backend the agent uses and how it authenticates. */
export interface AgentProviderConfig {
  /** Which backend to use. */
  provider: "anthropic" | "openai" | "local";
  /** Model identifier passed to the provider. */
  model: string;
  /** API key; when absent, getAgentConfig() falls back to environment variables. */
  apiKey?: string;
  /** Optional endpoint override — presumably for proxies or local servers; confirm against LLMClient. */
  baseUrl?: string;
}
|
|
16
|
+
|
|
17
|
+
/** Optional limits on agent resource usage; every field may be omitted. */
export interface BudgetConfig {
  maxTokens?: number;
  maxCost?: number; // in USD
  maxTime?: number; // in seconds
  maxIterations?: number;
}
|
|
23
|
+
|
|
24
|
+
/** Settings for the code-execution sandbox; defaults are applied in getSandboxConfig(). */
export interface SandboxConfig {
  /** Whether to run code inside Docker (default false). */
  useDocker?: boolean;
  /** Docker image name (default "python:3.11-slim"). */
  dockerImage?: string;
  /** Execution timeout (default 30000) — presumably milliseconds; confirm against the sandbox tool. */
  timeout?: number;
  /** Memory limit string (default "512m"). */
  memoryLimit?: string;
  /** CPU limit (default 1). */
  cpuLimit?: number;
}
|
|
31
|
+
|
|
32
|
+
/** Credentials and settings for the ML integrations (Modal and Weights & Biases). */
export interface MLConfig {
  /** Modal credentials; getMLConfig() seeds these from MODAL_TOKEN_ID / MODAL_TOKEN_SECRET. */
  modal?: {
    tokenId?: string;
    tokenSecret?: string;
  };
  /** Weights & Biases settings; getMLConfig() seeds apiKey from WANDB_API_KEY. */
  wandb?: {
    apiKey?: string;
    entity?: string;
    project?: string;
  };
}
|
|
43
|
+
|
|
44
|
+
/** Options for the evolution feature; defaults are applied in getEvolutionConfig(). */
export interface EvolutionConfigOptions {
  /** Default false. */
  enabled?: boolean;
  /** Default 0.9. */
  targetScore?: number;
  /** Default 10. */
  maxIterations?: number;
  /** Budget applied to each iteration; no default is set in this file. */
  budgetPerIteration?: BudgetConfig;
  /** Default "local". */
  trainingProvider?: "openai" | "modal" | "local";
  /** No default is set in this file. */
  trainingBaseModel?: string;
}
|
|
52
|
+
|
|
10
53
|
/**
 * Shape of the persisted configuration file (config.json in the Pioneer
 * config directory). Every field is optional; the get*Config() helpers
 * fill in defaults when reading.
 */
export interface Config {
  // Existing Pioneer config
  apiKey?: string;
  baseUrl?: string;

  // Agent config
  agent?: AgentProviderConfig;
  budget?: BudgetConfig;
  sandbox?: SandboxConfig;
  ml?: MLConfig;
  evolution?: EvolutionConfigOptions;

  // System prompt customization
  systemPrompt?: string;
}
|
|
14
68
|
|
|
15
69
|
const CONFIG_DIR = path.join(os.homedir(), ".pioneer");
|
|
@@ -18,6 +72,18 @@ const CONFIG_FILE = path.join(CONFIG_DIR, "config.json");
|
|
|
18
72
|
/**
 * Base URL used when the config file does not set one. Read once at module
 * load from PIONEER_API_URL, falling back to a localhost address.
 */
export const DEFAULT_BASE_URL =
  process.env.PIONEER_API_URL ?? "http://localhost:5001";
|
|
20
74
|
|
|
75
|
+
/** Provider and model used when the config file has no `agent` section. */
export const DEFAULT_AGENT_CONFIG: AgentProviderConfig = {
  provider: "anthropic",
  model: "claude-sonnet-4-5-20250929",
};
|
|
79
|
+
|
|
80
|
+
/** Budget limits used for any field the config file's `budget` section leaves unset. */
export const DEFAULT_BUDGET: BudgetConfig = {
  maxTokens: 500000, // 500k tokens
  maxCost: 5.0, // $5 USD
  maxTime: 7200, // 2 hours
  maxIterations: 100,
};
|
|
86
|
+
|
|
21
87
|
function ensureConfigDir(): void {
|
|
22
88
|
if (!fs.existsSync(CONFIG_DIR)) {
|
|
23
89
|
fs.mkdirSync(CONFIG_DIR, { recursive: true });
|
|
@@ -36,13 +102,37 @@ export function loadConfig(): Config {
|
|
|
36
102
|
return {};
|
|
37
103
|
}
|
|
38
104
|
|
|
39
|
-
export function saveConfig(config: Config): void {
|
|
105
|
+
/**
 * Persist configuration changes to disk.
 *
 * Loads the current config, merges `config` on top of it with deepMerge(),
 * and rewrites the config file as pretty-printed JSON.
 *
 * @param config Fields to add or overwrite; fields not mentioned keep their
 *   stored values.
 */
export function saveConfig(config: Partial<Config>): void {
  ensureConfigDir();
  const existing = loadConfig();
  const merged = deepMerge(existing, config);
  // The file holds API keys and tokens, so create it readable by the owner
  // only. `mode` applies when the file is created; an existing file keeps
  // its current permissions.
  fs.writeFileSync(CONFIG_FILE, JSON.stringify(merged, null, 2), {
    mode: 0o600,
  });
}
|
|
45
111
|
|
|
112
|
+
/** True for non-null, non-array objects, i.e. values that are merged key by key. */
function isMergeableObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Recursively merge `source` into a shallow copy of `target`; inputs are not mutated. */
function mergeRecords(
  target: Record<string, unknown>,
  source: Record<string, unknown>
): Record<string, unknown> {
  const result: Record<string, unknown> = { ...target };
  for (const [key, sourceValue] of Object.entries(source)) {
    // An explicit `undefined` means "leave the stored value alone".
    if (sourceValue === undefined) {
      continue;
    }
    const targetValue = result[key];
    result[key] =
      isMergeableObject(sourceValue) && isMergeableObject(targetValue)
        ? mergeRecords(targetValue, sourceValue)
        : sourceValue;
  }
  return result;
}

/**
 * Merge `source` into `target` and return the result as a new object.
 *
 * Plain objects are merged recursively at every depth, so updating one nested
 * field (e.g. `ml.modal.tokenId`) keeps its stored siblings (e.g.
 * `ml.modal.tokenSecret`). Arrays, primitives and `null` in `source` replace
 * the target value wholesale; `undefined` values in `source` are ignored.
 *
 * @param target Existing configuration (not mutated).
 * @param source Partial configuration whose values take precedence.
 * @returns The merged configuration.
 */
function deepMerge(target: Config, source: Partial<Config>): Config {
  return mergeRecords(
    target as Record<string, unknown>,
    source as Record<string, unknown>
  ) as Config;
}
|
|
135
|
+
|
|
46
136
|
export function clearApiKey(): void {
|
|
47
137
|
const config = loadConfig();
|
|
48
138
|
delete config.apiKey;
|
|
@@ -62,3 +152,95 @@ export function getBaseUrl(): string {
|
|
|
62
152
|
const config = loadConfig();
|
|
63
153
|
return config.baseUrl ?? DEFAULT_BASE_URL;
|
|
64
154
|
}
|
|
155
|
+
|
|
156
|
+
// Agent configuration helpers
|
|
157
|
+
/**
 * Resolve the agent's provider configuration.
 *
 * Resolution order:
 *  1. The `agent` section of the config file. If it has no (or an empty)
 *     `apiKey`, the key comes from the environment: ANTHROPIC_API_KEY for
 *     the "anthropic" provider, OPENAI_API_KEY for every other provider
 *     (including "local").
 *  2. Anthropic defaults, when ANTHROPIC_API_KEY is set.
 *  3. OpenAI ("gpt-4o"), when OPENAI_API_KEY is set.
 *  4. Anthropic defaults with no API key.
 *
 * @returns A fresh object on every call; callers may mutate it without
 *   affecting DEFAULT_AGENT_CONFIG.
 */
export function getAgentConfig(): AgentProviderConfig {
  const { agent } = loadConfig();

  // Check environment variables for API keys
  const anthropicKey = process.env.ANTHROPIC_API_KEY;
  const openaiKey = process.env.OPENAI_API_KEY;

  if (agent) {
    const envKey = agent.provider === "anthropic" ? anthropicKey : openaiKey;
    // `||` rather than `??` so an empty-string key in the file also falls
    // back to the environment.
    return { ...agent, apiKey: agent.apiKey || envKey };
  }

  // Default to Anthropic if key is available
  if (anthropicKey) {
    return { ...DEFAULT_AGENT_CONFIG, apiKey: anthropicKey };
  }

  // Fall back to OpenAI
  if (openaiKey) {
    return { provider: "openai", model: "gpt-4o", apiKey: openaiKey };
  }

  // Return a copy so callers cannot mutate the shared default object.
  return { ...DEFAULT_AGENT_CONFIG };
}
|
|
191
|
+
|
|
192
|
+
/**
 * Effective budget limits: DEFAULT_BUDGET with any fields from the config
 * file's `budget` section layered on top.
 */
export function getBudgetConfig(): BudgetConfig {
  const { budget: overrides } = loadConfig();
  const effective: BudgetConfig = { ...DEFAULT_BUDGET, ...overrides };
  return effective;
}
|
|
199
|
+
|
|
200
|
+
/**
 * Effective sandbox settings: built-in defaults with any fields from the
 * config file's `sandbox` section layered on top.
 */
export function getSandboxConfig(): SandboxConfig {
  const builtIn: SandboxConfig = {
    useDocker: false,
    dockerImage: "python:3.11-slim",
    timeout: 30000,
    memoryLimit: "512m",
    cpuLimit: 1,
  };
  const { sandbox: overrides } = loadConfig();
  return { ...builtIn, ...overrides };
}
|
|
211
|
+
|
|
212
|
+
/**
 * Effective ML integration settings.
 *
 * Credentials are seeded from environment variables (MODAL_TOKEN_ID,
 * MODAL_TOKEN_SECRET, WANDB_API_KEY); values present in the config file's
 * `ml` section take precedence over them.
 */
export function getMLConfig(): MLConfig {
  const { ml } = loadConfig();

  const modal = {
    tokenId: process.env.MODAL_TOKEN_ID,
    tokenSecret: process.env.MODAL_TOKEN_SECRET,
    ...ml?.modal,
  };

  const wandb = {
    apiKey: process.env.WANDB_API_KEY,
    ...ml?.wandb,
  };

  return { modal, wandb };
}
|
|
226
|
+
|
|
227
|
+
/**
 * Effective evolution settings: built-in defaults with any fields from the
 * config file's `evolution` section layered on top.
 */
export function getEvolutionConfig(): EvolutionConfigOptions {
  const builtIn: EvolutionConfigOptions = {
    enabled: false,
    targetScore: 0.9,
    maxIterations: 10,
    trainingProvider: "local",
  };
  const { evolution: overrides } = loadConfig();
  return { ...builtIn, ...overrides };
}
|
|
237
|
+
|
|
238
|
+
/**
 * Custom system prompt from the config file.
 *
 * @returns The stored `systemPrompt`, or undefined when none is configured.
 */
export function getSystemPrompt(): string | undefined {
  return loadConfig().systemPrompt;
}
|
|
241
|
+
|
|
242
|
+
// Get config directory path
|
|
243
|
+
/**
 * Path of the Pioneer config directory.
 *
 * Calls ensureConfigDir() first, so the directory exists by the time the
 * path is returned.
 */
export function getConfigDir(): string {
  ensureConfigDir();
  return CONFIG_DIR;
}
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvalRunner - Run evaluations to measure agent improvement
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { Agent } from "../agent/Agent.js";
|
|
6
|
+
import type {
|
|
7
|
+
EvalCase,
|
|
8
|
+
EvalResult,
|
|
9
|
+
EvalRunSummary,
|
|
10
|
+
EvalCriteria,
|
|
11
|
+
} from "./types.js";
|
|
12
|
+
|
|
13
|
+
/** Options accepted by the EvalRunner constructor. */
export interface EvalRunnerConfig {
  /** Per-eval timeout; the constructor defaults it to 60000 — presumably milliseconds, confirm. */
  timeout?: number;
  /** Number of evals to run concurrently; the constructor defaults it to 1. */
  parallelism?: number;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Runs evaluation cases against an agent and scores the responses.
 *
 * NOTE(review): `timeout` and `parallelism` are stored by the constructor but
 * no method in this class reads them yet — evals run sequentially and
 * without a time limit.
 */
export class EvalRunner {
  private config: EvalRunnerConfig;

  /**
   * @param config Optional settings; missing (or zero) values fall back to a
   *   60000 timeout and a parallelism of 1.
   */
  constructor(config: EvalRunnerConfig = {}) {
    this.config = {
      timeout: config.timeout || 60000,
      parallelism: config.parallelism || 1,
    };
  }

  /**
   * Run a single eval case and score the agent's response.
   *
   * Tool usage is collected from the agent's entire message history, so
   * callers that want per-case tool tracking should clear the history first
   * (runEvalSuite does). Never rejects: any error thrown while running is
   * converted into a failed result with score 0.
   *
   * @param agent Agent under test.
   * @param evalCase Case to run; its `input` is sent to `agent.chat`.
   * @returns Outcome including score, errors, wall-clock duration in
   *   milliseconds, and the tokens consumed during this call.
   */
  async runEval(agent: Agent, evalCase: EvalCase): Promise<EvalResult> {
    const startTime = Date.now();
    const initialUsage = agent.getBudgetStatus().usage.tokensUsed;

    try {
      // Run the agent with the eval input
      const response = await agent.chat(evalCase.input, false);
      const duration = Date.now() - startTime;
      const tokenUsage = agent.getBudgetStatus().usage.tokensUsed - initialUsage;

      // Get tools that were called (from message history)
      const toolsCalled: string[] = [];
      for (const msg of agent.getMessages()) {
        if (msg.toolCalls) {
          for (const tc of msg.toolCalls) {
            toolsCalled.push(tc.name);
          }
        }
      }

      // Evaluate criteria
      const { passed, score, errors } = this.evaluateCriteria(
        response,
        toolsCalled,
        evalCase
      );

      return {
        caseId: evalCase.id,
        passed,
        score,
        actualOutput: response,
        toolsCalled,
        errors,
        duration,
        tokenUsage,
      };
    } catch (error) {
      return {
        caseId: evalCase.id,
        passed: false,
        score: 0,
        actualOutput: "",
        toolsCalled: [],
        errors: [error instanceof Error ? error.message : String(error)],
        duration: Date.now() - startTime,
        tokenUsage: agent.getBudgetStatus().usage.tokensUsed - initialUsage,
      };
    }
  }

  /**
   * Score an output against every check an eval case defines.
   *
   * Checks are: each entry of `successCriteria`, plus `expectedOutput`
   * (case-insensitive substring) and `expectedToolCalls` (all listed tools
   * were called) when present. The score is passed checks / total checks, so
   * it always lies in [0, 1]; a case with no checks scores 1.
   *
   * @returns `passed` is true only when there are no errors and the score is
   *   at least 0.8.
   */
  private evaluateCriteria(
    output: string,
    toolsCalled: string[],
    evalCase: EvalCase
  ): { passed: boolean; score: number; errors: string[] } {
    const errors: string[] = [];
    let passedChecks = 0;
    // Count every check that is performed, not just successCriteria;
    // otherwise the optional checks below could push the score above 1.
    let totalChecks = evalCase.successCriteria.length;

    for (const criteria of evalCase.successCriteria) {
      const { passed, error } = this.checkCriterion(
        output,
        toolsCalled,
        criteria
      );
      if (passed) {
        passedChecks++;
      } else if (error) {
        errors.push(error);
      }
    }

    // Check expected output if provided
    if (evalCase.expectedOutput) {
      totalChecks++;
      if (output.toLowerCase().includes(evalCase.expectedOutput.toLowerCase())) {
        passedChecks++;
      } else {
        errors.push(`Expected output not found: "${evalCase.expectedOutput}"`);
      }
    }

    // Check expected tool calls if provided
    if (evalCase.expectedToolCalls) {
      totalChecks++;
      const missingTools = evalCase.expectedToolCalls.filter(
        (t) => !toolsCalled.includes(t)
      );
      if (missingTools.length === 0) {
        passedChecks++;
      } else {
        errors.push(`Expected tools not called: ${missingTools.join(", ")}`);
      }
    }

    const score = totalChecks > 0 ? passedChecks / totalChecks : 1;
    const passed = errors.length === 0 && score >= 0.8;

    return { passed, score, errors };
  }

  /**
   * Evaluate one success criterion.
   *
   * "contains" / "not_contains" are case-insensitive substring checks,
   * "regex" is a case-insensitive pattern match, and "tool_called" /
   * "tool_not_called" test membership in `toolsCalled`.
   *
   * @returns `passed`, plus an `error` message describing the failure.
   */
  private checkCriterion(
    output: string,
    toolsCalled: string[],
    criteria: EvalCriteria
  ): { passed: boolean; error?: string } {
    switch (criteria.type) {
      case "contains":
        if (output.toLowerCase().includes(criteria.value.toLowerCase())) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Output should contain: "${criteria.value}"`,
        };

      case "not_contains":
        if (!output.toLowerCase().includes(criteria.value.toLowerCase())) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Output should not contain: "${criteria.value}"`,
        };

      case "tool_called":
        if (toolsCalled.includes(criteria.value)) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Tool should be called: "${criteria.value}"`,
        };

      case "tool_not_called":
        if (!toolsCalled.includes(criteria.value)) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Tool should not be called: "${criteria.value}"`,
        };

      case "regex":
        try {
          const regex = new RegExp(criteria.value, "i");
          if (regex.test(output)) {
            return { passed: true };
          }
          return {
            passed: false,
            error: `Output should match pattern: "${criteria.value}"`,
          };
        } catch {
          // RegExp construction throws on a malformed pattern.
          return { passed: false, error: `Invalid regex: "${criteria.value}"` };
        }

      case "custom":
        // Custom criteria would need to be implemented separately;
        // until then they always count as passed.
        return { passed: true };

      default:
        return { passed: false, error: `Unknown criteria type` };
    }
  }

  /**
   * Run every eval case in order, clearing the agent's history before each,
   * and aggregate the results.
   *
   * `averageScore` is weighted by each case's `weight` (a missing or zero
   * weight counts as 1) and is 0 when there are no cases.
   */
  async runEvalSuite(agent: Agent, evalCases: EvalCase[]): Promise<EvalRunSummary> {
    const runId = `eval_${Date.now()}`;
    const startTime = Date.now();
    const results: EvalResult[] = [];

    // Run evals sequentially for now (could parallelize)
    for (const evalCase of evalCases) {
      // Clear agent history between evals
      agent.clearHistory();

      const result = await this.runEval(agent, evalCase);
      results.push(result);
    }

    const passedCases = results.filter((r) => r.passed).length;
    const totalTokens = results.reduce((sum, r) => sum + r.tokenUsage, 0);
    const totalDuration = Date.now() - startTime;

    // Calculate weighted average score; results[i] corresponds to evalCases[i].
    let weightedScore = 0;
    let totalWeight = 0;
    results.forEach((result, i) => {
      const weight = evalCases[i]?.weight || 1;
      weightedScore += result.score * weight;
      totalWeight += weight;
    });
    const averageScore = totalWeight > 0 ? weightedScore / totalWeight : 0;

    return {
      runId,
      timestamp: new Date(),
      totalCases: evalCases.length,
      passedCases,
      failedCases: evalCases.length - passedCases,
      averageScore,
      totalTokens,
      totalDuration,
      results,
    };
  }

  /**
   * Render a summary as plain text: a header with totals, then one line per
   * case with its pass mark, score, and any errors.
   */
  formatResults(summary: EvalRunSummary): string {
    let output = `\nEvaluation Results (${summary.runId})\n`;
    output += "=".repeat(50) + "\n\n";
    output += `Total: ${summary.totalCases} | Passed: ${summary.passedCases} | Failed: ${summary.failedCases}\n`;
    output += `Average Score: ${(summary.averageScore * 100).toFixed(1)}%\n`;
    output += `Tokens Used: ${summary.totalTokens.toLocaleString()}\n`;
    output += `Duration: ${(summary.totalDuration / 1000).toFixed(1)}s\n\n`;

    for (const result of summary.results) {
      const status = result.passed ? "✓" : "✗";
      output += `${status} ${result.caseId}: ${(result.score * 100).toFixed(0)}%`;
      if (result.errors && result.errors.length > 0) {
        output += `\n Errors: ${result.errors.join(", ")}`;
      }
      output += "\n";
    }

    return output;
  }
}
|
|
257
|
+
|
|
258
|
+
// Predefined eval cases for common agent capabilities
|
|
259
|
+
/**
 * Built-in eval cases covering basic agent capabilities: bash execution,
 * file reading, sandboxed code execution, and a multi-step task. Each case
 * is scored by its `successCriteria` (tool usage and/or output content).
 */
export const DEFAULT_EVAL_CASES: EvalCase[] = [
  {
    id: "bash_basic",
    name: "Basic Bash Execution",
    description: "Test that the agent can execute a simple bash command",
    input: "What is the current date? Use bash to find out.",
    successCriteria: [
      { type: "tool_called", value: "bash" },
      { type: "regex", value: "\\d{4}" }, // Year should appear
    ],
  },
  {
    id: "file_read",
    name: "File Reading",
    description: "Test that the agent can read a file",
    input: "Read the contents of package.json and tell me the project name.",
    successCriteria: [
      { type: "tool_called", value: "read_file" },
      // NOTE(review): assumes the package.json being read has "pioneer" in its name.
      { type: "contains", value: "pioneer" },
    ],
  },
  {
    id: "code_execution",
    name: "Code Sandbox",
    description: "Test that the agent can execute code in a sandbox",
    input: "Calculate the factorial of 5 using Python.",
    successCriteria: [
      { type: "tool_called", value: "execute_code" },
      { type: "contains", value: "120" }, // 5! = 120
    ],
  },
  {
    id: "multi_step",
    name: "Multi-step Task",
    description: "Test that the agent can complete a multi-step task",
    input: "List the files in the current directory, then read the README.md file and summarize it.",
    successCriteria: [
      { type: "tool_called", value: "list_directory" },
      { type: "tool_called", value: "read_file" },
    ],
  },
];
|
|
301
|
+
|