@fastino-ai/pioneer-cli 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/config.ts CHANGED
@@ -1,15 +1,69 @@
1
1
  /**
2
2
  * Configuration management for Pioneer CLI.
3
- * Stores API key and base URL in ~/.pioneer/config.json
3
+ * Stores API key, base URL, and agent settings in ~/.pioneer/config.json
4
4
  */
5
5
 
6
6
  import fs from "fs";
7
7
  import os from "os";
8
8
  import path from "path";
9
9
 
10
/**
 * Selects which LLM backend the agent talks to and how to reach it.
 */
export interface AgentProviderConfig {
  /** Backend family to use. */
  provider: "anthropic" | "openai" | "local";
  /** Model identifier passed to the provider. */
  model: string;
  /** Optional API key for the provider. */
  apiKey?: string;
  /** Optional override for the provider endpoint. */
  baseUrl?: string;
}
16
+
17
/**
 * Upper bounds on what an agent run may consume. Every limit is optional.
 */
export interface BudgetConfig {
  /** Maximum number of tokens. */
  maxTokens?: number;
  /** Maximum spend, in USD. */
  maxCost?: number;
  /** Maximum wall-clock time, in seconds. */
  maxTime?: number;
  /** Maximum number of iterations. */
  maxIterations?: number;
}
23
+
24
/**
 * Settings for the code-execution sandbox.
 */
export interface SandboxConfig {
  /** Whether to run code inside Docker. */
  useDocker?: boolean;
  /** Docker image name to run code in. */
  dockerImage?: string;
  /** Execution timeout. NOTE(review): unit is not stated here (presumably milliseconds) — confirm against the sandbox implementation. */
  timeout?: number;
  /** Memory limit as a string (e.g. "512m"). */
  memoryLimit?: string;
  /** CPU limit. */
  cpuLimit?: number;
}
31
+
32
/**
 * Credentials and settings for external ML services.
 */
export interface MLConfig {
  /** Modal token pair. */
  modal?: {
    tokenId?: string;
    tokenSecret?: string;
  };
  /** Weights & Biases API key plus entity/project names. */
  wandb?: {
    apiKey?: string;
    entity?: string;
    project?: string;
  };
}
43
+
44
/**
 * Options controlling the agent "evolution" (self-improvement) loop.
 */
export interface EvolutionConfigOptions {
  /** Whether evolution is turned on. */
  enabled?: boolean;
  /** Score to aim for. */
  targetScore?: number;
  /** Maximum number of evolution iterations. */
  maxIterations?: number;
  /** Budget limits applied to each iteration. */
  budgetPerIteration?: BudgetConfig;
  /** Where training runs. */
  trainingProvider?: "openai" | "modal" | "local";
  /** Base model used for training. */
  trainingBaseModel?: string;
}
52
+
10
53
  export interface Config {
54
+ // Existing Pioneer config
11
55
  apiKey?: string;
12
56
  baseUrl?: string;
57
+
58
+ // Agent config
59
+ agent?: AgentProviderConfig;
60
+ budget?: BudgetConfig;
61
+ sandbox?: SandboxConfig;
62
+ ml?: MLConfig;
63
+ evolution?: EvolutionConfigOptions;
64
+
65
+ // System prompt customization
66
+ systemPrompt?: string;
13
67
  }
14
68
 
15
69
  const CONFIG_DIR = path.join(os.homedir(), ".pioneer");
@@ -18,6 +72,18 @@ const CONFIG_FILE = path.join(CONFIG_DIR, "config.json");
18
72
  export const DEFAULT_BASE_URL =
19
73
  process.env.PIONEER_API_URL ?? "http://localhost:5001";
20
74
 
75
/**
 * Provider and model used when no agent section is configured.
 * Carries no API key.
 */
export const DEFAULT_AGENT_CONFIG: AgentProviderConfig = {
  model: "claude-sonnet-4-5-20250929",
  provider: "anthropic",
};
79
+
80
/**
 * Budget limits applied when the config file does not override them.
 */
export const DEFAULT_BUDGET: BudgetConfig = {
  // 500k tokens
  maxTokens: 500_000,
  // $5 USD
  maxCost: 5.0,
  // 2 hours, expressed in seconds
  maxTime: 2 * 60 * 60,
  maxIterations: 100,
};
86
+
21
87
  function ensureConfigDir(): void {
22
88
  if (!fs.existsSync(CONFIG_DIR)) {
23
89
  fs.mkdirSync(CONFIG_DIR, { recursive: true });
@@ -36,13 +102,37 @@ export function loadConfig(): Config {
36
102
  return {};
37
103
  }
38
104
 
39
/**
 * Persists configuration changes to the config file.
 *
 * The current on-disk config is loaded, combined with `config` via
 * `deepMerge`, and written back as pretty-printed JSON.
 *
 * @param config - The settings to add or change; anything omitted is kept
 *   from the existing file.
 */
export function saveConfig(config: Partial<Config>): void {
  ensureConfigDir();
  const merged = deepMerge(loadConfig(), config);
  // The file can hold API keys and service tokens, so create it readable and
  // writable by the owner only. `mode` applies only when the file is created;
  // an already existing file keeps its current permissions.
  fs.writeFileSync(CONFIG_FILE, JSON.stringify(merged, null, 2), {
    mode: 0o600,
  });
}
45
111
 
112
/**
 * Returns a new config made of `target` with `source` merged on top.
 * Neither argument is mutated.
 *
 * Plain objects present on both sides are merged recursively at every depth,
 * so updating one nested field (for example `ml.modal.tokenId`) keeps its
 * siblings. Arrays, primitives and `null` from `source` replace the target
 * value wholesale.
 *
 * Top-level keys whose source value is `undefined` are ignored. Inside nested
 * objects an explicit `undefined` still overrides the target value, which
 * removes the key once the result is serialized to JSON.
 *
 * @param target - The existing configuration.
 * @param source - The changes to apply.
 * @returns The merged configuration.
 */
function deepMerge(target: Config, source: Partial<Config>): Config {
  const base: Record<string, unknown> = { ...target };
  const patch: Record<string, unknown> = { ...source };
  return mergeConfigObjects(base, patch, true) as Config;
}

/** True for non-null, non-array objects, i.e. values that can be merged key by key. */
function isMergeableObject(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === "object" && !Array.isArray(value);
}

/** Recursively merges `patch` over `base` into a fresh object. */
function mergeConfigObjects(
  base: Record<string, unknown>,
  patch: Record<string, unknown>,
  skipUndefined: boolean
): Record<string, unknown> {
  // A Map plus Object.fromEntries defines own properties without going
  // through setters, so a "__proto__" key coming from parsed JSON stays data.
  const merged = new Map<string, unknown>(Object.entries(base));
  for (const [key, patchValue] of Object.entries(patch)) {
    if (skipUndefined && patchValue === undefined) {
      continue;
    }
    const baseValue = merged.get(key);
    if (isMergeableObject(patchValue) && isMergeableObject(baseValue)) {
      merged.set(key, mergeConfigObjects(baseValue, patchValue, false));
    } else {
      merged.set(key, patchValue);
    }
  }
  return Object.fromEntries(merged);
}
135
+
46
136
  export function clearApiKey(): void {
47
137
  const config = loadConfig();
48
138
  delete config.apiKey;
@@ -62,3 +152,95 @@ export function getBaseUrl(): string {
62
152
  const config = loadConfig();
63
153
  return config.baseUrl ?? DEFAULT_BASE_URL;
64
154
  }
155
+
156
+ // Agent configuration helpers
157
/**
 * Resolves the agent provider configuration.
 *
 * Resolution order:
 * 1. The `agent` section of the config file. If it has no API key, the key is
 *    taken from ANTHROPIC_API_KEY for the "anthropic" provider and from
 *    OPENAI_API_KEY for every other provider.
 * 2. The default Anthropic configuration, when ANTHROPIC_API_KEY is set.
 * 3. OpenAI with model "gpt-4o", when OPENAI_API_KEY is set.
 * 4. The default configuration without an API key.
 *
 * @returns A fresh object on every call; callers may mutate it freely.
 */
export function getAgentConfig(): AgentProviderConfig {
  const { agent } = loadConfig();

  // Check environment variables for API keys
  const anthropicKey = process.env.ANTHROPIC_API_KEY;
  const openaiKey = process.env.OPENAI_API_KEY;

  if (agent) {
    // NOTE(review): the "local" provider also falls back to OPENAI_API_KEY
    // here — confirm that this is intended.
    const envKey = agent.provider === "anthropic" ? anthropicKey : openaiKey;
    // `||` (not `??`) so that an empty-string key also falls back to the env.
    return { ...agent, apiKey: agent.apiKey || envKey };
  }

  // Default to Anthropic if key is available
  if (anthropicKey) {
    return { ...DEFAULT_AGENT_CONFIG, apiKey: anthropicKey };
  }

  // Fall back to OpenAI
  if (openaiKey) {
    return { provider: "openai", model: "gpt-4o", apiKey: openaiKey };
  }

  // Return a copy so callers cannot mutate the shared exported default.
  return { ...DEFAULT_AGENT_CONFIG };
}
191
+
192
/**
 * Returns the effective budget: `DEFAULT_BUDGET` with any values from the
 * config file's `budget` section layered on top.
 */
export function getBudgetConfig(): BudgetConfig {
  const { budget: overrides } = loadConfig();
  return { ...DEFAULT_BUDGET, ...overrides };
}
199
+
200
/**
 * Returns the effective sandbox settings: built-in defaults with any values
 * from the config file's `sandbox` section layered on top.
 */
export function getSandboxConfig(): SandboxConfig {
  const defaults: SandboxConfig = {
    useDocker: false,
    dockerImage: "python:3.11-slim",
    timeout: 30000,
    memoryLimit: "512m",
    cpuLimit: 1,
  };
  const { sandbox: overrides } = loadConfig();
  return { ...defaults, ...overrides };
}
211
+
212
/**
 * Returns the ML service settings. Credentials default to the
 * MODAL_TOKEN_ID, MODAL_TOKEN_SECRET and WANDB_API_KEY environment
 * variables; values in the config file's `ml` section take precedence.
 */
export function getMLConfig(): MLConfig {
  const { ml: stored } = loadConfig();

  const modal = {
    tokenId: process.env.MODAL_TOKEN_ID,
    tokenSecret: process.env.MODAL_TOKEN_SECRET,
    ...stored?.modal,
  };
  const wandb = {
    apiKey: process.env.WANDB_API_KEY,
    ...stored?.wandb,
  };

  return { modal, wandb };
}
226
+
227
/**
 * Returns the effective evolution options: built-in defaults with any values
 * from the config file's `evolution` section layered on top.
 */
export function getEvolutionConfig(): EvolutionConfigOptions {
  const defaults: EvolutionConfigOptions = {
    enabled: false,
    targetScore: 0.9,
    maxIterations: 10,
    trainingProvider: "local",
  };
  const { evolution: overrides } = loadConfig();
  return { ...defaults, ...overrides };
}
237
+
238
/**
 * Returns the custom system prompt from the config file, or `undefined`
 * when none is set.
 */
export function getSystemPrompt(): string | undefined {
  const { systemPrompt } = loadConfig();
  return systemPrompt;
}
241
+
242
+ // Get config directory path
243
/**
 * Returns the path of the configuration directory, calling
 * `ensureConfigDir()` first.
 */
export function getConfigDir(): string {
  ensureConfigDir();
  const directory = CONFIG_DIR;
  return directory;
}
@@ -0,0 +1,301 @@
1
+ /**
2
+ * EvalRunner - Run evaluations to measure agent improvement
3
+ */
4
+
5
+ import type { Agent } from "../agent/Agent.js";
6
+ import type {
7
+ EvalCase,
8
+ EvalResult,
9
+ EvalRunSummary,
10
+ EvalCriteria,
11
+ } from "./types.js";
12
+
13
/**
 * Options accepted by the EvalRunner constructor.
 */
export interface EvalRunnerConfig {
  /** Timeout setting. NOTE(review): unit is not stated here (presumably milliseconds) — confirm. */
  timeout?: number;
  /** Parallelism setting. */
  parallelism?: number;
}
17
+
18
/**
 * Runs evaluation cases against an agent and scores the responses.
 *
 * Each case is scored as the fraction of its checks that pass. The checks are
 * every entry in `successCriteria`, plus one check for `expectedOutput` and
 * one for `expectedToolCalls` when those are provided. Scores are therefore
 * always within [0, 1].
 */
export class EvalRunner {
  // Stored with defaults applied. NOTE(review): nothing in this class reads
  // `timeout` or `parallelism` yet; evals run sequentially with no time limit.
  private config: EvalRunnerConfig;

  /**
   * @param config - Optional settings; falsy values fall back to a timeout of
   *   60000 and a parallelism of 1.
   */
  constructor(config: EvalRunnerConfig = {}) {
    this.config = {
      timeout: config.timeout || 60000,
      parallelism: config.parallelism || 1,
    };
  }

  /**
   * Runs a single eval case and scores the agent's response.
   *
   * Never throws for agent failures: an error raised by `agent.chat` yields a
   * failed result with score 0 and the error message in `errors`.
   *
   * Tool calls are collected from the agent's whole message history, so any
   * history left over from earlier conversations is included unless the
   * caller clears it first (as `runEvalSuite` does).
   *
   * @param agent - The agent under evaluation.
   * @param evalCase - The case to run.
   * @returns The result, including duration and tokens consumed by this run.
   */
  async runEval(agent: Agent, evalCase: EvalCase): Promise<EvalResult> {
    const startTime = Date.now();
    const initialUsage = agent.getBudgetStatus().usage.tokensUsed;

    try {
      // Run the agent with the eval input
      const response = await agent.chat(evalCase.input, false);
      const duration = Date.now() - startTime;
      const tokenUsage = agent.getBudgetStatus().usage.tokensUsed - initialUsage;

      // Get tools that were called (from message history)
      const toolsCalled: string[] = [];
      for (const msg of agent.getMessages()) {
        if (msg.toolCalls) {
          for (const tc of msg.toolCalls) {
            toolsCalled.push(tc.name);
          }
        }
      }

      // Evaluate criteria
      const { passed, score, errors } = this.evaluateCriteria(
        response,
        toolsCalled,
        evalCase
      );

      return {
        caseId: evalCase.id,
        passed,
        score,
        actualOutput: response,
        toolsCalled,
        errors,
        duration,
        tokenUsage,
      };
    } catch (error) {
      return {
        caseId: evalCase.id,
        passed: false,
        score: 0,
        actualOutput: "",
        toolsCalled: [],
        errors: [error instanceof Error ? error.message : String(error)],
        duration: Date.now() - startTime,
        tokenUsage: agent.getBudgetStatus().usage.tokensUsed - initialUsage,
      };
    }
  }

  /**
   * Applies every check of an eval case to the agent's output.
   *
   * @returns `score` as passed checks divided by total checks (1 when the case
   *   defines no checks), the collected error messages, and `passed`, which
   *   is true only when there are no errors and the score is at least 0.8.
   */
  private evaluateCriteria(
    output: string,
    toolsCalled: string[],
    evalCase: EvalCase
  ): { passed: boolean; score: number; errors: string[] } {
    const errors: string[] = [];
    let passedChecks = 0;
    // The optional checks below are added to this total as well; counting them
    // only in the numerator would let the score exceed 1.
    let totalChecks = evalCase.successCriteria.length;

    for (const criteria of evalCase.successCriteria) {
      const { passed, error } = this.checkCriterion(
        output,
        toolsCalled,
        criteria
      );
      if (passed) {
        passedChecks++;
      } else if (error) {
        errors.push(error);
      }
    }

    // Check expected output if provided (case-insensitive substring match)
    if (evalCase.expectedOutput) {
      totalChecks++;
      if (output.toLowerCase().includes(evalCase.expectedOutput.toLowerCase())) {
        passedChecks++;
      } else {
        errors.push(`Expected output not found: "${evalCase.expectedOutput}"`);
      }
    }

    // Check expected tool calls if provided (all of them must have been called)
    if (evalCase.expectedToolCalls) {
      totalChecks++;
      const missingTools = evalCase.expectedToolCalls.filter(
        (t) => !toolsCalled.includes(t)
      );
      if (missingTools.length === 0) {
        passedChecks++;
      } else {
        errors.push(`Expected tools not called: ${missingTools.join(", ")}`);
      }
    }

    const score = totalChecks > 0 ? passedChecks / totalChecks : 1;
    const passed = errors.length === 0 && score >= 0.8;

    return { passed, score, errors };
  }

  /**
   * Checks one success criterion.
   *
   * "contains" / "not_contains" are case-insensitive substring tests,
   * "tool_called" / "tool_not_called" are exact tool-name tests, and "regex"
   * is a case-insensitive pattern test (an invalid pattern fails the check).
   *
   * @returns Whether the criterion passed, plus an error message when it did not.
   */
  private checkCriterion(
    output: string,
    toolsCalled: string[],
    criteria: EvalCriteria
  ): { passed: boolean; error?: string } {
    switch (criteria.type) {
      case "contains":
        if (output.toLowerCase().includes(criteria.value.toLowerCase())) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Output should contain: "${criteria.value}"`,
        };

      case "not_contains":
        if (!output.toLowerCase().includes(criteria.value.toLowerCase())) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Output should not contain: "${criteria.value}"`,
        };

      case "tool_called":
        if (toolsCalled.includes(criteria.value)) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Tool should be called: "${criteria.value}"`,
        };

      case "tool_not_called":
        if (!toolsCalled.includes(criteria.value)) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Tool should not be called: "${criteria.value}"`,
        };

      case "regex":
        try {
          const regex = new RegExp(criteria.value, "i");
          if (regex.test(output)) {
            return { passed: true };
          }
          return {
            passed: false,
            error: `Output should match pattern: "${criteria.value}"`,
          };
        } catch {
          return { passed: false, error: `Invalid regex: "${criteria.value}"` };
        }

      case "custom":
        // Custom criteria would need to be implemented separately.
        // Until then they always count as passed.
        return { passed: true };

      default:
        return { passed: false, error: `Unknown criteria type` };
    }
  }

  /**
   * Runs every eval case in order against the same agent, clearing the
   * agent's history before each one, and aggregates the results.
   *
   * The average score is weighted by each case's `weight` (a missing or zero
   * weight counts as 1).
   */
  async runEvalSuite(agent: Agent, evalCases: EvalCase[]): Promise<EvalRunSummary> {
    const runId = `eval_${Date.now()}`;
    const startTime = Date.now();
    const results: EvalResult[] = [];

    // Run evals sequentially for now (could parallelize)
    for (const evalCase of evalCases) {
      // Clear agent history between evals
      agent.clearHistory();

      const result = await this.runEval(agent, evalCase);
      results.push(result);
    }

    const passedCases = results.filter((r) => r.passed).length;
    const totalTokens = results.reduce((sum, r) => sum + r.tokenUsage, 0);
    const totalDuration = Date.now() - startTime;

    // Calculate weighted average score
    let weightedScore = 0;
    let totalWeight = 0;
    for (let i = 0; i < results.length; i++) {
      const weight = evalCases[i].weight || 1;
      weightedScore += results[i].score * weight;
      totalWeight += weight;
    }
    const averageScore = totalWeight > 0 ? weightedScore / totalWeight : 0;

    return {
      runId,
      timestamp: new Date(),
      totalCases: evalCases.length,
      passedCases,
      failedCases: evalCases.length - passedCases,
      averageScore,
      totalTokens,
      totalDuration,
      results,
    };
  }

  /**
   * Formats a run summary as plain text: a header with totals, then one line
   * per case with its status mark, score and any errors.
   */
  formatResults(summary: EvalRunSummary): string {
    let output = `\nEvaluation Results (${summary.runId})\n`;
    output += "=".repeat(50) + "\n\n";
    output += `Total: ${summary.totalCases} | Passed: ${summary.passedCases} | Failed: ${summary.failedCases}\n`;
    output += `Average Score: ${(summary.averageScore * 100).toFixed(1)}%\n`;
    output += `Tokens Used: ${summary.totalTokens.toLocaleString()}\n`;
    output += `Duration: ${(summary.totalDuration / 1000).toFixed(1)}s\n\n`;

    for (const result of summary.results) {
      const status = result.passed ? "✓" : "✗";
      output += `${status} ${result.caseId}: ${(result.score * 100).toFixed(0)}%`;
      if (result.errors && result.errors.length > 0) {
        output += `\n Errors: ${result.errors.join(", ")}`;
      }
      output += "\n";
    }

    return output;
  }
}
257
+
258
+ // Predefined eval cases for common agent capabilities
259
/**
 * Built-in eval cases covering basic agent capabilities: running bash,
 * reading a file, executing code, and chaining several tool calls.
 */
export const DEFAULT_EVAL_CASES: EvalCase[] = [
  // Shell access: the reply must mention a four-digit number (the year).
  {
    id: "bash_basic",
    name: "Basic Bash Execution",
    description: "Test that the agent can execute a simple bash command",
    input: "What is the current date? Use bash to find out.",
    successCriteria: [
      { type: "tool_called", value: "bash" },
      { type: "regex", value: "\\d{4}" },
    ],
  },

  // File access. NOTE(review): expects "pioneer" in the reply, so this
  // presumably assumes it runs from this project's root — confirm.
  {
    id: "file_read",
    name: "File Reading",
    description: "Test that the agent can read a file",
    input: "Read the contents of package.json and tell me the project name.",
    successCriteria: [
      { type: "tool_called", value: "read_file" },
      { type: "contains", value: "pioneer" },
    ],
  },

  // Sandboxed code execution: 5! = 120 must appear in the reply.
  {
    id: "code_execution",
    name: "Code Sandbox",
    description: "Test that the agent can execute code in a sandbox",
    input: "Calculate the factorial of 5 using Python.",
    successCriteria: [
      { type: "tool_called", value: "execute_code" },
      { type: "contains", value: "120" },
    ],
  },

  // Multi-step: both the listing tool and the file-reading tool must be used.
  {
    id: "multi_step",
    name: "Multi-step Task",
    description: "Test that the agent can complete a multi-step task",
    input: "List the files in the current directory, then read the README.md file and summarize it.",
    successCriteria: [
      { type: "tool_called", value: "list_directory" },
      { type: "tool_called", value: "read_file" },
    ],
  },
];
301
+