@fastino-ai/pioneer-cli 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/.claude/settings.local.json +7 -1
  2. package/.cursor/rules/api-documentation.mdc +14 -0
  3. package/.cursor/rules/backend-location-rule.mdc +5 -0
  4. package/Medical_NER_Dataset_1.jsonl +50 -0
  5. package/README.md +4 -1
  6. package/bun.lock +52 -0
  7. package/package.json +5 -2
  8. package/src/api.ts +551 -22
  9. package/src/chat/ChatApp.tsx +548 -263
  10. package/src/client/ToolExecutor.ts +175 -0
  11. package/src/client/WebSocketClient.ts +333 -0
  12. package/src/client/index.ts +2 -0
  13. package/src/config.ts +49 -139
  14. package/src/index.tsx +815 -107
  15. package/src/telemetry.ts +173 -0
  16. package/src/tests/config.test.ts +19 -0
  17. package/src/tools/bash.ts +1 -1
  18. package/src/tools/filesystem.ts +1 -1
  19. package/src/tools/index.ts +2 -9
  20. package/src/tools/sandbox.ts +1 -1
  21. package/src/tools/types.ts +25 -0
  22. package/src/utils/index.ts +6 -0
  23. package/fastino-ai-pioneer-cli-0.2.0.tgz +0 -0
  24. package/ner_dataset.json +0 -111
  25. package/src/agent/Agent.ts +0 -342
  26. package/src/agent/BudgetManager.ts +0 -167
  27. package/src/agent/LLMClient.ts +0 -435
  28. package/src/agent/ToolRegistry.ts +0 -97
  29. package/src/agent/index.ts +0 -15
  30. package/src/agent/types.ts +0 -84
  31. package/src/evolution/EvalRunner.ts +0 -301
  32. package/src/evolution/EvolutionEngine.ts +0 -319
  33. package/src/evolution/FeedbackCollector.ts +0 -197
  34. package/src/evolution/ModelTrainer.ts +0 -371
  35. package/src/evolution/index.ts +0 -18
  36. package/src/evolution/types.ts +0 -110
  37. package/src/tools/modal.ts +0 -269
  38. package/src/tools/training.ts +0 -443
  39. package/src/tools/wandb.ts +0 -348
  40. package/src/{agent → utils}/FileResolver.ts +0 -0
@@ -1,301 +0,0 @@
1
- /**
2
- * EvalRunner - Run evaluations to measure agent improvement
3
- */
4
-
5
- import type { Agent } from "../agent/Agent.js";
6
- import type {
7
- EvalCase,
8
- EvalResult,
9
- EvalRunSummary,
10
- EvalCriteria,
11
- } from "./types.js";
12
-
13
- export interface EvalRunnerConfig {
14
- timeout?: number;
15
- parallelism?: number;
16
- }
17
-
18
- export class EvalRunner {
19
- private config: EvalRunnerConfig;
20
-
21
- constructor(config: EvalRunnerConfig = {}) {
22
- this.config = {
23
- timeout: config.timeout || 60000,
24
- parallelism: config.parallelism || 1,
25
- };
26
- }
27
-
28
- async runEval(agent: Agent, evalCase: EvalCase): Promise<EvalResult> {
29
- const startTime = Date.now();
30
- const initialUsage = agent.getBudgetStatus().usage.tokensUsed;
31
-
32
- try {
33
- // Run the agent with the eval input
34
- const response = await agent.chat(evalCase.input, false);
35
- const duration = Date.now() - startTime;
36
- const tokenUsage = agent.getBudgetStatus().usage.tokensUsed - initialUsage;
37
-
38
- // Get tools that were called (from message history)
39
- const messages = agent.getMessages();
40
- const toolsCalled: string[] = [];
41
- for (const msg of messages) {
42
- if (msg.toolCalls) {
43
- for (const tc of msg.toolCalls) {
44
- toolsCalled.push(tc.name);
45
- }
46
- }
47
- }
48
-
49
- // Evaluate criteria
50
- const { passed, score, errors } = this.evaluateCriteria(
51
- response,
52
- toolsCalled,
53
- evalCase
54
- );
55
-
56
- return {
57
- caseId: evalCase.id,
58
- passed,
59
- score,
60
- actualOutput: response,
61
- toolsCalled,
62
- errors,
63
- duration,
64
- tokenUsage,
65
- };
66
- } catch (error) {
67
- return {
68
- caseId: evalCase.id,
69
- passed: false,
70
- score: 0,
71
- actualOutput: "",
72
- toolsCalled: [],
73
- errors: [error instanceof Error ? error.message : String(error)],
74
- duration: Date.now() - startTime,
75
- tokenUsage: agent.getBudgetStatus().usage.tokensUsed - initialUsage,
76
- };
77
- }
78
- }
79
-
80
- private evaluateCriteria(
81
- output: string,
82
- toolsCalled: string[],
83
- evalCase: EvalCase
84
- ): { passed: boolean; score: number; errors: string[] } {
85
- const errors: string[] = [];
86
- let passedCriteria = 0;
87
- const totalCriteria = evalCase.successCriteria.length;
88
-
89
- for (const criteria of evalCase.successCriteria) {
90
- const { passed, error } = this.checkCriterion(
91
- output,
92
- toolsCalled,
93
- criteria
94
- );
95
- if (passed) {
96
- passedCriteria++;
97
- } else if (error) {
98
- errors.push(error);
99
- }
100
- }
101
-
102
- // Check expected output if provided
103
- if (evalCase.expectedOutput) {
104
- if (output.toLowerCase().includes(evalCase.expectedOutput.toLowerCase())) {
105
- passedCriteria++;
106
- } else {
107
- errors.push(`Expected output not found: "${evalCase.expectedOutput}"`);
108
- }
109
- }
110
-
111
- // Check expected tool calls if provided
112
- if (evalCase.expectedToolCalls) {
113
- const missingTools = evalCase.expectedToolCalls.filter(
114
- (t) => !toolsCalled.includes(t)
115
- );
116
- if (missingTools.length === 0) {
117
- passedCriteria++;
118
- } else {
119
- errors.push(`Expected tools not called: ${missingTools.join(", ")}`);
120
- }
121
- }
122
-
123
- const score = totalCriteria > 0 ? passedCriteria / totalCriteria : 1;
124
- const passed = errors.length === 0 && score >= 0.8;
125
-
126
- return { passed, score, errors };
127
- }
128
-
129
- private checkCriterion(
130
- output: string,
131
- toolsCalled: string[],
132
- criteria: EvalCriteria
133
- ): { passed: boolean; error?: string } {
134
- switch (criteria.type) {
135
- case "contains":
136
- if (output.toLowerCase().includes(criteria.value.toLowerCase())) {
137
- return { passed: true };
138
- }
139
- return {
140
- passed: false,
141
- error: `Output should contain: "${criteria.value}"`,
142
- };
143
-
144
- case "not_contains":
145
- if (!output.toLowerCase().includes(criteria.value.toLowerCase())) {
146
- return { passed: true };
147
- }
148
- return {
149
- passed: false,
150
- error: `Output should not contain: "${criteria.value}"`,
151
- };
152
-
153
- case "tool_called":
154
- if (toolsCalled.includes(criteria.value)) {
155
- return { passed: true };
156
- }
157
- return {
158
- passed: false,
159
- error: `Tool should be called: "${criteria.value}"`,
160
- };
161
-
162
- case "tool_not_called":
163
- if (!toolsCalled.includes(criteria.value)) {
164
- return { passed: true };
165
- }
166
- return {
167
- passed: false,
168
- error: `Tool should not be called: "${criteria.value}"`,
169
- };
170
-
171
- case "regex":
172
- try {
173
- const regex = new RegExp(criteria.value, "i");
174
- if (regex.test(output)) {
175
- return { passed: true };
176
- }
177
- return {
178
- passed: false,
179
- error: `Output should match pattern: "${criteria.value}"`,
180
- };
181
- } catch {
182
- return { passed: false, error: `Invalid regex: "${criteria.value}"` };
183
- }
184
-
185
- case "custom":
186
- // Custom criteria would need to be implemented separately
187
- return { passed: true };
188
-
189
- default:
190
- return { passed: false, error: `Unknown criteria type` };
191
- }
192
- }
193
-
194
- async runEvalSuite(agent: Agent, evalCases: EvalCase[]): Promise<EvalRunSummary> {
195
- const runId = `eval_${Date.now()}`;
196
- const startTime = Date.now();
197
- const results: EvalResult[] = [];
198
-
199
- // Run evals sequentially for now (could parallelize)
200
- for (const evalCase of evalCases) {
201
- // Clear agent history between evals
202
- agent.clearHistory();
203
-
204
- const result = await this.runEval(agent, evalCase);
205
- results.push(result);
206
- }
207
-
208
- const passedCases = results.filter((r) => r.passed).length;
209
- const totalTokens = results.reduce((sum, r) => sum + r.tokenUsage, 0);
210
- const totalDuration = Date.now() - startTime;
211
-
212
- // Calculate weighted average score
213
- let weightedScore = 0;
214
- let totalWeight = 0;
215
- for (let i = 0; i < results.length; i++) {
216
- const weight = evalCases[i].weight || 1;
217
- weightedScore += results[i].score * weight;
218
- totalWeight += weight;
219
- }
220
- const averageScore = totalWeight > 0 ? weightedScore / totalWeight : 0;
221
-
222
- return {
223
- runId,
224
- timestamp: new Date(),
225
- totalCases: evalCases.length,
226
- passedCases,
227
- failedCases: evalCases.length - passedCases,
228
- averageScore,
229
- totalTokens,
230
- totalDuration,
231
- results,
232
- };
233
- }
234
-
235
- // Format results for display
236
- formatResults(summary: EvalRunSummary): string {
237
- let output = `\nEvaluation Results (${summary.runId})\n`;
238
- output += "=".repeat(50) + "\n\n";
239
- output += `Total: ${summary.totalCases} | Passed: ${summary.passedCases} | Failed: ${summary.failedCases}\n`;
240
- output += `Average Score: ${(summary.averageScore * 100).toFixed(1)}%\n`;
241
- output += `Tokens Used: ${summary.totalTokens.toLocaleString()}\n`;
242
- output += `Duration: ${(summary.totalDuration / 1000).toFixed(1)}s\n\n`;
243
-
244
- for (const result of summary.results) {
245
- const status = result.passed ? "✓" : "✗";
246
- const color = result.passed ? "green" : "red";
247
- output += `${status} ${result.caseId}: ${(result.score * 100).toFixed(0)}%`;
248
- if (result.errors && result.errors.length > 0) {
249
- output += `\n Errors: ${result.errors.join(", ")}`;
250
- }
251
- output += "\n";
252
- }
253
-
254
- return output;
255
- }
256
- }
257
-
258
- // Predefined eval cases for common agent capabilities
259
- export const DEFAULT_EVAL_CASES: EvalCase[] = [
260
- {
261
- id: "bash_basic",
262
- name: "Basic Bash Execution",
263
- description: "Test that the agent can execute a simple bash command",
264
- input: "What is the current date? Use bash to find out.",
265
- successCriteria: [
266
- { type: "tool_called", value: "bash" },
267
- { type: "regex", value: "\\d{4}" }, // Year should appear
268
- ],
269
- },
270
- {
271
- id: "file_read",
272
- name: "File Reading",
273
- description: "Test that the agent can read a file",
274
- input: "Read the contents of package.json and tell me the project name.",
275
- successCriteria: [
276
- { type: "tool_called", value: "read_file" },
277
- { type: "contains", value: "pioneer" },
278
- ],
279
- },
280
- {
281
- id: "code_execution",
282
- name: "Code Sandbox",
283
- description: "Test that the agent can execute code in a sandbox",
284
- input: "Calculate the factorial of 5 using Python.",
285
- successCriteria: [
286
- { type: "tool_called", value: "execute_code" },
287
- { type: "contains", value: "120" },
288
- ],
289
- },
290
- {
291
- id: "multi_step",
292
- name: "Multi-step Task",
293
- description: "Test that the agent can complete a multi-step task",
294
- input: "List the files in the current directory, then read the README.md file and summarize it.",
295
- successCriteria: [
296
- { type: "tool_called", value: "list_directory" },
297
- { type: "tool_called", value: "read_file" },
298
- ],
299
- },
300
- ];
301
-
@@ -1,319 +0,0 @@
1
- /**
2
- * EvolutionEngine - Core self-improvement loop
3
- * Iteratively improves the agent based on evaluations and feedback
4
- */
5
-
6
- import * as fs from "fs";
7
- import * as path from "path";
8
- import * as os from "os";
9
- import type { Agent } from "../agent/Agent.js";
10
- import { FeedbackCollector } from "./FeedbackCollector.js";
11
- import { EvalRunner, DEFAULT_EVAL_CASES } from "./EvalRunner.js";
12
- import { ModelTrainer, type TrainingResult } from "./ModelTrainer.js";
13
- import type {
14
- EvolutionConfig,
15
- EvolutionState,
16
- EvolutionHistory,
17
- EvalCase,
18
- EvalRunSummary,
19
- } from "./types.js";
20
-
21
- export interface EvolutionEngineConfig {
22
- storagePath?: string;
23
- evalCases?: EvalCase[];
24
- targetScore?: number;
25
- maxIterations?: number;
26
- budgetPerIteration?: {
27
- maxTokens?: number;
28
- maxCost?: number;
29
- maxTime?: number;
30
- };
31
- trainingConfig?: {
32
- provider: "openai" | "modal" | "local";
33
- baseModel: string;
34
- };
35
- }
36
-
37
- export interface EvolutionEvents {
38
- onIterationStart?: (iteration: number) => void;
39
- onIterationEnd?: (iteration: number, score: number) => void;
40
- onEvalComplete?: (summary: EvalRunSummary) => void;
41
- onTrainingComplete?: (result: TrainingResult) => void;
42
- onBudgetWarning?: (message: string) => void;
43
- onComplete?: (state: EvolutionState) => void;
44
- onError?: (error: Error) => void;
45
- }
46
-
47
- export class EvolutionEngine {
48
- private config: EvolutionEngineConfig;
49
- private storagePath: string;
50
- private feedbackCollector: FeedbackCollector;
51
- private evalRunner: EvalRunner;
52
- private modelTrainer: ModelTrainer | null = null;
53
- private state: EvolutionState;
54
- private events: EvolutionEvents;
55
-
56
- constructor(config: EvolutionEngineConfig, events: EvolutionEvents = {}) {
57
- this.config = config;
58
- this.events = events;
59
- this.storagePath =
60
- config.storagePath || path.join(os.homedir(), ".pioneer", "evolution");
61
- this.ensureStoragePath();
62
-
63
- this.feedbackCollector = new FeedbackCollector({
64
- storagePath: path.join(this.storagePath, "feedback"),
65
- });
66
-
67
- this.evalRunner = new EvalRunner();
68
-
69
- if (config.trainingConfig) {
70
- this.modelTrainer = new ModelTrainer({
71
- provider: config.trainingConfig.provider,
72
- baseModel: config.trainingConfig.baseModel,
73
- outputDir: path.join(this.storagePath, "models"),
74
- });
75
- }
76
-
77
- this.state = this.loadState() || this.createInitialState();
78
- }
79
-
80
- private ensureStoragePath(): void {
81
- if (!fs.existsSync(this.storagePath)) {
82
- fs.mkdirSync(this.storagePath, { recursive: true });
83
- }
84
- }
85
-
86
- private createInitialState(): EvolutionState {
87
- return {
88
- iteration: 0,
89
- currentScore: 0,
90
- bestScore: 0,
91
- bestPrompt: "",
92
- history: [],
93
- totalTokensUsed: 0,
94
- totalCostUsed: 0,
95
- totalTimeUsed: 0,
96
- startTime: new Date(),
97
- status: "running",
98
- };
99
- }
100
-
101
- private loadState(): EvolutionState | null {
102
- const statePath = path.join(this.storagePath, "state.json");
103
- try {
104
- if (fs.existsSync(statePath)) {
105
- const data = fs.readFileSync(statePath, "utf-8");
106
- return JSON.parse(data);
107
- }
108
- } catch {
109
- // Ignore errors
110
- }
111
- return null;
112
- }
113
-
114
- private saveState(): void {
115
- const statePath = path.join(this.storagePath, "state.json");
116
- fs.writeFileSync(statePath, JSON.stringify(this.state, null, 2));
117
- }
118
-
119
- async evolve(agent: Agent): Promise<EvolutionState> {
120
- const evalCases = this.config.evalCases || DEFAULT_EVAL_CASES;
121
- const targetScore = this.config.targetScore || 0.9;
122
- const maxIterations = this.config.maxIterations || 10;
123
-
124
- this.state.status = "running";
125
- this.state.startTime = new Date();
126
-
127
- while (
128
- this.state.iteration < maxIterations &&
129
- this.state.status === "running"
130
- ) {
131
- // Check budget
132
- if (!this.checkBudget()) {
133
- this.state.status = "budget_exhausted";
134
- break;
135
- }
136
-
137
- this.state.iteration++;
138
- this.events.onIterationStart?.(this.state.iteration);
139
-
140
- try {
141
- // Run evaluation
142
- const evalSummary = await this.evalRunner.runEvalSuite(agent, evalCases);
143
- this.events.onEvalComplete?.(evalSummary);
144
-
145
- // Update state
146
- this.state.currentScore = evalSummary.averageScore;
147
- this.state.totalTokensUsed += evalSummary.totalTokens;
148
- this.state.totalTimeUsed += evalSummary.totalDuration / 1000;
149
-
150
- // Check if target reached
151
- if (this.state.currentScore >= targetScore) {
152
- this.state.status = "completed";
153
- this.events.onIterationEnd?.(this.state.iteration, this.state.currentScore);
154
- break;
155
- }
156
-
157
- // Update best if improved
158
- if (this.state.currentScore > this.state.bestScore) {
159
- this.state.bestScore = this.state.currentScore;
160
- // Save the current configuration as best
161
- }
162
-
163
- // Attempt improvement
164
- await this.attemptImprovement(agent, evalSummary);
165
-
166
- // Record history
167
- this.state.history.push({
168
- iteration: this.state.iteration,
169
- prompt: "", // Would store the current prompt
170
- evalScore: this.state.currentScore,
171
- changes: "Prompt/model adjustment",
172
- timestamp: new Date(),
173
- });
174
-
175
- this.events.onIterationEnd?.(this.state.iteration, this.state.currentScore);
176
- this.saveState();
177
- } catch (error) {
178
- this.events.onError?.(
179
- error instanceof Error ? error : new Error(String(error))
180
- );
181
- this.state.status = "failed";
182
- break;
183
- }
184
- }
185
-
186
- this.state.endTime = new Date();
187
- this.saveState();
188
- this.events.onComplete?.(this.state);
189
-
190
- return this.state;
191
- }
192
-
193
- private checkBudget(): boolean {
194
- const budget = this.config.budgetPerIteration;
195
- if (!budget) return true;
196
-
197
- if (budget.maxTokens && this.state.totalTokensUsed >= budget.maxTokens) {
198
- this.events.onBudgetWarning?.("Token budget exhausted");
199
- return false;
200
- }
201
-
202
- if (budget.maxCost && this.state.totalCostUsed >= budget.maxCost) {
203
- this.events.onBudgetWarning?.("Cost budget exhausted");
204
- return false;
205
- }
206
-
207
- if (budget.maxTime && this.state.totalTimeUsed >= budget.maxTime) {
208
- this.events.onBudgetWarning?.("Time budget exhausted");
209
- return false;
210
- }
211
-
212
- return true;
213
- }
214
-
215
- private async attemptImprovement(
216
- agent: Agent,
217
- evalSummary: EvalRunSummary
218
- ): Promise<void> {
219
- // Get feedback for training
220
- const trainingData = this.feedbackCollector.toTrainingData();
221
-
222
- // If we have enough training data, attempt fine-tuning
223
- if (trainingData.length >= 50 && this.modelTrainer) {
224
- try {
225
- const result = await this.modelTrainer.train(trainingData);
226
- this.events.onTrainingComplete?.(result);
227
-
228
- if (result.success && result.modelId) {
229
- // Would update agent to use the new model
230
- console.log(`New model trained: ${result.modelId}`);
231
- }
232
- } catch (error) {
233
- this.events.onError?.(
234
- error instanceof Error ? error : new Error(String(error))
235
- );
236
- }
237
- }
238
-
239
- // Analyze failed cases and suggest improvements
240
- const failedCases = evalSummary.results.filter((r) => !r.passed);
241
- if (failedCases.length > 0) {
242
- // Could use the LLM to analyze failures and suggest prompt improvements
243
- console.log(
244
- `Analyzing ${failedCases.length} failed cases for improvement...`
245
- );
246
- }
247
- }
248
-
249
- // Record interaction feedback
250
- recordInteraction(params: {
251
- sessionId: string;
252
- userMessage: string;
253
- agentResponse: string;
254
- toolCalls: string[];
255
- wasSuccessful: boolean;
256
- metadata?: Record<string, unknown>;
257
- }): string {
258
- return this.feedbackCollector.recordInteraction(params);
259
- }
260
-
261
- // Add user rating to feedback
262
- rateFeedback(feedbackId: string, rating: number, corrections?: string): void {
263
- this.feedbackCollector.addRating(feedbackId, rating, corrections);
264
- }
265
-
266
- // Get evolution statistics
267
- getStats(): {
268
- state: EvolutionState;
269
- feedbackStats: ReturnType<FeedbackCollector["getStats"]>;
270
- } {
271
- return {
272
- state: this.state,
273
- feedbackStats: this.feedbackCollector.getStats(),
274
- };
275
- }
276
-
277
- // Reset evolution state
278
- reset(): void {
279
- this.state = this.createInitialState();
280
- this.saveState();
281
- }
282
-
283
- // Export feedback for external training
284
- exportFeedback(format: "jsonl" | "openai", outputPath: string): void {
285
- if (format === "openai") {
286
- this.feedbackCollector.exportAsOpenAIFormat(outputPath);
287
- } else {
288
- this.feedbackCollector.exportAsJsonl(outputPath);
289
- }
290
- }
291
-
292
- // Run a single evaluation cycle
293
- async runEvaluation(agent: Agent): Promise<EvalRunSummary> {
294
- const evalCases = this.config.evalCases || DEFAULT_EVAL_CASES;
295
- return this.evalRunner.runEvalSuite(agent, evalCases);
296
- }
297
-
298
- // Get formatted results
299
- formatResults(): string {
300
- let output = "\nEvolution Engine Status\n";
301
- output += "=".repeat(50) + "\n\n";
302
- output += `Status: ${this.state.status}\n`;
303
- output += `Iteration: ${this.state.iteration}\n`;
304
- output += `Current Score: ${(this.state.currentScore * 100).toFixed(1)}%\n`;
305
- output += `Best Score: ${(this.state.bestScore * 100).toFixed(1)}%\n`;
306
- output += `Tokens Used: ${this.state.totalTokensUsed.toLocaleString()}\n`;
307
- output += `Time Used: ${this.state.totalTimeUsed.toFixed(1)}s\n\n`;
308
-
309
- const feedbackStats = this.feedbackCollector.getStats();
310
- output += "Feedback Statistics:\n";
311
- output += ` Total: ${feedbackStats.total}\n`;
312
- output += ` Rated: ${feedbackStats.rated}\n`;
313
- output += ` Avg Rating: ${feedbackStats.avgRating}/5\n`;
314
- output += ` Success Rate: ${feedbackStats.successRate}%\n`;
315
-
316
- return output;
317
- }
318
- }
319
-