@fastino-ai/pioneer-cli 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +7 -1
- package/.cursor/rules/api-documentation.mdc +14 -0
- package/.cursor/rules/backend-location-rule.mdc +5 -0
- package/Medical_NER_Dataset_1.jsonl +50 -0
- package/README.md +4 -1
- package/bun.lock +52 -0
- package/package.json +5 -2
- package/src/api.ts +551 -22
- package/src/chat/ChatApp.tsx +548 -263
- package/src/client/ToolExecutor.ts +175 -0
- package/src/client/WebSocketClient.ts +333 -0
- package/src/client/index.ts +2 -0
- package/src/config.ts +49 -139
- package/src/index.tsx +815 -107
- package/src/telemetry.ts +173 -0
- package/src/tests/config.test.ts +19 -0
- package/src/tools/bash.ts +1 -1
- package/src/tools/filesystem.ts +1 -1
- package/src/tools/index.ts +2 -9
- package/src/tools/sandbox.ts +1 -1
- package/src/tools/types.ts +25 -0
- package/src/utils/index.ts +6 -0
- package/fastino-ai-pioneer-cli-0.2.0.tgz +0 -0
- package/ner_dataset.json +0 -111
- package/src/agent/Agent.ts +0 -342
- package/src/agent/BudgetManager.ts +0 -167
- package/src/agent/LLMClient.ts +0 -435
- package/src/agent/ToolRegistry.ts +0 -97
- package/src/agent/index.ts +0 -15
- package/src/agent/types.ts +0 -84
- package/src/evolution/EvalRunner.ts +0 -301
- package/src/evolution/EvolutionEngine.ts +0 -319
- package/src/evolution/FeedbackCollector.ts +0 -197
- package/src/evolution/ModelTrainer.ts +0 -371
- package/src/evolution/index.ts +0 -18
- package/src/evolution/types.ts +0 -110
- package/src/tools/modal.ts +0 -269
- package/src/tools/training.ts +0 -443
- package/src/tools/wandb.ts +0 -348
- /package/src/{agent → utils}/FileResolver.ts +0 -0
|
@@ -1,301 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* EvalRunner - Run evaluations to measure agent improvement
|
|
3
|
-
*/
|
|
4
|
-
|
|
5
|
-
import type { Agent } from "../agent/Agent.js";
|
|
6
|
-
import type {
|
|
7
|
-
EvalCase,
|
|
8
|
-
EvalResult,
|
|
9
|
-
EvalRunSummary,
|
|
10
|
-
EvalCriteria,
|
|
11
|
-
} from "./types.js";
|
|
12
|
-
|
|
13
|
-
/**
 * Optional tuning knobs accepted by the `EvalRunner` constructor.
 *
 * NOTE(review): the runner stores both values but nothing visible in this
 * file reads them back, so neither currently changes how evals execute.
 */
export interface EvalRunnerConfig {
  /** Per-eval time limit; the runner falls back to 60000 (presumably milliseconds — confirm). */
  timeout?: number;

  /** Requested number of concurrent evals; the runner falls back to 1. */
  parallelism?: number;
}
|
|
17
|
-
|
|
18
|
-
/**
 * Runs evaluation cases against an agent and scores the agent's responses.
 *
 * Each case is scored as the fraction of its checks that pass. The checks are
 * every entry in `successCriteria`, plus one check for `expectedOutput` and
 * one for `expectedToolCalls` when the case defines them.
 */
export class EvalRunner {
  /**
   * Settings with defaults applied.
   * NOTE(review): nothing in this class reads these yet — cases run
   * sequentially and no timeout is enforced.
   */
  private config: EvalRunnerConfig;

  /**
   * @param config Optional settings. Falsy values (including 0) fall back to
   *   the defaults of 60000 for `timeout` and 1 for `parallelism`.
   */
  constructor(config: EvalRunnerConfig = {}) {
    this.config = {
      timeout: config.timeout || 60000,
      parallelism: config.parallelism || 1,
    };
  }

  /**
   * Sends one eval case to the agent and scores the reply.
   *
   * Never rejects because of a failure inside `agent.chat`: such an error is
   * turned into a failed result with score 0 and the error message in `errors`.
   *
   * @param agent Agent under test.
   * @param evalCase Case providing the input and the checks to apply.
   * @returns The scored result, including wall-clock duration and token delta.
   */
  async runEval(agent: Agent, evalCase: EvalCase): Promise<EvalResult> {
    const startTime = Date.now();
    const initialUsage = agent.getBudgetStatus().usage.tokensUsed;

    try {
      // Second argument presumably disables streaming — confirm against Agent.chat.
      const response = await agent.chat(evalCase.input, false);
      const duration = Date.now() - startTime;
      const tokenUsage = agent.getBudgetStatus().usage.tokensUsed - initialUsage;

      // Tool names are collected from the agent's whole message history.
      // NOTE(review): callers that do not clear history first (runEvalSuite
      // does) will also see tool calls made before this eval.
      const toolsCalled: string[] = [];
      for (const msg of agent.getMessages()) {
        if (msg.toolCalls) {
          for (const tc of msg.toolCalls) {
            toolsCalled.push(tc.name);
          }
        }
      }

      const { passed, score, errors } = this.evaluateCriteria(
        response,
        toolsCalled,
        evalCase
      );

      return {
        caseId: evalCase.id,
        passed,
        score,
        actualOutput: response,
        toolsCalled,
        errors,
        duration,
        tokenUsage,
      };
    } catch (error) {
      return {
        caseId: evalCase.id,
        passed: false,
        score: 0,
        actualOutput: "",
        toolsCalled: [],
        errors: [error instanceof Error ? error.message : String(error)],
        duration: Date.now() - startTime,
        tokenUsage: agent.getBudgetStatus().usage.tokensUsed - initialUsage,
      };
    }
  }

  /**
   * Applies every check of an eval case to the agent output.
   *
   * @param output Text returned by the agent.
   * @param toolsCalled Names of the tools the agent invoked.
   * @param evalCase Case whose checks are applied.
   * @returns `score` in the range 0..1 (1 when the case defines no checks),
   *   the collected failure messages, and `passed`, which requires no
   *   failure messages and a score of at least 0.8.
   */
  private evaluateCriteria(
    output: string,
    toolsCalled: string[],
    evalCase: EvalCase
  ): { passed: boolean; score: number; errors: string[] } {
    const errors: string[] = [];
    let passedCriteria = 0;
    // The optional expectedOutput / expectedToolCalls checks below add to this
    // total so that the denominator always matches the number of checks run.
    let totalCriteria = evalCase.successCriteria.length;

    for (const criteria of evalCase.successCriteria) {
      const { passed, error } = this.checkCriterion(
        output,
        toolsCalled,
        criteria
      );
      if (passed) {
        passedCriteria++;
      } else if (error) {
        errors.push(error);
      }
    }

    // Expected output: case-insensitive substring match.
    if (evalCase.expectedOutput) {
      totalCriteria++;
      if (output.toLowerCase().includes(evalCase.expectedOutput.toLowerCase())) {
        passedCriteria++;
      } else {
        errors.push(`Expected output not found: "${evalCase.expectedOutput}"`);
      }
    }

    // Expected tool calls: every listed tool must have been called.
    if (evalCase.expectedToolCalls) {
      totalCriteria++;
      const missingTools = evalCase.expectedToolCalls.filter(
        (t) => !toolsCalled.includes(t)
      );
      if (missingTools.length === 0) {
        passedCriteria++;
      } else {
        errors.push(`Expected tools not called: ${missingTools.join(", ")}`);
      }
    }

    const score = totalCriteria > 0 ? passedCriteria / totalCriteria : 1;
    const passed = errors.length === 0 && score >= 0.8;

    return { passed, score, errors };
  }

  /**
   * Evaluates a single success criterion.
   *
   * @param output Text returned by the agent.
   * @param toolsCalled Names of the tools the agent invoked.
   * @param criteria Criterion to check.
   * @returns Whether the criterion passed, with a message when it did not.
   */
  private checkCriterion(
    output: string,
    toolsCalled: string[],
    criteria: EvalCriteria
  ): { passed: boolean; error?: string } {
    switch (criteria.type) {
      case "contains":
        // Case-insensitive substring match.
        if (output.toLowerCase().includes(criteria.value.toLowerCase())) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Output should contain: "${criteria.value}"`,
        };

      case "not_contains":
        if (!output.toLowerCase().includes(criteria.value.toLowerCase())) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Output should not contain: "${criteria.value}"`,
        };

      case "tool_called":
        // Tool names are compared exactly (case-sensitive).
        if (toolsCalled.includes(criteria.value)) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Tool should be called: "${criteria.value}"`,
        };

      case "tool_not_called":
        if (!toolsCalled.includes(criteria.value)) {
          return { passed: true };
        }
        return {
          passed: false,
          error: `Tool should not be called: "${criteria.value}"`,
        };

      case "regex":
        try {
          // The pattern is compiled case-insensitively; a malformed pattern
          // fails the criterion instead of throwing.
          const regex = new RegExp(criteria.value, "i");
          if (regex.test(output)) {
            return { passed: true };
          }
          return {
            passed: false,
            error: `Output should match pattern: "${criteria.value}"`,
          };
        } catch {
          return { passed: false, error: `Invalid regex: "${criteria.value}"` };
        }

      case "custom":
        // Custom criteria are not implemented here and always count as passed.
        return { passed: true };

      default:
        return { passed: false, error: `Unknown criteria type` };
    }
  }

  /**
   * Runs every case in order, clearing the agent's history before each one.
   *
   * @param agent Agent under test.
   * @param evalCases Cases to run.
   * @returns Totals plus the weighted average score. A case's weight defaults
   *   to 1 when missing or falsy (so a weight of 0 also counts as 1).
   */
  async runEvalSuite(agent: Agent, evalCases: EvalCase[]): Promise<EvalRunSummary> {
    const runId = `eval_${Date.now()}`;
    const startTime = Date.now();
    const results: EvalResult[] = [];
    let weightedScore = 0;
    let totalWeight = 0;

    // Cases run one at a time; they share a single agent instance.
    for (const evalCase of evalCases) {
      agent.clearHistory();

      const result = await this.runEval(agent, evalCase);
      results.push(result);

      const weight = evalCase.weight || 1;
      weightedScore += result.score * weight;
      totalWeight += weight;
    }

    const passedCases = results.filter((r) => r.passed).length;
    const totalTokens = results.reduce((sum, r) => sum + r.tokenUsage, 0);
    const totalDuration = Date.now() - startTime;
    const averageScore = totalWeight > 0 ? weightedScore / totalWeight : 0;

    return {
      runId,
      timestamp: new Date(),
      totalCases: evalCases.length,
      passedCases,
      failedCases: evalCases.length - passedCases,
      averageScore,
      totalTokens,
      totalDuration,
      results,
    };
  }

  /**
   * Renders a run summary as plain text for display.
   *
   * @param summary Summary produced by `runEvalSuite`.
   * @returns Multi-line report: header, totals, then one line per case.
   */
  formatResults(summary: EvalRunSummary): string {
    let output = `\nEvaluation Results (${summary.runId})\n`;
    output += "=".repeat(50) + "\n\n";
    output += `Total: ${summary.totalCases} | Passed: ${summary.passedCases} | Failed: ${summary.failedCases}\n`;
    output += `Average Score: ${(summary.averageScore * 100).toFixed(1)}%\n`;
    output += `Tokens Used: ${summary.totalTokens.toLocaleString()}\n`;
    output += `Duration: ${(summary.totalDuration / 1000).toFixed(1)}s\n\n`;

    for (const result of summary.results) {
      const status = result.passed ? "✓" : "✗";
      output += `${status} ${result.caseId}: ${(result.score * 100).toFixed(0)}%`;
      if (result.errors && result.errors.length > 0) {
        output += `\n Errors: ${result.errors.join(", ")}`;
      }
      output += "\n";
    }

    return output;
  }
}
|
|
257
|
-
|
|
258
|
-
/**
 * Built-in eval cases covering common agent capabilities: running a bash
 * command, reading a file, executing code, and chaining several tools.
 */
export const DEFAULT_EVAL_CASES: EvalCase[] = [
  // Shell access: the reply must contain a four-digit number (the year).
  {
    id: "bash_basic",
    name: "Basic Bash Execution",
    description: "Test that the agent can execute a simple bash command",
    input: "What is the current date? Use bash to find out.",
    successCriteria: [
      { type: "tool_called", value: "bash" },
      { type: "regex", value: "\\d{4}" },
    ],
  },

  // File access: the reply must mention "pioneer".
  {
    id: "file_read",
    name: "File Reading",
    description: "Test that the agent can read a file",
    input: "Read the contents of package.json and tell me the project name.",
    successCriteria: [
      { type: "tool_called", value: "read_file" },
      { type: "contains", value: "pioneer" },
    ],
  },

  // Code execution: 5! = 120 must appear in the reply.
  {
    id: "code_execution",
    name: "Code Sandbox",
    description: "Test that the agent can execute code in a sandbox",
    input: "Calculate the factorial of 5 using Python.",
    successCriteria: [
      { type: "tool_called", value: "execute_code" },
      { type: "contains", value: "120" },
    ],
  },

  // Tool chaining: both the directory listing and the file read must happen.
  {
    id: "multi_step",
    name: "Multi-step Task",
    description: "Test that the agent can complete a multi-step task",
    input:
      "List the files in the current directory, then read the README.md file and summarize it.",
    successCriteria: [
      { type: "tool_called", value: "list_directory" },
      { type: "tool_called", value: "read_file" },
    ],
  },
];
|
|
301
|
-
|
|
@@ -1,319 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* EvolutionEngine - Core self-improvement loop
|
|
3
|
-
* Iteratively improves the agent based on evaluations and feedback
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import * as fs from "fs";
|
|
7
|
-
import * as path from "path";
|
|
8
|
-
import * as os from "os";
|
|
9
|
-
import type { Agent } from "../agent/Agent.js";
|
|
10
|
-
import { FeedbackCollector } from "./FeedbackCollector.js";
|
|
11
|
-
import { EvalRunner, DEFAULT_EVAL_CASES } from "./EvalRunner.js";
|
|
12
|
-
import { ModelTrainer, type TrainingResult } from "./ModelTrainer.js";
|
|
13
|
-
import type {
|
|
14
|
-
EvolutionConfig,
|
|
15
|
-
EvolutionState,
|
|
16
|
-
EvolutionHistory,
|
|
17
|
-
EvalCase,
|
|
18
|
-
EvalRunSummary,
|
|
19
|
-
} from "./types.js";
|
|
20
|
-
|
|
21
|
-
/** Options accepted by the `EvolutionEngine` constructor. */
export interface EvolutionEngineConfig {
  /** Directory for persisted data; the engine defaults to `~/.pioneer/evolution`. */
  storagePath?: string;

  /** Cases to evaluate with; the engine falls back to `DEFAULT_EVAL_CASES`. */
  evalCases?: EvalCase[];

  /** Average score at which evolution stops as completed; the engine falls back to 0.9. */
  targetScore?: number;

  /** Upper bound on the iteration counter; the engine falls back to 10. */
  maxIterations?: number;

  /**
   * Spending limits checked before every iteration.
   * NOTE(review): despite the name, the engine compares these against the
   * running totals kept in its state, not against per-iteration usage.
   */
  budgetPerIteration?: {
    /** Limit on total tokens used. */
    maxTokens?: number;
    /** Limit on total cost used. */
    maxCost?: number;
    /** Limit on total time used; the engine accumulates this in seconds. */
    maxTime?: number;
  };

  /** When present, the engine creates a `ModelTrainer` with these settings. */
  trainingConfig?: {
    provider: "openai" | "modal" | "local";
    baseModel: string;
  };
}
|
|
36
|
-
|
|
37
|
-
/** Optional callbacks through which `EvolutionEngine` reports progress. */
export interface EvolutionEvents {
  /** Called when an iteration begins, with its 1-based number. */
  onIterationStart?: (iteration: number) => void;

  /** Called when an iteration finishes, with its number and the score it reached. */
  onIterationEnd?: (iteration: number, score: number) => void;

  /** Called with the summary of each evaluation run. */
  onEvalComplete?: (summary: EvalRunSummary) => void;

  /** Called with the outcome of each training attempt. */
  onTrainingComplete?: (result: TrainingResult) => void;

  /** Called with a short message when a budget limit stops the loop. */
  onBudgetWarning?: (message: string) => void;

  /** Called once with the final state when `evolve` is about to return. */
  onComplete?: (state: EvolutionState) => void;

  /** Called with any error raised during an iteration or a training attempt. */
  onError?: (error: Error) => void;
}
|
|
46
|
-
|
|
47
|
-
/**
 * Drives the evaluate-then-improve loop for an agent and persists progress.
 *
 * State is kept in `state.json` inside the storage directory and is reloaded
 * by the constructor, so the iteration counter and totals carry over between
 * engine instances that share a storage path.
 */
export class EvolutionEngine {
  private config: EvolutionEngineConfig;
  private storagePath: string;
  private feedbackCollector: FeedbackCollector;
  private evalRunner: EvalRunner;
  /** Only created when `trainingConfig` is supplied. */
  private modelTrainer: ModelTrainer | null = null;
  private state: EvolutionState;
  private events: EvolutionEvents;

  /**
   * Creates the storage directory if needed and loads any saved state.
   *
   * @param config Engine options; `storagePath` defaults to
   *   `~/.pioneer/evolution`.
   * @param events Optional progress callbacks.
   */
  constructor(config: EvolutionEngineConfig, events: EvolutionEvents = {}) {
    this.config = config;
    this.events = events;
    this.storagePath =
      config.storagePath || path.join(os.homedir(), ".pioneer", "evolution");
    this.ensureStoragePath();

    this.feedbackCollector = new FeedbackCollector({
      storagePath: path.join(this.storagePath, "feedback"),
    });

    this.evalRunner = new EvalRunner();

    if (config.trainingConfig) {
      this.modelTrainer = new ModelTrainer({
        provider: config.trainingConfig.provider,
        baseModel: config.trainingConfig.baseModel,
        outputDir: path.join(this.storagePath, "models"),
      });
    }

    this.state = this.loadState() || this.createInitialState();
  }

  /** Creates the storage directory (and parents) when it does not exist. */
  private ensureStoragePath(): void {
    if (!fs.existsSync(this.storagePath)) {
      fs.mkdirSync(this.storagePath, { recursive: true });
    }
  }

  /** Builds a zeroed state with status "running" and the current start time. */
  private createInitialState(): EvolutionState {
    return {
      iteration: 0,
      currentScore: 0,
      bestScore: 0,
      bestPrompt: "",
      history: [],
      totalTokensUsed: 0,
      totalCostUsed: 0,
      totalTimeUsed: 0,
      startTime: new Date(),
      status: "running",
    };
  }

  /**
   * Reads `state.json` from the storage directory.
   *
   * Best effort: a missing, unreadable, malformed or wrongly shaped file
   * yields `null`, and the caller starts from a fresh state.
   *
   * @returns The saved state with its date fields restored to `Date`
   *   objects, or `null`.
   */
  private loadState(): EvolutionState | null {
    const statePath = path.join(this.storagePath, "state.json");
    try {
      if (!fs.existsSync(statePath)) {
        return null;
      }

      const parsed: unknown = JSON.parse(fs.readFileSync(statePath, "utf-8"));

      // Reject JSON that is valid but is not a state object (e.g. `5`, `[]`).
      if (typeof parsed !== "object" || parsed === null) {
        return null;
      }
      const candidate = parsed as Partial<EvolutionState>;
      if (
        typeof candidate.iteration !== "number" ||
        !Array.isArray(candidate.history)
      ) {
        return null;
      }

      // JSON stores dates as ISO strings; turn them back into Date objects.
      const state = candidate as EvolutionState;
      state.startTime = new Date(state.startTime);
      if (state.endTime != null) {
        state.endTime = new Date(state.endTime);
      }
      for (const entry of state.history) {
        entry.timestamp = new Date(entry.timestamp);
      }
      return state;
    } catch {
      // Deliberately ignored: an unusable state file means "start over".
    }
    return null;
  }

  /** Writes the current state to `state.json` as pretty-printed JSON. */
  private saveState(): void {
    const statePath = path.join(this.storagePath, "state.json");
    fs.writeFileSync(statePath, JSON.stringify(this.state, null, 2));
  }

  /**
   * Runs evaluation iterations until the target score is reached, the
   * iteration cap is hit, a budget limit is exceeded, or an error occurs.
   *
   * Falsy `targetScore` / `maxIterations` (including 0) fall back to 0.9 / 10.
   * Errors inside an iteration are reported through `onError` and end the
   * loop with status "failed"; they are not rethrown.
   *
   * @param agent Agent to evaluate and improve.
   * @returns The final state, which has also been saved to disk.
   */
  async evolve(agent: Agent): Promise<EvolutionState> {
    const evalCases = this.config.evalCases || DEFAULT_EVAL_CASES;
    const targetScore = this.config.targetScore || 0.9;
    const maxIterations = this.config.maxIterations || 10;

    this.state.status = "running";
    this.state.startTime = new Date();

    // NOTE(review): when the loop ends because the iteration cap is reached,
    // status is left as "running"; no dedicated terminal status is visible here.
    while (
      this.state.iteration < maxIterations &&
      this.state.status === "running"
    ) {
      if (!this.checkBudget()) {
        this.state.status = "budget_exhausted";
        break;
      }

      this.state.iteration++;
      this.events.onIterationStart?.(this.state.iteration);

      try {
        const evalSummary = await this.evalRunner.runEvalSuite(agent, evalCases);
        this.events.onEvalComplete?.(evalSummary);

        this.state.currentScore = evalSummary.averageScore;
        this.state.totalTokensUsed += evalSummary.totalTokens;
        // totalDuration is in milliseconds; the state tracks seconds.
        this.state.totalTimeUsed += evalSummary.totalDuration / 1000;

        // Record the best score before the target check, so the iteration
        // that reaches the target is also counted as the best one.
        if (this.state.currentScore > this.state.bestScore) {
          this.state.bestScore = this.state.currentScore;
          // bestPrompt is not updated here; prompt capture is not implemented.
        }

        if (this.state.currentScore >= targetScore) {
          this.state.status = "completed";
          this.events.onIterationEnd?.(this.state.iteration, this.state.currentScore);
          break;
        }

        await this.attemptImprovement(agent, evalSummary);

        this.state.history.push({
          iteration: this.state.iteration,
          prompt: "", // Would store the current prompt
          evalScore: this.state.currentScore,
          changes: "Prompt/model adjustment",
          timestamp: new Date(),
        });

        this.events.onIterationEnd?.(this.state.iteration, this.state.currentScore);
        this.saveState();
      } catch (error) {
        this.events.onError?.(
          error instanceof Error ? error : new Error(String(error))
        );
        this.state.status = "failed";
        break;
      }
    }

    this.state.endTime = new Date();
    this.saveState();
    this.events.onComplete?.(this.state);

    return this.state;
  }

  /**
   * Compares the running totals in the state against the configured limits.
   *
   * A limit that is missing or 0 is treated as "no limit".
   * NOTE(review): `totalCostUsed` is never updated in this class, so the cost
   * limit cannot trigger from here.
   *
   * @returns `false` (after emitting `onBudgetWarning`) when a limit is hit.
   */
  private checkBudget(): boolean {
    const budget = this.config.budgetPerIteration;
    if (!budget) return true;

    if (budget.maxTokens && this.state.totalTokensUsed >= budget.maxTokens) {
      this.events.onBudgetWarning?.("Token budget exhausted");
      return false;
    }

    if (budget.maxCost && this.state.totalCostUsed >= budget.maxCost) {
      this.events.onBudgetWarning?.("Cost budget exhausted");
      return false;
    }

    if (budget.maxTime && this.state.totalTimeUsed >= budget.maxTime) {
      this.events.onBudgetWarning?.("Time budget exhausted");
      return false;
    }

    return true;
  }

  /**
   * Tries to improve the agent after an evaluation run.
   *
   * Fine-tunes when a trainer is configured and at least 50 training
   * examples are available; training errors go to `onError` and are not
   * rethrown. The agent itself is not modified by the visible code.
   *
   * @param agent Agent being evolved (currently unused).
   * @param evalSummary Results of the evaluation run just completed.
   */
  private async attemptImprovement(
    agent: Agent,
    evalSummary: EvalRunSummary
  ): Promise<void> {
    const trainingData = this.feedbackCollector.toTrainingData();

    if (trainingData.length >= 50 && this.modelTrainer) {
      try {
        const result = await this.modelTrainer.train(trainingData);
        this.events.onTrainingComplete?.(result);

        if (result.success && result.modelId) {
          // Would update agent to use the new model
          console.log(`New model trained: ${result.modelId}`);
        }
      } catch (error) {
        this.events.onError?.(
          error instanceof Error ? error : new Error(String(error))
        );
      }
    }

    const failedCases = evalSummary.results.filter((r) => !r.passed);
    if (failedCases.length > 0) {
      // Could use the LLM to analyze failures and suggest prompt improvements
      console.log(
        `Analyzing ${failedCases.length} failed cases for improvement...`
      );
    }
  }

  /**
   * Records one user/agent exchange with the feedback collector.
   *
   * @returns The value returned by the collector (used as the feedback id
   *   by `rateFeedback`).
   */
  recordInteraction(params: {
    sessionId: string;
    userMessage: string;
    agentResponse: string;
    toolCalls: string[];
    wasSuccessful: boolean;
    metadata?: Record<string, unknown>;
  }): string {
    return this.feedbackCollector.recordInteraction(params);
  }

  /** Attaches a user rating, and optional corrections, to recorded feedback. */
  rateFeedback(feedbackId: string, rating: number, corrections?: string): void {
    this.feedbackCollector.addRating(feedbackId, rating, corrections);
  }

  /** Returns the live state object together with the collector's statistics. */
  getStats(): {
    state: EvolutionState;
    feedbackStats: ReturnType<FeedbackCollector["getStats"]>;
  } {
    return {
      state: this.state,
      feedbackStats: this.feedbackCollector.getStats(),
    };
  }

  /** Replaces the state with a fresh one and saves it. */
  reset(): void {
    this.state = this.createInitialState();
    this.saveState();
  }

  /**
   * Exports collected feedback for external training.
   *
   * @param format "openai" for the OpenAI format, otherwise JSONL.
   * @param outputPath Destination passed through to the collector.
   */
  exportFeedback(format: "jsonl" | "openai", outputPath: string): void {
    if (format === "openai") {
      this.feedbackCollector.exportAsOpenAIFormat(outputPath);
    } else {
      this.feedbackCollector.exportAsJsonl(outputPath);
    }
  }

  /** Runs the configured (or default) eval cases once without touching state. */
  async runEvaluation(agent: Agent): Promise<EvalRunSummary> {
    const evalCases = this.config.evalCases || DEFAULT_EVAL_CASES;
    return this.evalRunner.runEvalSuite(agent, evalCases);
  }

  /** Renders the engine state and feedback statistics as plain text. */
  formatResults(): string {
    let output = "\nEvolution Engine Status\n";
    output += "=".repeat(50) + "\n\n";
    output += `Status: ${this.state.status}\n`;
    output += `Iteration: ${this.state.iteration}\n`;
    output += `Current Score: ${(this.state.currentScore * 100).toFixed(1)}%\n`;
    output += `Best Score: ${(this.state.bestScore * 100).toFixed(1)}%\n`;
    output += `Tokens Used: ${this.state.totalTokensUsed.toLocaleString()}\n`;
    output += `Time Used: ${this.state.totalTimeUsed.toFixed(1)}s\n\n`;

    const feedbackStats = this.feedbackCollector.getStats();
    output += "Feedback Statistics:\n";
    output += ` Total: ${feedbackStats.total}\n`;
    output += ` Rated: ${feedbackStats.rated}\n`;
    output += ` Avg Rating: ${feedbackStats.avgRating}/5\n`;
    output += ` Success Rate: ${feedbackStats.successRate}%\n`;

    return output;
  }
}
|
|
319
|
-
|