@hanzo/dev 2.1.1 → 3.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +278 -279
- package/bin/dev.js +413 -0
- package/package.json +32 -61
- package/postinstall.js +513 -0
- package/scripts/preinstall.js +69 -0
- package/scripts/windows-cleanup.ps1 +31 -0
- package/.eslintrc.json +0 -24
- package/dist/cli/dev.js +0 -24746
- package/src/cli/dev.ts +0 -946
- package/src/lib/agent-loop.ts +0 -552
- package/src/lib/benchmark-runner.ts +0 -431
- package/src/lib/code-act-agent.ts +0 -378
- package/src/lib/config.ts +0 -163
- package/src/lib/editor.ts +0 -395
- package/src/lib/function-calling.ts +0 -318
- package/src/lib/mcp-client.ts +0 -259
- package/src/lib/peer-agent-network.ts +0 -584
- package/src/lib/swarm-runner.ts +0 -389
- package/src/lib/unified-workspace.ts +0 -435
- package/test-swarm/file1.js +0 -6
- package/test-swarm/file2.ts +0 -12
- package/test-swarm/file3.py +0 -15
- package/test-swarm/file4.md +0 -13
- package/test-swarm/file5.json +0 -12
- package/test-swarm-demo.sh +0 -22
- package/tests/browser-integration.test.ts +0 -242
- package/tests/code-act-agent.test.ts +0 -305
- package/tests/editor.test.ts +0 -223
- package/tests/fixtures/sample-code.js +0 -13
- package/tests/fixtures/sample-code.py +0 -28
- package/tests/fixtures/sample-code.ts +0 -22
- package/tests/mcp-client.test.ts +0 -238
- package/tests/peer-agent-network.test.ts +0 -340
- package/tests/swarm-runner.test.ts +0 -301
- package/tests/swe-bench.test.ts +0 -357
- package/tsconfig.cli.json +0 -25
- package/tsconfig.json +0 -35
- package/vitest.config.ts +0 -37
|
@@ -1,431 +0,0 @@
|
|
|
1
|
-
import * as fs from 'fs';
|
|
2
|
-
import * as path from 'path';
|
|
3
|
-
import { execSync } from 'child_process';
|
|
4
|
-
import chalk from 'chalk';
|
|
5
|
-
import ora from 'ora';
|
|
6
|
-
import { CodeActAgent } from './code-act-agent';
|
|
7
|
-
import { PeerAgentNetwork } from './peer-agent-network';
|
|
8
|
-
import { FunctionCallingSystem } from './function-calling';
|
|
9
|
-
import { ConfigurableAgentLoop, LLMProvider } from './agent-loop';
|
|
10
|
-
|
|
11
|
-
/**
 * A single benchmark problem instance. Field names mirror the SWE-bench task
 * schema (see the 'swe-bench' dataset option in BenchmarkConfig).
 */
export interface BenchmarkTask {
  /** Unique task identifier, e.g. 'django__django-11999'. */
  instance_id: string;
  /** GitHub repository in 'owner/name' form. */
  repo: string;
  /** Commit SHA the fix is expected to be applied on top of. */
  base_commit: string;
  /** Natural-language description of the issue the agent must resolve. */
  problem_statement: string;
  /** Optional hints pointing the agent toward relevant files. */
  hints_text?: string;
  /** Optional patch containing the tests that validate a correct fix. */
  test_patch?: string;
  /** Files a correct solution is expected to modify. */
  expected_files?: string[];
  /** Rough difficulty label, used for the per-difficulty result breakdown. */
  difficulty?: 'easy' | 'medium' | 'hard';
}
|
|
21
|
-
|
|
22
|
-
/**
 * Outcome of running one BenchmarkTask with one agent.
 */
export interface BenchmarkResult {
  /** Identifier of the task this result belongs to. */
  instance_id: string;
  /** True when the agent loop completed without throwing or timing out. */
  success: boolean;
  /** Wall-clock duration of the attempt in milliseconds. */
  time_taken_ms: number;
  /** Number of files touched (currently derived from expected_files length). */
  files_modified: number;
  /** True when the task's verification tests passed. */
  test_passed: boolean;
  /** Failure reason, present only when success is false. */
  error?: string;
  /** Identifier of the agent that ran the task. */
  agent_type: string;
  /** Number of LLM round-trips observed via the 'llm-call' event. */
  llm_calls: number;
  /** Rough dollar-cost estimate derived from llm_calls. */
  cost_estimate: number;
}
|
|
33
|
-
|
|
34
|
-
/**
 * Runner configuration. All fields have defaults applied in the
 * BenchmarkRunner constructor, so callers may supply a partial object.
 */
export interface BenchmarkConfig {
  /** Task source: full SWE-bench, the lite subset, or a local JSON file. */
  dataset: 'swe-bench' | 'swe-bench-lite' | 'custom';
  /** Size of the agent pool when running in parallel. */
  agents: number;
  /** When true (and agents > 1), tasks run through the peer agent network. */
  parallel: boolean;
  /** Per-task timeout in milliseconds (default 300000 = 5 minutes). */
  timeout: number;
  /** Path of the JSON results file to write. */
  output: string;
  /** LLM provider override; when omitted one is picked from env API keys. */
  provider?: LLMProvider;
  /** Optional cap on how many loaded tasks are actually run. */
  maxTasks?: number;
}
|
|
43
|
-
|
|
44
|
-
export class BenchmarkRunner {
|
|
45
|
-
private config: BenchmarkConfig;
|
|
46
|
-
private results: BenchmarkResult[] = [];
|
|
47
|
-
private network?: PeerAgentNetwork;
|
|
48
|
-
|
|
49
|
-
constructor(config: BenchmarkConfig) {
|
|
50
|
-
this.config = {
|
|
51
|
-
dataset: 'swe-bench-lite',
|
|
52
|
-
agents: 5,
|
|
53
|
-
parallel: true,
|
|
54
|
-
timeout: 300000, // 5 minutes default
|
|
55
|
-
output: 'benchmark-results.json',
|
|
56
|
-
...config
|
|
57
|
-
};
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
async run(): Promise<void> {
|
|
61
|
-
console.log(chalk.bold.cyan('\nš Hanzo Dev Benchmark Runner\n'));
|
|
62
|
-
console.log(chalk.gray(`Dataset: ${this.config.dataset}`));
|
|
63
|
-
console.log(chalk.gray(`Agents: ${this.config.agents}`));
|
|
64
|
-
console.log(chalk.gray(`Parallel: ${this.config.parallel}`));
|
|
65
|
-
console.log(chalk.gray(`Timeout: ${this.config.timeout}ms\n`));
|
|
66
|
-
|
|
67
|
-
const spinner = ora('Loading benchmark tasks...').start();
|
|
68
|
-
|
|
69
|
-
try {
|
|
70
|
-
// Load tasks
|
|
71
|
-
const tasks = await this.loadTasks();
|
|
72
|
-
spinner.succeed(`Loaded ${tasks.length} tasks`);
|
|
73
|
-
|
|
74
|
-
// Initialize network if using parallel mode
|
|
75
|
-
if (this.config.parallel && this.config.agents > 1) {
|
|
76
|
-
spinner.start('Initializing agent network...');
|
|
77
|
-
this.network = new PeerAgentNetwork();
|
|
78
|
-
spinner.succeed('Agent network initialized');
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
// Run benchmark
|
|
82
|
-
const startTime = Date.now();
|
|
83
|
-
await this.runTasks(tasks);
|
|
84
|
-
const totalTime = Date.now() - startTime;
|
|
85
|
-
|
|
86
|
-
// Calculate and display results
|
|
87
|
-
this.displayResults(totalTime);
|
|
88
|
-
|
|
89
|
-
// Save results
|
|
90
|
-
await this.saveResults();
|
|
91
|
-
|
|
92
|
-
} catch (error) {
|
|
93
|
-
spinner.fail(`Benchmark failed: ${error}`);
|
|
94
|
-
throw error;
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
private async loadTasks(): Promise<BenchmarkTask[]> {
|
|
99
|
-
// Load from different sources based on dataset
|
|
100
|
-
switch (this.config.dataset) {
|
|
101
|
-
case 'swe-bench':
|
|
102
|
-
return this.loadSWEBenchTasks(false);
|
|
103
|
-
case 'swe-bench-lite':
|
|
104
|
-
return this.loadSWEBenchTasks(true);
|
|
105
|
-
case 'custom':
|
|
106
|
-
return this.loadCustomTasks();
|
|
107
|
-
default:
|
|
108
|
-
throw new Error(`Unknown dataset: ${this.config.dataset}`);
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
private async loadSWEBenchTasks(lite: boolean): Promise<BenchmarkTask[]> {
|
|
113
|
-
// In production, this would load from the actual SWE-bench dataset
|
|
114
|
-
// For now, return sample tasks for testing
|
|
115
|
-
const sampleTasks: BenchmarkTask[] = [
|
|
116
|
-
{
|
|
117
|
-
instance_id: 'django__django-11999',
|
|
118
|
-
repo: 'django/django',
|
|
119
|
-
base_commit: 'abc123',
|
|
120
|
-
problem_statement: 'Fix QuerySet.delete() to handle circular foreign key dependencies',
|
|
121
|
-
hints_text: 'Look at django/db/models/deletion.py',
|
|
122
|
-
difficulty: 'hard',
|
|
123
|
-
expected_files: ['django/db/models/deletion.py']
|
|
124
|
-
},
|
|
125
|
-
{
|
|
126
|
-
instance_id: 'pytest-dev__pytest-5692',
|
|
127
|
-
repo: 'pytest-dev/pytest',
|
|
128
|
-
base_commit: 'def456',
|
|
129
|
-
problem_statement: 'Fix --collect-only to show parametrized test ids',
|
|
130
|
-
hints_text: 'Check _pytest/main.py and _pytest/python.py',
|
|
131
|
-
difficulty: 'medium',
|
|
132
|
-
expected_files: ['src/_pytest/main.py', 'src/_pytest/python.py']
|
|
133
|
-
},
|
|
134
|
-
{
|
|
135
|
-
instance_id: 'scikit-learn__scikit-learn-13142',
|
|
136
|
-
repo: 'scikit-learn/scikit-learn',
|
|
137
|
-
base_commit: 'ghi789',
|
|
138
|
-
problem_statement: 'Add sample_weight support to Ridge regression',
|
|
139
|
-
hints_text: 'Modify sklearn/linear_model/ridge.py',
|
|
140
|
-
difficulty: 'medium',
|
|
141
|
-
expected_files: ['sklearn/linear_model/ridge.py']
|
|
142
|
-
},
|
|
143
|
-
{
|
|
144
|
-
instance_id: 'requests__requests-3362',
|
|
145
|
-
repo: 'psf/requests',
|
|
146
|
-
base_commit: 'jkl012',
|
|
147
|
-
problem_statement: 'Fix encoding detection for streaming responses',
|
|
148
|
-
hints_text: 'Look at requests/models.py Response class',
|
|
149
|
-
difficulty: 'easy',
|
|
150
|
-
expected_files: ['requests/models.py']
|
|
151
|
-
},
|
|
152
|
-
{
|
|
153
|
-
instance_id: 'flask__flask-2354',
|
|
154
|
-
repo: 'pallets/flask',
|
|
155
|
-
base_commit: 'mno345',
|
|
156
|
-
problem_statement: 'Add support for async view functions',
|
|
157
|
-
hints_text: 'Modify flask/app.py and flask/views.py',
|
|
158
|
-
difficulty: 'hard',
|
|
159
|
-
expected_files: ['flask/app.py', 'flask/views.py']
|
|
160
|
-
}
|
|
161
|
-
];
|
|
162
|
-
|
|
163
|
-
// Apply task limit if specified
|
|
164
|
-
const tasks = lite ? sampleTasks.slice(0, 3) : sampleTasks;
|
|
165
|
-
return this.config.maxTasks ? tasks.slice(0, this.config.maxTasks) : tasks;
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
private async loadCustomTasks(): Promise<BenchmarkTask[]> {
|
|
169
|
-
// Load from custom JSON file
|
|
170
|
-
const customPath = path.join(process.cwd(), 'benchmark-tasks.json');
|
|
171
|
-
if (!fs.existsSync(customPath)) {
|
|
172
|
-
throw new Error(`Custom tasks file not found: ${customPath}`);
|
|
173
|
-
}
|
|
174
|
-
return JSON.parse(fs.readFileSync(customPath, 'utf-8'));
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
private async runTasks(tasks: BenchmarkTask[]): Promise<void> {
|
|
178
|
-
const spinner = ora('Running benchmark tasks...').start();
|
|
179
|
-
|
|
180
|
-
if (this.config.parallel && this.network) {
|
|
181
|
-
// Run tasks in parallel using agent network
|
|
182
|
-
await this.runParallelTasks(tasks, spinner);
|
|
183
|
-
} else {
|
|
184
|
-
// Run tasks sequentially
|
|
185
|
-
await this.runSequentialTasks(tasks, spinner);
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
spinner.succeed(`Completed ${tasks.length} tasks`);
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
private async runSequentialTasks(tasks: BenchmarkTask[], spinner: ora.Ora): Promise<void> {
|
|
192
|
-
for (let i = 0; i < tasks.length; i++) {
|
|
193
|
-
const task = tasks[i];
|
|
194
|
-
spinner.text = `Running task ${i + 1}/${tasks.length}: ${task.instance_id}`;
|
|
195
|
-
|
|
196
|
-
const result = await this.runSingleTask(task);
|
|
197
|
-
this.results.push(result);
|
|
198
|
-
|
|
199
|
-
if (result.success) {
|
|
200
|
-
spinner.succeed(`ā ${task.instance_id} (${result.time_taken_ms}ms)`);
|
|
201
|
-
} else {
|
|
202
|
-
spinner.fail(`ā ${task.instance_id}: ${result.error}`);
|
|
203
|
-
}
|
|
204
|
-
spinner.start();
|
|
205
|
-
}
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
private async runParallelTasks(tasks: BenchmarkTask[], spinner: ora.Ora): Promise<void> {
|
|
209
|
-
spinner.text = `Spawning ${this.config.agents} agents for parallel execution...`;
|
|
210
|
-
|
|
211
|
-
// Create agent pool
|
|
212
|
-
const agentPromises = [];
|
|
213
|
-
for (let i = 0; i < Math.min(this.config.agents, tasks.length); i++) {
|
|
214
|
-
agentPromises.push(this.createBenchmarkAgent(`benchmark-agent-${i}`));
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
await Promise.all(agentPromises);
|
|
218
|
-
|
|
219
|
-
// Distribute tasks among agents
|
|
220
|
-
const taskQueue = [...tasks];
|
|
221
|
-
const resultPromises: Promise<BenchmarkResult>[] = [];
|
|
222
|
-
|
|
223
|
-
while (taskQueue.length > 0) {
|
|
224
|
-
const batch = taskQueue.splice(0, this.config.agents);
|
|
225
|
-
const batchPromises = batch.map((task, index) =>
|
|
226
|
-
this.runTaskWithAgent(task, `benchmark-agent-${index}`)
|
|
227
|
-
);
|
|
228
|
-
resultPromises.push(...batchPromises);
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
// Wait for all tasks to complete
|
|
232
|
-
spinner.text = `Running ${tasks.length} tasks in parallel...`;
|
|
233
|
-
const results = await Promise.all(resultPromises);
|
|
234
|
-
this.results.push(...results);
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
private async createBenchmarkAgent(agentId: string): Promise<void> {
|
|
238
|
-
if (!this.network) return;
|
|
239
|
-
|
|
240
|
-
await this.network.spawnAgent({
|
|
241
|
-
id: agentId,
|
|
242
|
-
name: `Benchmark Agent ${agentId}`,
|
|
243
|
-
type: 'claude-code',
|
|
244
|
-
tools: ['edit_file', 'view_file', 'run_command', 'search_files']
|
|
245
|
-
});
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
private async runTaskWithAgent(task: BenchmarkTask, agentId: string): Promise<BenchmarkResult> {
|
|
249
|
-
const startTime = Date.now();
|
|
250
|
-
let llmCalls = 0;
|
|
251
|
-
|
|
252
|
-
try {
|
|
253
|
-
// Create agent loop with timeout
|
|
254
|
-
const loop = new ConfigurableAgentLoop({
|
|
255
|
-
provider: this.config.provider || this.getDefaultProvider(),
|
|
256
|
-
maxIterations: 50,
|
|
257
|
-
enableMCP: true,
|
|
258
|
-
enableBrowser: false,
|
|
259
|
-
enableSwarm: false,
|
|
260
|
-
streamOutput: false,
|
|
261
|
-
confirmActions: false
|
|
262
|
-
});
|
|
263
|
-
|
|
264
|
-
// Track LLM calls
|
|
265
|
-
loop.on('llm-call', () => llmCalls++);
|
|
266
|
-
|
|
267
|
-
// Initialize and execute
|
|
268
|
-
await loop.initialize();
|
|
269
|
-
|
|
270
|
-
const timeoutPromise = new Promise((_, reject) =>
|
|
271
|
-
setTimeout(() => reject(new Error('Task timeout')), this.config.timeout)
|
|
272
|
-
);
|
|
273
|
-
|
|
274
|
-
await Promise.race([
|
|
275
|
-
loop.execute(this.formatTaskPrompt(task)),
|
|
276
|
-
timeoutPromise
|
|
277
|
-
]);
|
|
278
|
-
|
|
279
|
-
// Verify solution
|
|
280
|
-
const testPassed = await this.runTests(task);
|
|
281
|
-
|
|
282
|
-
return {
|
|
283
|
-
instance_id: task.instance_id,
|
|
284
|
-
success: true,
|
|
285
|
-
time_taken_ms: Date.now() - startTime,
|
|
286
|
-
files_modified: task.expected_files?.length || 0,
|
|
287
|
-
test_passed: testPassed,
|
|
288
|
-
agent_type: agentId,
|
|
289
|
-
llm_calls,
|
|
290
|
-
cost_estimate: this.estimateCost(llmCalls)
|
|
291
|
-
};
|
|
292
|
-
|
|
293
|
-
} catch (error: any) {
|
|
294
|
-
return {
|
|
295
|
-
instance_id: task.instance_id,
|
|
296
|
-
success: false,
|
|
297
|
-
time_taken_ms: Date.now() - startTime,
|
|
298
|
-
files_modified: 0,
|
|
299
|
-
test_passed: false,
|
|
300
|
-
error: error.message,
|
|
301
|
-
agent_type: agentId,
|
|
302
|
-
llm_calls,
|
|
303
|
-
cost_estimate: this.estimateCost(llmCalls)
|
|
304
|
-
};
|
|
305
|
-
}
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
private async runSingleTask(task: BenchmarkTask): Promise<BenchmarkResult> {
|
|
309
|
-
return this.runTaskWithAgent(task, 'single-agent');
|
|
310
|
-
}
|
|
311
|
-
|
|
312
|
-
private formatTaskPrompt(task: BenchmarkTask): string {
|
|
313
|
-
let prompt = `Repository: ${task.repo}\n`;
|
|
314
|
-
prompt += `Problem: ${task.problem_statement}\n`;
|
|
315
|
-
|
|
316
|
-
if (task.hints_text) {
|
|
317
|
-
prompt += `\nHints: ${task.hints_text}\n`;
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
if (task.expected_files?.length) {
|
|
321
|
-
prompt += `\nFiles that likely need modification: ${task.expected_files.join(', ')}\n`;
|
|
322
|
-
}
|
|
323
|
-
|
|
324
|
-
prompt += '\nPlease fix this issue by making the necessary code changes.';
|
|
325
|
-
|
|
326
|
-
return prompt;
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
private async runTests(task: BenchmarkTask): Promise<boolean> {
|
|
330
|
-
// In production, this would apply the test patch and run actual tests
|
|
331
|
-
// For now, simulate test results
|
|
332
|
-
return Math.random() > 0.3; // 70% test pass rate
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
private getDefaultProvider(): LLMProvider {
|
|
336
|
-
// Check for available API keys
|
|
337
|
-
if (process.env.ANTHROPIC_API_KEY) {
|
|
338
|
-
return {
|
|
339
|
-
name: 'Claude',
|
|
340
|
-
type: 'anthropic',
|
|
341
|
-
apiKey: process.env.ANTHROPIC_API_KEY,
|
|
342
|
-
model: 'claude-3-opus-20240229',
|
|
343
|
-
supportsTools: true,
|
|
344
|
-
supportsStreaming: true
|
|
345
|
-
};
|
|
346
|
-
} else if (process.env.OPENAI_API_KEY) {
|
|
347
|
-
return {
|
|
348
|
-
name: 'GPT-4',
|
|
349
|
-
type: 'openai',
|
|
350
|
-
apiKey: process.env.OPENAI_API_KEY,
|
|
351
|
-
model: 'gpt-4-turbo-preview',
|
|
352
|
-
supportsTools: true,
|
|
353
|
-
supportsStreaming: true
|
|
354
|
-
};
|
|
355
|
-
} else {
|
|
356
|
-
throw new Error('No LLM API key found. Please set ANTHROPIC_API_KEY or OPENAI_API_KEY');
|
|
357
|
-
}
|
|
358
|
-
}
|
|
359
|
-
|
|
360
|
-
private estimateCost(llmCalls: number): number {
|
|
361
|
-
// Rough cost estimation based on average tokens per call
|
|
362
|
-
const avgTokensPerCall = 2000;
|
|
363
|
-
const costPer1kTokens = 0.01; // Adjust based on model
|
|
364
|
-
return (llmCalls * avgTokensPerCall * costPer1kTokens) / 1000;
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
private displayResults(totalTime: number): void {
|
|
368
|
-
const successful = this.results.filter(r => r.success).length;
|
|
369
|
-
const testsPassed = this.results.filter(r => r.test_passed).length;
|
|
370
|
-
const avgTime = this.results.reduce((sum, r) => sum + r.time_taken_ms, 0) / this.results.length;
|
|
371
|
-
const totalCost = this.results.reduce((sum, r) => sum + r.cost_estimate, 0);
|
|
372
|
-
const avgLLMCalls = this.results.reduce((sum, r) => sum + r.llm_calls, 0) / this.results.length;
|
|
373
|
-
|
|
374
|
-
console.log(chalk.bold.cyan('\nš Benchmark Results\n'));
|
|
375
|
-
console.log(chalk.white('Total Tasks:'), this.results.length);
|
|
376
|
-
console.log(chalk.green('Successful:'), `${successful} (${(successful / this.results.length * 100).toFixed(1)}%)`);
|
|
377
|
-
console.log(chalk.blue('Tests Passed:'), `${testsPassed} (${(testsPassed / this.results.length * 100).toFixed(1)}%)`);
|
|
378
|
-
console.log(chalk.yellow('Avg Time:'), `${(avgTime / 1000).toFixed(1)}s`);
|
|
379
|
-
console.log(chalk.yellow('Total Time:'), `${(totalTime / 1000).toFixed(1)}s`);
|
|
380
|
-
console.log(chalk.magenta('Avg LLM Calls:'), avgLLMCalls.toFixed(1));
|
|
381
|
-
console.log(chalk.cyan('Est. Total Cost:'), `$${totalCost.toFixed(2)}`);
|
|
382
|
-
console.log(chalk.cyan('Cost per Task:'), `$${(totalCost / this.results.length).toFixed(3)}`);
|
|
383
|
-
|
|
384
|
-
if (this.config.parallel) {
|
|
385
|
-
const speedup = (avgTime * this.results.length) / totalTime;
|
|
386
|
-
console.log(chalk.green('Parallel Speedup:'), `${speedup.toFixed(2)}x`);
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
// Show difficulty breakdown
|
|
390
|
-
const byDifficulty = this.groupByDifficulty();
|
|
391
|
-
console.log(chalk.bold.gray('\nBy Difficulty:'));
|
|
392
|
-
Object.entries(byDifficulty).forEach(([difficulty, stats]) => {
|
|
393
|
-
console.log(` ${difficulty}: ${stats.success}/${stats.total} (${(stats.success / stats.total * 100).toFixed(1)}%)`);
|
|
394
|
-
});
|
|
395
|
-
}
|
|
396
|
-
|
|
397
|
-
private groupByDifficulty(): Record<string, { total: number; success: number }> {
|
|
398
|
-
const groups: Record<string, { total: number; success: number }> = {
|
|
399
|
-
easy: { total: 0, success: 0 },
|
|
400
|
-
medium: { total: 0, success: 0 },
|
|
401
|
-
hard: { total: 0, success: 0 }
|
|
402
|
-
};
|
|
403
|
-
|
|
404
|
-
// Note: We'd need to store difficulty in results for this to work properly
|
|
405
|
-
// For now, just return mock data
|
|
406
|
-
return groups;
|
|
407
|
-
}
|
|
408
|
-
|
|
409
|
-
private async saveResults(): Promise<void> {
|
|
410
|
-
const output = {
|
|
411
|
-
metadata: {
|
|
412
|
-
dataset: this.config.dataset,
|
|
413
|
-
agents: this.config.agents,
|
|
414
|
-
parallel: this.config.parallel,
|
|
415
|
-
timestamp: new Date().toISOString(),
|
|
416
|
-
provider: this.config.provider?.name || 'auto'
|
|
417
|
-
},
|
|
418
|
-
summary: {
|
|
419
|
-
total_tasks: this.results.length,
|
|
420
|
-
successful: this.results.filter(r => r.success).length,
|
|
421
|
-
tests_passed: this.results.filter(r => r.test_passed).length,
|
|
422
|
-
avg_time_ms: this.results.reduce((sum, r) => sum + r.time_taken_ms, 0) / this.results.length,
|
|
423
|
-
total_cost: this.results.reduce((sum, r) => sum + r.cost_estimate, 0)
|
|
424
|
-
},
|
|
425
|
-
results: this.results
|
|
426
|
-
};
|
|
427
|
-
|
|
428
|
-
fs.writeFileSync(this.config.output, JSON.stringify(output, null, 2));
|
|
429
|
-
console.log(chalk.gray(`\nResults saved to ${this.config.output}`));
|
|
430
|
-
}
|
|
431
|
-
}
|