agentic-flow 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +87 -12
- package/dist/agents/claudeAgent.js +96 -67
- package/dist/cli/claude-code-wrapper.js +23 -1
- package/dist/cli-proxy.js +71 -5
- package/dist/mcp/standalone-stdio.js +251 -2
- package/dist/proxy/anthropic-to-requesty.js +707 -0
- package/dist/utils/cli.js +6 -0
- package/dist/utils/modelCapabilities.js +22 -0
- package/docs/plans/agent-booster/00-INDEX.md +230 -0
- package/docs/plans/agent-booster/00-OVERVIEW.md +454 -0
- package/docs/plans/agent-booster/01-ARCHITECTURE.md +699 -0
- package/docs/plans/agent-booster/02-INTEGRATION.md +771 -0
- package/docs/plans/agent-booster/03-BENCHMARKS.md +616 -0
- package/docs/plans/agent-booster/04-NPM-SDK.md +673 -0
- package/docs/plans/agent-booster/GITHUB-ISSUE.md +523 -0
- package/docs/plans/agent-booster/README.md +576 -0
- package/docs/plans/requesty/00-overview.md +176 -0
- package/docs/plans/requesty/01-api-research.md +573 -0
- package/docs/plans/requesty/02-architecture.md +1076 -0
- package/docs/plans/requesty/03-implementation-phases.md +1129 -0
- package/docs/plans/requesty/04-testing-strategy.md +905 -0
- package/docs/plans/requesty/05-migration-guide.md +576 -0
- package/docs/plans/requesty/README.md +290 -0
- package/package.json +1 -1
|
@@ -0,0 +1,616 @@
|
|
|
1
|
+
# Agent Booster: Benchmark Methodology
|
|
2
|
+
|
|
3
|
+
## 🎯 Benchmark Goals
|
|
4
|
+
|
|
5
|
+
1. **Establish baseline** - Measure Morph LLM performance with Anthropic models
|
|
6
|
+
2. **Measure speedup** - Quantify Agent Booster performance improvements
|
|
7
|
+
3. **Validate accuracy** - Ensure quality is maintained or improved
|
|
8
|
+
4. **Calculate savings** - Demonstrate cost reduction
|
|
9
|
+
5. **Identify limitations** - Understand where Agent Booster excels and where it struggles
|
|
10
|
+
|
|
11
|
+
## 📊 Benchmark Suite Structure
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
benchmarks/
|
|
15
|
+
├── datasets/ # Test code samples
|
|
16
|
+
│ ├── javascript/
|
|
17
|
+
│ │ ├── simple/ # 40 samples
|
|
18
|
+
│ │ ├── medium/ # 40 samples
|
|
19
|
+
│ │ └── complex/ # 20 samples
|
|
20
|
+
│ ├── typescript/
|
|
21
|
+
│ ├── python/
|
|
22
|
+
│ └── rust/
|
|
23
|
+
│
|
|
24
|
+
├── baselines/ # Morph LLM baselines
|
|
25
|
+
│ ├── morph-claude-sonnet-4.ts
|
|
26
|
+
│ ├── morph-claude-opus-4.ts
|
|
27
|
+
│ └── morph-claude-haiku-4.ts
|
|
28
|
+
│
|
|
29
|
+
├── agent-booster/ # Agent Booster tests
|
|
30
|
+
│ ├── native-addon.ts
|
|
31
|
+
│ ├── wasm.ts
|
|
32
|
+
│ └── typescript-fallback.ts
|
|
33
|
+
│
|
|
34
|
+
├── results/ # Benchmark outputs
|
|
35
|
+
│ ├── raw/ # Raw JSON results
|
|
36
|
+
│ ├── analysis/ # Processed results
|
|
37
|
+
│ └── reports/ # HTML/PDF reports
|
|
38
|
+
│
|
|
39
|
+
└── scripts/
|
|
40
|
+
├── run-all.sh # Run full suite
|
|
41
|
+
├── run-baseline.sh # Morph LLM only
|
|
42
|
+
├── run-agent-booster.sh # Agent Booster only
|
|
43
|
+
├── compare.ts # Generate comparison
|
|
44
|
+
└── visualize.ts # Create charts
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## 📝 Test Datasets
|
|
48
|
+
|
|
49
|
+
### Simple Edits (40 samples per language)
|
|
50
|
+
|
|
51
|
+
**Characteristics:**
|
|
52
|
+
- Single function/method modifications
|
|
53
|
+
- Clear, unambiguous edit descriptions
|
|
54
|
+
- < 50 lines of code
|
|
55
|
+
- Expected accuracy: 99%+
|
|
56
|
+
|
|
57
|
+
**Examples:**
|
|
58
|
+
|
|
59
|
+
1. **Add parameter**
|
|
60
|
+
```typescript
|
|
61
|
+
// Original
|
|
62
|
+
function greet(name: string) {
|
|
63
|
+
return `Hello, ${name}!`;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// Edit: "add optional greeting parameter with default 'Hello'"
|
|
67
|
+
|
|
68
|
+
// Expected
|
|
69
|
+
function greet(name: string, greeting: string = 'Hello') {
|
|
70
|
+
return `${greeting}, ${name}!`;
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
2. **Add error handling**
|
|
75
|
+
```typescript
|
|
76
|
+
// Original
|
|
77
|
+
function parseJSON(text: string) {
|
|
78
|
+
return JSON.parse(text);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
// Edit: "add try-catch error handling"
|
|
82
|
+
|
|
83
|
+
// Expected
|
|
84
|
+
function parseJSON(text: string) {
|
|
85
|
+
try {
|
|
86
|
+
return JSON.parse(text);
|
|
87
|
+
} catch (error) {
|
|
88
|
+
console.error('Failed to parse JSON:', error);
|
|
89
|
+
return null;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
3. **Rename variable**
|
|
95
|
+
```typescript
|
|
96
|
+
// Edit: "rename 'data' to 'userData'"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
4. **Add return type**
|
|
100
|
+
```typescript
|
|
101
|
+
// Edit: "add explicit return type annotation"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
5. **Add JSDoc comment**
|
|
105
|
+
```typescript
|
|
106
|
+
// Edit: "add JSDoc documentation"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Medium Edits (40 samples per language)
|
|
110
|
+
|
|
111
|
+
**Characteristics:**
|
|
112
|
+
- Multi-line function bodies
|
|
113
|
+
- Some ambiguity in edit description
|
|
114
|
+
- 50-200 lines of code
|
|
115
|
+
- Expected accuracy: 95%+
|
|
116
|
+
|
|
117
|
+
**Examples:**
|
|
118
|
+
|
|
119
|
+
1. **Convert to async/await**
|
|
120
|
+
```typescript
|
|
121
|
+
// Edit: "convert promises to async/await"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
2. **Add input validation**
|
|
125
|
+
```typescript
|
|
126
|
+
// Edit: "add parameter validation for email format"
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
3. **Extract helper function**
|
|
130
|
+
```typescript
|
|
131
|
+
// Edit: "extract password hashing logic into separate function"
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
4. **Add type safety**
|
|
135
|
+
```typescript
|
|
136
|
+
// Edit: "replace 'any' types with proper types"
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Complex Edits (20 samples per language)
|
|
140
|
+
|
|
141
|
+
**Characteristics:**
|
|
142
|
+
- Architectural changes
|
|
143
|
+
- Multiple functions affected
|
|
144
|
+
- 200+ lines of code
|
|
145
|
+
- Expected accuracy: 85%+
|
|
146
|
+
|
|
147
|
+
**Examples:**
|
|
148
|
+
|
|
149
|
+
1. **Refactor to design pattern**
|
|
150
|
+
```typescript
|
|
151
|
+
// Edit: "refactor to use Strategy pattern for authentication"
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
2. **Add dependency injection**
|
|
155
|
+
```typescript
|
|
156
|
+
// Edit: "convert to use dependency injection for database"
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
3. **Extract class**
|
|
160
|
+
```typescript
|
|
161
|
+
// Edit: "extract user validation into separate class"
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## ⚡ Baseline: Morph LLM Performance
|
|
165
|
+
|
|
166
|
+
### Test Configuration
|
|
167
|
+
|
|
168
|
+
```typescript
|
|
169
|
+
// benchmarks/baselines/morph-claude-sonnet-4.ts
|
|
170
|
+
|
|
171
|
+
import Anthropic from '@anthropic-ai/sdk';
|
|
172
|
+
|
|
173
|
+
const MORPH_API_KEY = process.env.MORPH_API_KEY;
|
|
174
|
+
const MORPH_BASE_URL = 'https://api.morphllm.com/v1';
|
|
175
|
+
|
|
176
|
+
interface MorphBenchmarkConfig {
|
|
177
|
+
model: 'claude-sonnet-4' | 'claude-opus-4' | 'claude-haiku-4';
|
|
178
|
+
morphModel: 'morph-v3-fast' | 'morph-v3-large';
|
|
179
|
+
dataset: string;
|
|
180
|
+
iterations: number;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
async function benchmarkMorph(config: MorphBenchmarkConfig) {
|
|
184
|
+
const client = new Anthropic({
|
|
185
|
+
apiKey: MORPH_API_KEY,
|
|
186
|
+
baseURL: MORPH_BASE_URL,
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
const results = [];
|
|
190
|
+
const dataset = loadDataset(config.dataset);
|
|
191
|
+
|
|
192
|
+
for (const sample of dataset) {
|
|
193
|
+
for (let i = 0; i < config.iterations; i++) {
|
|
194
|
+
const startTime = performance.now();
|
|
195
|
+
|
|
196
|
+
const response = await client.messages.create({
|
|
197
|
+
model: config.morphModel,
|
|
198
|
+
max_tokens: 4096,
|
|
199
|
+
messages: [{
|
|
200
|
+
role: 'user',
|
|
201
|
+
content: formatMorphPrompt(sample.original, sample.edit),
|
|
202
|
+
}],
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
const latency = performance.now() - startTime;
|
|
206
|
+
const mergedCode = response.content[0].text;
|
|
207
|
+
|
|
208
|
+
// Validate result
|
|
209
|
+
const isCorrect = validateResult(mergedCode, sample.expected);
|
|
210
|
+
const syntaxValid = checkSyntax(mergedCode, sample.language);
|
|
211
|
+
|
|
212
|
+
// Calculate cost
|
|
213
|
+
const cost = calculateCost(response.usage);
|
|
214
|
+
|
|
215
|
+
results.push({
|
|
216
|
+
sample_id: sample.id,
|
|
217
|
+
iteration: i,
|
|
218
|
+
model: config.model,
|
|
219
|
+
morph_model: config.morphModel,
|
|
220
|
+
latency_ms: latency,
|
|
221
|
+
correct: isCorrect,
|
|
222
|
+
syntax_valid: syntaxValid,
|
|
223
|
+
cost_usd: cost,
|
|
224
|
+
tokens_input: response.usage.input_tokens,
|
|
225
|
+
tokens_output: response.usage.output_tokens,
|
|
226
|
+
timestamp: new Date().toISOString(),
|
|
227
|
+
});
|
|
228
|
+
|
|
229
|
+
// Rate limiting
|
|
230
|
+
await sleep(1000); // 1 req/sec to be safe
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
return aggregateResults(results);
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
function formatMorphPrompt(original: string, edit: string): string {
|
|
238
|
+
return `<instruction>${edit}</instruction>
|
|
239
|
+
<code>${original}</code>
|
|
240
|
+
<update>Apply the edit</update>`;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
function calculateCost(usage: { input_tokens: number; output_tokens: number }): number {
|
|
244
|
+
// Claude Sonnet 4 pricing (example)
|
|
245
|
+
const inputCost = (usage.input_tokens / 1000) * 0.003;
|
|
246
|
+
const outputCost = (usage.output_tokens / 1000) * 0.015;
|
|
247
|
+
return inputCost + outputCost;
|
|
248
|
+
}
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
### Anthropic Models to Test
|
|
252
|
+
|
|
253
|
+
#### 1. Claude Sonnet 4 (claude-sonnet-4-20250514)
|
|
254
|
+
- **Use Case**: Production default (best balance)
|
|
255
|
+
- **Expected Performance**: 6000ms latency, 98% accuracy
|
|
256
|
+
- **Cost**: ~$0.01 per edit
|
|
257
|
+
|
|
258
|
+
#### 2. Claude Opus 4 (claude-opus-4-20250514)
|
|
259
|
+
- **Use Case**: Maximum accuracy
|
|
260
|
+
- **Expected Performance**: 8000ms latency, 99% accuracy
|
|
261
|
+
- **Cost**: ~$0.02 per edit
|
|
262
|
+
|
|
263
|
+
#### 3. Claude Haiku 4 (claude-haiku-4-20250320)
|
|
264
|
+
- **Use Case**: Speed-optimized
|
|
265
|
+
- **Expected Performance**: 3000ms latency, 96% accuracy
|
|
266
|
+
- **Cost**: ~$0.005 per edit
|
|
267
|
+
|
|
268
|
+
### Morph Model Variants
|
|
269
|
+
|
|
270
|
+
#### 1. morph-v3-large
|
|
271
|
+
- **Use Case**: Best accuracy
|
|
272
|
+
- **Expected**: Slower but more accurate
|
|
273
|
+
|
|
274
|
+
#### 2. morph-v3-fast
|
|
275
|
+
- **Use Case**: Speed-optimized
|
|
276
|
+
- **Expected**: Faster but slightly less accurate
|
|
277
|
+
|
|
278
|
+
## ⚡ Agent Booster Benchmarks
|
|
279
|
+
|
|
280
|
+
### Test Configuration
|
|
281
|
+
|
|
282
|
+
```typescript
|
|
283
|
+
// benchmarks/agent-booster/native-addon.ts
|
|
284
|
+
|
|
285
|
+
import { AgentBooster } from 'agent-booster';
|
|
286
|
+
|
|
287
|
+
interface AgentBoosterBenchmarkConfig {
|
|
288
|
+
model: 'jina-code-v2' | 'all-MiniLM-L6-v2';
|
|
289
|
+
dataset: string;
|
|
290
|
+
iterations: number;
|
|
291
|
+
variant: 'native' | 'wasm' | 'typescript';
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
async function benchmarkAgentBooster(config: AgentBoosterBenchmarkConfig) {
|
|
295
|
+
const booster = new AgentBooster({
|
|
296
|
+
model: config.model,
|
|
297
|
+
confidenceThreshold: 0.0, // Disable fallback for pure benchmark
|
|
298
|
+
});
|
|
299
|
+
|
|
300
|
+
const results = [];
|
|
301
|
+
const dataset = loadDataset(config.dataset);
|
|
302
|
+
|
|
303
|
+
for (const sample of dataset) {
|
|
304
|
+
for (let i = 0; i < config.iterations; i++) {
|
|
305
|
+
const startTime = performance.now();
|
|
306
|
+
|
|
307
|
+
try {
|
|
308
|
+
const result = await booster.applyEdit({
|
|
309
|
+
originalCode: sample.original,
|
|
310
|
+
editSnippet: sample.edit,
|
|
311
|
+
language: sample.language,
|
|
312
|
+
});
|
|
313
|
+
|
|
314
|
+
const latency = performance.now() - startTime;
|
|
315
|
+
|
|
316
|
+
// Validate result
|
|
317
|
+
const isCorrect = validateResult(result.mergedCode, sample.expected);
|
|
318
|
+
const syntaxValid = checkSyntax(result.mergedCode, sample.language);
|
|
319
|
+
|
|
320
|
+
results.push({
|
|
321
|
+
sample_id: sample.id,
|
|
322
|
+
iteration: i,
|
|
323
|
+
variant: config.variant,
|
|
324
|
+
model: config.model,
|
|
325
|
+
latency_ms: latency,
|
|
326
|
+
correct: isCorrect,
|
|
327
|
+
syntax_valid: syntaxValid,
|
|
328
|
+
confidence: result.confidence,
|
|
329
|
+
strategy: result.strategy,
|
|
330
|
+
cost_usd: 0, // Always $0
|
|
331
|
+
timestamp: new Date().toISOString(),
|
|
332
|
+
});
|
|
333
|
+
} catch (error) {
|
|
334
|
+
results.push({
|
|
335
|
+
sample_id: sample.id,
|
|
336
|
+
iteration: i,
|
|
337
|
+
variant: config.variant,
|
|
338
|
+
error: error.message,
|
|
339
|
+
latency_ms: performance.now() - startTime,
|
|
340
|
+
correct: false,
|
|
341
|
+
syntax_valid: false,
|
|
342
|
+
});
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
return aggregateResults(results);
|
|
348
|
+
}
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
### Variants to Test
|
|
352
|
+
|
|
353
|
+
#### 1. Native Addon (napi-rs)
|
|
354
|
+
- **Platform**: Node.js on native hardware
|
|
355
|
+
- **Expected**: Fastest (30-50ms)
|
|
356
|
+
|
|
357
|
+
#### 2. WASM
|
|
358
|
+
- **Platform**: Node.js with WASM
|
|
359
|
+
- **Expected**: Medium (50-100ms)
|
|
360
|
+
|
|
361
|
+
#### 3. TypeScript Fallback
|
|
362
|
+
- **Platform**: Pure TypeScript (no Rust)
|
|
363
|
+
- **Expected**: Slower (100-200ms)
|
|
364
|
+
|
|
365
|
+
## 📊 Metrics to Collect
|
|
366
|
+
|
|
367
|
+
### Performance Metrics
|
|
368
|
+
|
|
369
|
+
```typescript
|
|
370
|
+
interface PerformanceMetrics {
|
|
371
|
+
// Latency
|
|
372
|
+
latency_p50: number; // Median
|
|
373
|
+
latency_p95: number; // 95th percentile
|
|
374
|
+
latency_p99: number; // 99th percentile
|
|
375
|
+
latency_max: number; // Maximum
|
|
376
|
+
latency_min: number; // Minimum
|
|
377
|
+
latency_mean: number; // Average
|
|
378
|
+
latency_stddev: number; // Standard deviation
|
|
379
|
+
|
|
380
|
+
// Throughput
|
|
381
|
+
throughput_edits_per_sec: number;
|
|
382
|
+
throughput_tokens_per_sec: number;
|
|
383
|
+
|
|
384
|
+
// Memory
|
|
385
|
+
memory_peak_mb: number;
|
|
386
|
+
memory_avg_mb: number;
|
|
387
|
+
|
|
388
|
+
// Startup
|
|
389
|
+
cold_start_ms: number;
|
|
390
|
+
warm_start_ms: number;
|
|
391
|
+
}
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
### Accuracy Metrics
|
|
395
|
+
|
|
396
|
+
```typescript
|
|
397
|
+
interface AccuracyMetrics {
|
|
398
|
+
// Overall
|
|
399
|
+
accuracy_exact_match: number; // Exact code match
|
|
400
|
+
accuracy_semantic_match: number; // Semantically equivalent
|
|
401
|
+
accuracy_syntax_valid: number; // Valid syntax
|
|
402
|
+
|
|
403
|
+
// By complexity
|
|
404
|
+
accuracy_simple: number; // Simple edits
|
|
405
|
+
accuracy_medium: number; // Medium edits
|
|
406
|
+
accuracy_complex: number; // Complex edits
|
|
407
|
+
|
|
408
|
+
// Confidence correlation
|
|
409
|
+
confidence_avg: number;
|
|
410
|
+
confidence_accuracy_correlation: number;
|
|
411
|
+
|
|
412
|
+
// Error rates
|
|
413
|
+
false_positive_rate: number;
|
|
414
|
+
false_negative_rate: number;
|
|
415
|
+
syntax_error_rate: number;
|
|
416
|
+
}
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
### Cost Metrics
|
|
420
|
+
|
|
421
|
+
```typescript
|
|
422
|
+
interface CostMetrics {
|
|
423
|
+
cost_per_edit: number;
|
|
424
|
+
cost_total: number;
|
|
425
|
+
cost_saved_vs_baseline: number;
|
|
426
|
+
cost_saved_percentage: number;
|
|
427
|
+
|
|
428
|
+
// Token usage (for LLM baselines)
|
|
429
|
+
tokens_per_edit_avg: number;
|
|
430
|
+
tokens_input_avg: number;
|
|
431
|
+
tokens_output_avg: number;
|
|
432
|
+
}
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
## 📈 Comparison Analysis
|
|
436
|
+
|
|
437
|
+
### Statistical Tests
|
|
438
|
+
|
|
439
|
+
```typescript
|
|
440
|
+
interface ComparisonAnalysis {
|
|
441
|
+
// Speed comparison
|
|
442
|
+
speedup_factor: number; // Agent Booster vs Morph
|
|
443
|
+
speedup_confidence_interval: [number, number];
|
|
444
|
+
speedup_p_value: number; // T-test significance
|
|
445
|
+
|
|
446
|
+
// Accuracy comparison
|
|
447
|
+
accuracy_difference: number; // Percentage points
|
|
448
|
+
accuracy_significance: boolean; // Statistically significant?
|
|
449
|
+
|
|
450
|
+
// Cost savings
|
|
451
|
+
cost_savings_per_edit: number;
|
|
452
|
+
cost_savings_per_1000_edits: number;
|
|
453
|
+
break_even_point: number; // Number of edits to break even
|
|
454
|
+
|
|
455
|
+
// Quality metrics
|
|
456
|
+
quality_score: number; // Weighted score (accuracy + speed)
|
|
457
|
+
recommended_use_cases: string[];
|
|
458
|
+
}
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
### Visualization
|
|
462
|
+
|
|
463
|
+
```typescript
|
|
464
|
+
// Generate comparison charts
|
|
465
|
+
async function generateCharts(results: BenchmarkResults) {
|
|
466
|
+
await generateLatencyChart(results);
|
|
467
|
+
await generateAccuracyChart(results);
|
|
468
|
+
await generateCostChart(results);
|
|
469
|
+
await generateConfidenceDistribution(results);
|
|
470
|
+
await generateComplexityBreakdown(results);
|
|
471
|
+
}
|
|
472
|
+
```
|
|
473
|
+
|
|
474
|
+
## 🎯 Benchmark Execution Plan
|
|
475
|
+
|
|
476
|
+
### Phase 1: Baseline (Week 1)
|
|
477
|
+
```bash
|
|
478
|
+
# 1. Setup Morph LLM account and get API key
|
|
479
|
+
export MORPH_API_KEY=sk-morph-xxx
|
|
480
|
+
|
|
481
|
+
# 2. Prepare datasets
|
|
482
|
+
npm run benchmark:prepare-datasets
|
|
483
|
+
|
|
484
|
+
# 3. Run Morph + Claude Sonnet 4 baseline
|
|
485
|
+
npm run benchmark:baseline -- --model claude-sonnet-4 --iterations 3
|
|
486
|
+
|
|
487
|
+
# 4. Run Morph + Claude Opus 4 baseline
|
|
488
|
+
npm run benchmark:baseline -- --model claude-opus-4 --iterations 3
|
|
489
|
+
|
|
490
|
+
# 5. Run Morph + Claude Haiku 4 baseline
|
|
491
|
+
npm run benchmark:baseline -- --model claude-haiku-4 --iterations 3
|
|
492
|
+
|
|
493
|
+
# 6. Analyze baseline results
|
|
494
|
+
npm run benchmark:analyze-baseline
|
|
495
|
+
```
|
|
496
|
+
|
|
497
|
+
**Expected Duration**: ~2 hours (100 samples × 3 iterations × 3 models = 900 requests × ~7s each, i.e. ~6s latency plus the 1s rate-limit delay)
|
|
498
|
+
|
|
499
|
+
**Expected Cost**: ~$10 (300 edits per model: Sonnet 4 at ~$0.01, Opus 4 at ~$0.02, Haiku 4 at ~$0.005 per edit)
|
|
500
|
+
|
|
501
|
+
### Phase 2: Agent Booster (Week 2)
|
|
502
|
+
```bash
|
|
503
|
+
# 1. Build Agent Booster
|
|
504
|
+
cargo build --release
|
|
505
|
+
npm run build
|
|
506
|
+
|
|
507
|
+
# 2. Download embedding models
|
|
508
|
+
npm run download-models
|
|
509
|
+
|
|
510
|
+
# 3. Run native addon benchmarks
|
|
511
|
+
npm run benchmark:agent-booster -- --variant native --iterations 10
|
|
512
|
+
|
|
513
|
+
# 4. Run WASM benchmarks
|
|
514
|
+
npm run benchmark:agent-booster -- --variant wasm --iterations 10
|
|
515
|
+
|
|
516
|
+
# 5. Run TypeScript fallback benchmarks
|
|
517
|
+
npm run benchmark:agent-booster -- --variant typescript --iterations 10
|
|
518
|
+
|
|
519
|
+
# 6. Analyze Agent Booster results
|
|
520
|
+
npm run benchmark:analyze-agent-booster
|
|
521
|
+
```
|
|
522
|
+
|
|
523
|
+
**Expected Duration**: ~5 minutes (100 samples × 10 iterations × 3 variants = 3,000 edits at roughly 30-200ms each, depending on variant)
|
|
524
|
+
|
|
525
|
+
**Expected Cost**: $0
|
|
526
|
+
|
|
527
|
+
### Phase 3: Comparison (Week 3)
|
|
528
|
+
```bash
|
|
529
|
+
# 1. Generate comparison analysis
|
|
530
|
+
npm run benchmark:compare
|
|
531
|
+
|
|
532
|
+
# 2. Generate charts and visualizations
|
|
533
|
+
npm run benchmark:visualize
|
|
534
|
+
|
|
535
|
+
# 3. Generate HTML report
|
|
536
|
+
npm run benchmark:report
|
|
537
|
+
|
|
538
|
+
# 4. Publish results
|
|
539
|
+
npm run benchmark:publish
|
|
540
|
+
```
|
|
541
|
+
|
|
542
|
+
## 📋 Expected Results
|
|
543
|
+
|
|
544
|
+
### Latency Comparison
|
|
545
|
+
|
|
546
|
+
| Metric | Morph + Sonnet 4 | Agent Booster (Native) | Improvement |
|
|
547
|
+
|--------|------------------|------------------------|-------------|
|
|
548
|
+
| **p50** | 5,800ms | 35ms | **166x faster** |
|
|
549
|
+
| **p95** | 8,200ms | 52ms | **158x faster** |
|
|
550
|
+
| **p99** | 12,000ms | 85ms | **141x faster** |
|
|
551
|
+
| **Max** | 18,000ms | 150ms | **120x faster** |
|
|
552
|
+
|
|
553
|
+
### Accuracy Comparison
|
|
554
|
+
|
|
555
|
+
| Complexity | Morph + Sonnet 4 | Agent Booster | Difference |
|
|
556
|
+
|------------|------------------|---------------|------------|
|
|
557
|
+
| **Simple** | 99.2% | 98.5% | -0.7% |
|
|
558
|
+
| **Medium** | 97.8% | 96.2% | -1.6% |
|
|
559
|
+
| **Complex** | 96.1% | 93.8% | -2.3% |
|
|
560
|
+
| **Overall** | 98.0% | 96.8% | -1.2% |
|
|
561
|
+
|
|
562
|
+
### Cost Comparison (1000 edits)
|
|
563
|
+
|
|
564
|
+
| Solution | Total Cost | Cost per Edit | Savings |
|
|
565
|
+
|----------|-----------|---------------|---------|
|
|
566
|
+
| **Morph + Sonnet 4** | $10.00 | $0.010 | - |
|
|
567
|
+
| **Morph + Opus 4** | $20.00 | $0.020 | - |
|
|
568
|
+
| **Agent Booster** | $0.00 | $0.000 | **100%** |
|
|
569
|
+
|
|
570
|
+
### Recommended Configuration
|
|
571
|
+
|
|
572
|
+
Based on the expected benchmark results, we recommend the following configuration:
|
|
573
|
+
|
|
574
|
+
```typescript
|
|
575
|
+
// For maximum performance
|
|
576
|
+
const config = {
|
|
577
|
+
primaryMethod: 'agent-booster',
|
|
578
|
+
model: 'jina-code-v2',
|
|
579
|
+
confidenceThreshold: 0.65,
|
|
580
|
+
fallbackToMorph: true,
|
|
581
|
+
morphModel: 'claude-sonnet-4'
|
|
582
|
+
};
|
|
583
|
+
|
|
584
|
+
// Expected results with 1000 edits:
|
|
585
|
+
// - 850 edits via Agent Booster (85%, avg 40ms, $0)
|
|
586
|
+
// - 150 edits via Morph fallback (15%, avg 6000ms, $1.50)
|
|
587
|
+
// - Overall avg latency: 934ms (vs 6000ms pure Morph)
|
|
588
|
+
// - Overall cost: $1.50 (vs $10 pure Morph)
|
|
589
|
+
// - 6.4x faster, 85% cost savings
|
|
590
|
+
```
|
|
591
|
+
|
|
592
|
+
## 📊 Benchmark Report Template
|
|
593
|
+
|
|
594
|
+
```markdown
|
|
595
|
+
# Agent Booster Benchmark Report
|
|
596
|
+
|
|
597
|
+
**Date**: YYYY-MM-DD
|
|
598
|
+
**Version**: agent-booster@0.1.0
|
|
599
|
+
**Dataset**: 100 samples (40 simple, 40 medium, 20 complex)
|
|
600
|
+
**Iterations**: 3 per sample (baseline), 10 per sample (Agent Booster)
|
|
601
|
+
|
|
602
|
+
## Executive Summary
|
|
603
|
+
|
|
604
|
+
- **Speed**: Agent Booster is **166x faster** than Morph + Claude Sonnet 4
|
|
605
|
+
- **Accuracy**: 96.8% vs 98.0% (-1.2 percentage points)
|
|
606
|
+
- **Cost**: **100% savings** ($0 vs $0.01 per edit)
|
|
607
|
+
- **Recommendation**: Use Agent Booster with fallback for best ROI
|
|
608
|
+
|
|
609
|
+
## Detailed Results
|
|
610
|
+
|
|
611
|
+
[Charts and tables here]
|
|
612
|
+
|
|
613
|
+
## Conclusions
|
|
614
|
+
|
|
615
|
+
[Analysis and recommendations]
|
|
616
|
+
```
|