universal-agent-memory 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/improved-benchmark.d.ts +1 -0
- package/dist/benchmarks/improved-benchmark.d.ts.map +1 -1
- package/dist/benchmarks/improved-benchmark.js +70 -22
- package/dist/benchmarks/improved-benchmark.js.map +1 -1
- package/dist/benchmarks/model-integration.d.ts +111 -0
- package/dist/benchmarks/model-integration.d.ts.map +1 -0
- package/dist/benchmarks/model-integration.js +923 -0
- package/dist/benchmarks/model-integration.js.map +1 -0
- package/package.json +1 -1
|
@@ -0,0 +1,923 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Model Integration Benchmark
|
|
3
|
+
*
|
|
4
|
+
* Runs real API calls against multiple LLM providers via Factory.ai droid exec CLI
|
|
5
|
+
* to compare model performance on UAM memory-enhanced tasks.
|
|
6
|
+
*
|
|
7
|
+
* Assumptions:
|
|
8
|
+
* - FACTORY_API_KEY is set in environment for Factory.ai API access
|
|
9
|
+
* - Models: Claude Opus 4.5, GLM 4.7, GPT 5.2
|
|
10
|
+
* - droid CLI is installed and accessible
|
|
11
|
+
* - UAM CLI is available for memory initialization
|
|
12
|
+
*
|
|
13
|
+
* What this handles:
|
|
14
|
+
* - Full UAM setup (init, analyze, generate, memory start, prepopulate)
|
|
15
|
+
* - CLAUDE.md reading and context injection
|
|
16
|
+
* - Real API calls to multiple LLM providers via droid exec
|
|
17
|
+
* - Task execution comparison across models with/without UAM
|
|
18
|
+
* - Performance metrics collection (latency, success, tokens)
|
|
19
|
+
* - Result aggregation and reporting
|
|
20
|
+
*
|
|
21
|
+
* What this does NOT handle:
|
|
22
|
+
* - Rate limiting (caller responsibility)
|
|
23
|
+
* - Cost tracking (would require billing API)
|
|
24
|
+
* - Streaming responses (uses completion mode)
|
|
25
|
+
*/
|
|
26
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync, unlinkSync } from 'fs';
|
|
27
|
+
import { join, dirname } from 'path';
|
|
28
|
+
import { fileURLToPath } from 'url';
|
|
29
|
+
import { execSync } from 'child_process';
|
|
30
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
31
|
+
const __dirname = dirname(__filename);
|
|
32
|
+
const PROJECT_ROOT = join(__dirname, '../..');
|
|
33
|
+
/**
 * Initialize UAM system for benchmark testing.
 *
 * Runs: uam init, uam analyze, uam generate, uam memory start, uam memory prepopulate.
 * Each step is best-effort: failures are recorded in `result.errors` and do not
 * abort the remaining steps (the original shell commands already append
 * `|| true`, so a non-zero exit from uam itself is tolerated).
 *
 * @param {boolean} [verbose=false] When true, log each setup step to stdout.
 * @returns {Promise<{initialized: boolean, memoryStarted: boolean,
 *   memoryPrepopulated: boolean, claudeMdLoaded: boolean, errors: string[]}>}
 *   Status flags for each phase plus any accumulated error messages.
 */
async function setupUAM(verbose = false) {
    const result = {
        initialized: false,
        memoryStarted: false,
        memoryPrepopulated: false,
        claudeMdLoaded: false,
        errors: [],
    };
    const log = (msg) => { if (verbose)
        console.log(` [UAM Setup] ${msg}`); };
    /**
     * Run one UAM CLI step, recording a failure in result.errors.
     * @param {string} stepName - Label used in the error message (`<stepName> failed: ...`).
     * @param {string} command - Shell command to execute.
     * @param {number} timeout - Per-step timeout in milliseconds.
     * @returns {boolean} true when the command completed without throwing.
     */
    const runStep = (stepName, command, timeout) => {
        try {
            execSync(command, {
                encoding: 'utf-8',
                cwd: PROJECT_ROOT,
                stdio: 'pipe',
                timeout,
            });
            return true;
        }
        catch (e) {
            result.errors.push(`${stepName} failed: ${e instanceof Error ? e.message : String(e)}`);
            return false;
        }
    };
    try {
        // Step 1: Check if UAM CLI is available (fall back to npx).
        log('Checking UAM CLI availability...');
        try {
            execSync('uam --version', { encoding: 'utf-8', cwd: PROJECT_ROOT, stdio: 'pipe' });
        }
        catch {
            // Try with npx; if this also throws, the outer catch records it.
            execSync('npx uam --version', { encoding: 'utf-8', cwd: PROJECT_ROOT, stdio: 'pipe' });
        }
        // Step 2: Initialize UAM (idempotent - safe to run multiple times)
        log('Running uam init...');
        result.initialized = runStep('init', 'uam init --non-interactive 2>/dev/null || true', 30000);
        // Step 3: Analyze project structure (status not tracked, errors still recorded)
        log('Running uam analyze...');
        runStep('analyze', 'uam analyze 2>/dev/null || true', 60000);
        // Step 4: Generate/update CLAUDE.md
        log('Running uam generate...');
        runStep('generate', 'uam generate 2>/dev/null || true', 30000);
        // Step 5: Start memory services
        log('Starting memory services...');
        result.memoryStarted = runStep('memory start', 'uam memory start 2>/dev/null || true', 60000);
        // Step 6: Prepopulate memory from docs and git history
        log('Prepopulating memory from docs and git...');
        result.memoryPrepopulated = runStep('memory prepopulate', 'uam memory prepopulate --docs --git --limit 100 2>/dev/null || true', 120000);
        // Step 7: Verify CLAUDE.md exists
        const claudeMdPath = join(PROJECT_ROOT, 'CLAUDE.md');
        if (existsSync(claudeMdPath)) {
            result.claudeMdLoaded = true;
            log('CLAUDE.md found and ready');
        }
        else {
            result.errors.push('CLAUDE.md not found after setup');
        }
    }
    catch (error) {
        result.errors.push(`UAM setup error: ${error instanceof Error ? error.message : String(error)}`);
    }
    return result;
}
|
|
140
|
+
/**
 * Load UAM memory context from CLAUDE.md and short-term memory.
 *
 * Builds a markdown context string from three sources, in order:
 *  1. Selected sections extracted from CLAUDE.md via regex (code field,
 *     testing requirements, repository structure), each length-capped.
 *  2. Recent rows from the short-term SQLite memory DB, queried by shelling
 *     out to the `sqlite3` CLI (silently skipped if unavailable).
 *  3. A static block of coding standards / patterns / gotchas appended as a
 *     fallback/supplement regardless of the above.
 *
 * @returns {string} The assembled markdown sections joined with newlines.
 */
function loadUAMMemoryContext() {
    const sections = [];
    // Read CLAUDE.md
    const claudeMdPath = join(PROJECT_ROOT, 'CLAUDE.md');
    if (existsSync(claudeMdPath)) {
        const claudeMd = readFileSync(claudeMdPath, 'utf-8');
        // Extract key sections from CLAUDE.md
        sections.push('## UAM Memory Context (from CLAUDE.md)\n');
        // Extract Code Field section — matches up to the next H2 heading or
        // horizontal rule ('s' flag lets '.' cross newlines).
        const codeFieldMatch = claudeMd.match(/## .*CODE FIELD.*?(?=\n## |\n---\n|$)/s);
        if (codeFieldMatch) {
            sections.push('### Code Field Guidelines\n');
            // Each extract is sliced to keep the injected context compact.
            sections.push(codeFieldMatch[0].slice(0, 1500) + '\n');
        }
        // Extract Testing Requirements
        const testingMatch = claudeMd.match(/## .*Testing Requirements.*?(?=\n## |\n---\n|$)/s);
        if (testingMatch) {
            sections.push('### Testing Requirements\n');
            sections.push(testingMatch[0].slice(0, 500) + '\n');
        }
        // Extract Repository Structure — heading plus its first fenced code block.
        const structureMatch = claudeMd.match(/## Repository Structure.*?```[\s\S]*?```/);
        if (structureMatch) {
            sections.push('### Repository Structure\n');
            sections.push(structureMatch[0].slice(0, 1000) + '\n');
        }
    }
    // Query short-term memory from SQLite
    const dbPath = join(PROJECT_ROOT, 'agents/data/memory/short_term.db');
    if (existsSync(dbPath)) {
        try {
            // `|| true` keeps a missing sqlite3 binary / schema mismatch from throwing;
            // an empty result simply skips the section below.
            const recentMemories = execSync(`sqlite3 "${dbPath}" "SELECT type, content FROM memories ORDER BY id DESC LIMIT 10;" 2>/dev/null || true`, { encoding: 'utf-8', cwd: PROJECT_ROOT }).trim();
            if (recentMemories) {
                sections.push('### Recent Session Memory\n');
                sections.push('```\n' + recentMemories.slice(0, 1000) + '\n```\n');
            }
            // Get lessons learned
            const lessons = execSync(`sqlite3 "${dbPath}" "SELECT content FROM memories WHERE type='lesson' ORDER BY id DESC LIMIT 5;" 2>/dev/null || true`, { encoding: 'utf-8', cwd: PROJECT_ROOT }).trim();
            if (lessons) {
                sections.push('### Lessons Learned\n');
                sections.push(lessons.slice(0, 500) + '\n');
            }
        }
        catch {
            // Memory DB not available
        }
    }
    // Add static context as fallback/supplement
    sections.push(`
### Project Coding Standards
- Use TypeScript strict mode
- All functions must have JSDoc comments with @param and @returns
- Error handling uses custom AppError class that extends Error
- Prefer async/await over callbacks and Promises
- Use zod for runtime input validation
- Export types and interfaces alongside implementations
- Use Map for key-value storage, Set for unique collections

### Common Patterns
- Singleton pattern: private constructor + static getInstance()
- Strategy pattern: interface + multiple implementations
- Factory pattern: static create() methods
- Error handling: try/catch with specific error types
- Exponential backoff: delay = baseMs * Math.pow(2, attempt)

### Known Gotchas (from memory)
- Always check array bounds: use i < length, not i <= length
- Handle empty arrays explicitly before operations
- Include cleanup logic for resources (timers, connections)
- JSON.parse throws on invalid input - always wrap in try/catch
- Array methods like reduce need initial value for empty arrays
- Map.get() returns undefined for missing keys

---

`);
    return sections.join('\n');
}
|
|
221
|
+
// Cached memory context (loaded once per benchmark run)
let cachedMemoryContext = null;
/**
 * Return the UAM memory context, loading it at most once per process.
 *
 * Uses a strict null check (rather than truthiness) so that a legitimately
 * empty context string is still cached instead of being re-loaded — and
 * re-running sqlite3/file I/O — on every call.
 *
 * @returns {string} The cached context from loadUAMMemoryContext().
 */
function getUAMMemoryContext() {
    if (cachedMemoryContext === null) {
        cachedMemoryContext = loadUAMMemoryContext();
    }
    return cachedMemoryContext;
}
|
|
229
|
+
// ============================================================================
// Model Configurations (per Factory.ai droid CLI available models)
// ============================================================================
// Each entry pairs a short benchmark id/name with the exact model string
// passed to `droid exec --model` (apiModel). `provider` is informational
// only in this file; routing is handled by the droid CLI itself.
const MODELS = [
    {
        id: 'opus-4.5',
        name: 'Claude Opus 4.5',
        provider: 'anthropic',
        apiModel: 'claude-opus-4-5-20251101', // dated Anthropic model snapshot
    },
    {
        id: 'glm-4.7',
        name: 'GLM 4.7 (Droid Core)',
        provider: 'zhipu',
        apiModel: 'glm-4.7',
    },
    {
        id: 'gpt-5.2-codex',
        name: 'GPT 5.2 Codex',
        provider: 'openai',
        apiModel: 'gpt-5.2-codex',
    },
    {
        id: 'gpt-5.2',
        name: 'GPT 5.2',
        provider: 'openai',
        apiModel: 'gpt-5.2',
    },
];
|
|
258
|
+
// ============================================================================
// Benchmark Tasks
// ============================================================================
// Task schema:
//   id/name/description - identification used in reports
//   prompt              - full text sent to the model (optionally prefixed
//                         with UAM memory context by runTaskForModel)
//   difficulty          - 'easy' | 'medium' | 'hard' (used for per-difficulty
//                         aggregation in generateComparison)
//   category            - free-form grouping label
//   expectedPatterns    - substrings checked case-insensitively against the
//                         response; runTaskForModel marks a task successful
//                         when >= 60% of them appear
//   maxTokens           - advisory budget (not currently enforced by the
//                         droid exec client in this file)
const BENCHMARK_TASKS = [
    // Easy: basic typed function with an empty-array edge case.
    {
        id: 'task-001-code-generation',
        name: 'TypeScript Function Generation',
        description: 'Generate a well-typed TypeScript function',
        prompt: `Write a TypeScript function called 'calculateAverage' that:
1. Takes an array of numbers as input
2. Returns the arithmetic mean
3. Handles empty arrays (return 0)
4. Has proper type annotations

Return ONLY the function code, no explanations.`,
        difficulty: 'easy',
        category: 'code-generation',
        expectedPatterns: [
            'function calculateAverage',
            'number[]',
            ': number',
            'length',
            'return',
        ],
        maxTokens: 500,
    },
    // Easy: off-by-one bug (i <= length) the model must spot and correct.
    {
        id: 'task-002-bug-fix',
        name: 'Bug Detection and Fix',
        description: 'Identify and fix a bug in code',
        prompt: `Find and fix the bug in this TypeScript code:

function sumPositive(nums: number[]): number {
let sum = 0;
for (let i = 0; i <= nums.length; i++) {
if (nums[i] > 0) {
sum += nums[i];
}
}
return sum;
}

Return ONLY the corrected function code.`,
        difficulty: 'easy',
        category: 'bug-fix',
        expectedPatterns: [
            'i < nums.length',
            'function sumPositive',
            'return sum',
        ],
        maxTokens: 500,
    },
    // Medium: classic singleton implementation.
    {
        id: 'task-003-pattern-application',
        name: 'Design Pattern Implementation',
        description: 'Implement a singleton pattern',
        prompt: `Implement a TypeScript singleton class called 'ConfigManager' that:
1. Has a private constructor
2. Has a static getInstance() method
3. Has get(key: string) and set(key: string, value: any) methods
4. Stores configuration in a private Map

Return ONLY the class code.`,
        difficulty: 'medium',
        category: 'patterns',
        expectedPatterns: [
            'class ConfigManager',
            'private constructor',
            'static getInstance',
            'private static instance',
            'Map',
        ],
        maxTokens: 800,
    },
    // Medium: refactor an if/else chain into a strategy pattern.
    {
        id: 'task-004-refactoring',
        name: 'Code Refactoring',
        description: 'Refactor code for better maintainability',
        prompt: `Refactor this code to follow SOLID principles and improve readability:

function processOrder(order: any) {
if (order.type === 'digital') {
console.log('Sending email with download link');
order.status = 'delivered';
} else if (order.type === 'physical') {
console.log('Creating shipping label');
order.status = 'shipped';
} else if (order.type === 'subscription') {
console.log('Activating subscription');
order.status = 'active';
}
console.log('Order processed: ' + order.id);
return order;
}

Provide the refactored TypeScript code using proper interfaces and a strategy pattern.`,
        difficulty: 'medium',
        category: 'refactoring',
        expectedPatterns: [
            'interface',
            'class',
            'implements',
            'process',
        ],
        maxTokens: 1200,
    },
    // Medium: measures whether the model honors injected project conventions
    // (zod, AppError, JSDoc) — the memory-sensitive task in the suite.
    {
        id: 'task-005-memory-context',
        name: 'Context-Aware Code Generation',
        description: 'Generate code using provided context',
        prompt: `Given the following project context from memory:

MEMORY CONTEXT:
- Project uses src/utils/ for utility functions
- All functions must have JSDoc comments
- Error handling uses custom AppError class
- Prefer async/await over callbacks
- Use zod for input validation

Write a utility function 'validateAndParseJSON' that:
1. Takes a string input
2. Validates it's valid JSON using zod
3. Returns the parsed object or throws AppError
4. Has proper JSDoc documentation

Return ONLY the function code with JSDoc.`,
        difficulty: 'medium',
        category: 'memory',
        expectedPatterns: [
            'async',
            'zod',
            'AppError',
            '@param',
            '@returns',
            'validateAndParseJSON',
        ],
        maxTokens: 800,
    },
    // Hard: Dijkstra with typed adjacency lists and disconnected-node handling.
    {
        id: 'task-006-complex-algorithm',
        name: 'Algorithm Implementation',
        description: 'Implement a complex algorithm with proper typing',
        prompt: `Implement a TypeScript function 'findShortestPath' using Dijkstra's algorithm:

1. Input: weighted graph as adjacency list Map<string, Map<string, number>>
2. Input: start node (string), end node (string)
3. Output: { path: string[], distance: number } or null if no path
4. Handle disconnected nodes properly
5. Use proper TypeScript types

Return ONLY the function code with type definitions.`,
        difficulty: 'hard',
        category: 'algorithms',
        expectedPatterns: [
            'function findShortestPath',
            'Map<string',
            'distance',
            'path',
            'while',
            'return',
        ],
        maxTokens: 1500,
    },
    // Hard: multi-requirement module (interface + class + cleanup + exports).
    {
        id: 'task-007-multi-step-task',
        name: 'Multi-Step Code Generation',
        description: 'Complete a multi-step implementation task',
        prompt: `Create a complete TypeScript module for a rate limiter with these requirements:

1. Interface RateLimiterConfig { maxRequests: number; windowMs: number; }
2. Class RateLimiter with:
- constructor(config: RateLimiterConfig)
- isAllowed(clientId: string): boolean
- getRemainingRequests(clientId: string): number
- reset(clientId?: string): void
3. Use Map for tracking requests per client
4. Include proper cleanup of expired entries
5. Export both the class and interface

Return the complete module code.`,
        difficulty: 'hard',
        category: 'multi-step',
        expectedPatterns: [
            'interface RateLimiterConfig',
            'class RateLimiter',
            'isAllowed',
            'getRemainingRequests',
            'reset',
            'Map',
            'export',
        ],
        maxTokens: 2000,
    },
    // Hard: retry/backoff wrapper with layered error handling.
    {
        id: 'task-008-error-handling',
        name: 'Comprehensive Error Handling',
        description: 'Implement robust error handling',
        prompt: `Create a TypeScript async function 'fetchWithRetry' that:

1. Takes url: string, options?: RequestInit, retryConfig?: { maxRetries: number; backoffMs: number; }
2. Implements exponential backoff retry logic
3. Handles network errors, timeout, and HTTP errors (4xx, 5xx)
4. Returns Promise<Response> or throws a detailed custom error
5. Logs each retry attempt
6. Has proper TypeScript types for all parameters and return values

Return ONLY the function code with any necessary type definitions.`,
        difficulty: 'hard',
        category: 'error-handling',
        expectedPatterns: [
            'async function fetchWithRetry',
            'retry',
            'backoff',
            'catch',
            'throw',
            'Promise<Response>',
        ],
        maxTokens: 1200,
    },
];
|
|
478
|
+
// ============================================================================
// Droid Exec Client
// ============================================================================
/**
 * Thin wrapper around the Factory.ai `droid exec` CLI.
 *
 * Prompts are written to a temp file and passed via `-f` to avoid shell
 * escaping issues. The API key is supplied to the child process through its
 * environment only — never on the command line — so it cannot leak through
 * process listings, shell history, or error messages, and a key containing
 * shell metacharacters cannot break or inject into the command.
 */
class DroidExecClient {
    apiKey;
    tmpDir;
    autoLevel;
    /**
     * @param {string} apiKey - Factory.ai API key (passed via child env).
     * @param {string} [autoLevel='low'] - droid exec autonomy level.
     */
    constructor(apiKey, autoLevel = 'low') {
        this.apiKey = apiKey;
        this.autoLevel = autoLevel;
        this.tmpDir = '/tmp/uam-benchmark';
        try {
            // recursive:true makes this a no-op when the directory exists
            mkdirSync(this.tmpDir, { recursive: true });
        }
        catch {
            // ignore — a failed mkdir surfaces later when the prompt file is written
        }
    }
    /**
     * Run a single completion through `droid exec`.
     *
     * @param {string} model - Model string for `--model` (see MODELS.apiModel).
     * @param {string} prompt - Full prompt text.
     * @returns {Promise<{content: string, tokensUsed: number, latencyMs: number}>}
     *   Trimmed stdout, a tokensUsed placeholder of 0 (droid exec does not
     *   report token usage here), and wall-clock latency.
     * @throws {Error} When droid exec exits non-zero or times out.
     */
    async complete(model, prompt) {
        const startTime = Date.now();
        // Write prompt to temp file to avoid shell escaping issues
        const promptFile = `${this.tmpDir}/prompt-${Date.now()}.txt`;
        writeFileSync(promptFile, prompt, 'utf-8');
        try {
            // Use --auto low to allow file operations without system modifications.
            // SECURITY: the key travels only via `env`, not the command string.
            const result = execSync(`droid exec --model "${model}" --auto ${this.autoLevel} -f "${promptFile}"`, {
                encoding: 'utf-8',
                timeout: 300000, // 5 minutes for complex tasks
                maxBuffer: 10 * 1024 * 1024,
                env: { ...process.env, FACTORY_API_KEY: this.apiKey },
            });
            return {
                content: result.trim(),
                tokensUsed: 0,
                latencyMs: Date.now() - startTime,
            };
        }
        catch (error) {
            const errMsg = error instanceof Error ? error.message : String(error);
            throw new Error(`droid exec failed: ${errMsg}`);
        }
        finally {
            // Clean up the prompt file on both success and failure paths
            try {
                unlinkSync(promptFile);
            }
            catch {
                // ignore cleanup failures
            }
        }
    }
}
|
|
536
|
+
// ============================================================================
|
|
537
|
+
// Benchmark Runner
|
|
538
|
+
// ============================================================================
|
|
539
|
+
/**
 * Return the subset of expected patterns found in a model response.
 *
 * Matching is a case-insensitive substring check; the returned patterns keep
 * their original casing and their order from the input list.
 *
 * @param {string} response - Raw model output.
 * @param {string[]} expectedPatterns - Substrings to look for.
 * @returns {string[]} The patterns present in the response.
 */
function evaluateResponse(response, expectedPatterns) {
    const haystack = response.toLowerCase();
    const matched = [];
    for (const candidate of expectedPatterns) {
        if (haystack.includes(candidate.toLowerCase())) {
            matched.push(candidate);
        }
    }
    return matched;
}
|
|
543
|
+
/**
 * Execute one benchmark task against one model and score the response.
 *
 * When `withMemory` is true the UAM memory context is prepended to the task
 * prompt. Success means at least 60% of the task's expectedPatterns appear
 * (case-insensitively) in the response. Any error from the client is captured
 * in `result.error` rather than thrown, so the caller always gets a record.
 *
 * @param {DroidExecClient} client - Completion client.
 * @param {{id: string, apiModel: string}} model - Model entry from MODELS.
 * @param {{id: string, prompt: string, expectedPatterns: string[]}} task - Task entry.
 * @param {boolean} [withMemory=false] - Whether to inject UAM memory context.
 * @returns {Promise<object>} Per-task result record.
 */
async function runTaskForModel(client, model, task, withMemory = false) {
    const result = {
        taskId: task.id,
        modelId: model.id,
        success: false,
        latencyMs: 0,
        tokensUsed: 0,
        response: '',
        matchedPatterns: [],
    };
    try {
        // Memory context is loaded once per run (cached) from CLAUDE.md + memory DB.
        let prompt = task.prompt;
        if (withMemory) {
            prompt = getUAMMemoryContext() + task.prompt;
        }
        const completion = await client.complete(model.apiModel, prompt);
        result.response = completion.content;
        result.latencyMs = completion.latencyMs;
        result.tokensUsed = completion.tokensUsed;
        result.matchedPatterns = evaluateResponse(completion.content, task.expectedPatterns);
        // Success threshold: 60% of the expected patterns must be present.
        const patternCount = task.expectedPatterns.length;
        result.success = result.matchedPatterns.length / patternCount >= 0.6;
    }
    catch (error) {
        result.error = error instanceof Error ? error.message : String(error);
    }
    return result;
}
|
|
571
|
+
/**
 * Run the full task suite against a single model and aggregate the metrics.
 *
 * @param {DroidExecClient} client - Completion client.
 * @param {{id: string, name: string, apiModel: string}} model - Model entry.
 * @param {Array<object>} tasks - Benchmark tasks to run (sequentially).
 * @param {boolean} [withMemory=false] - Inject UAM memory context per task.
 * @returns {Promise<object>} Aggregate with success rate, average latency
 *   (over tasks that actually completed), total tokens, and per-task results.
 */
async function runBenchmarkForModel(client, model, tasks, withMemory = false) {
    const memoryLabel = withMemory ? ' (with UAM Memory)' : ' (without Memory)';
    console.log(`\n${'='.repeat(60)}`);
    console.log(`Running benchmark for: ${model.name}${memoryLabel}`);
    console.log(`${'='.repeat(60)}`);
    const results = [];
    for (let i = 0; i < tasks.length; i++) {
        const task = tasks[i];
        console.log(` [${task.difficulty.toUpperCase()}] ${task.name}...`);
        const result = await runTaskForModel(client, model, task, withMemory);
        results.push(result);
        if (result.success) {
            console.log(` ✓ Success (${result.latencyMs}ms)`);
        }
        else {
            console.log(` ✗ Failed: ${result.error || 'Pattern mismatch'}`);
        }
        // Small pacing delay between tasks; skipped after the final task so a
        // run no longer ends with a pointless one-second sleep.
        if (i < tasks.length - 1) {
            await new Promise(r => setTimeout(r, 1000));
        }
    }
    const succeeded = results.filter(r => r.success).length;
    // Only tasks that actually completed (latency > 0) count toward the average;
    // errored tasks keep latencyMs === 0 and would skew the mean.
    const successfulResults = results.filter(r => r.latencyMs > 0);
    const avgLatency = successfulResults.length > 0
        ? successfulResults.reduce((sum, r) => sum + r.latencyMs, 0) / successfulResults.length
        : 0;
    const totalTokens = results.reduce((sum, r) => sum + r.tokensUsed, 0);
    return {
        modelId: model.id,
        modelName: model.name,
        tasksRun: tasks.length,
        tasksSucceeded: succeeded,
        successRate: (succeeded / tasks.length) * 100,
        avgLatencyMs: Math.round(avgLatency),
        totalTokens,
        results,
    };
}
|
|
607
|
+
/**
 * Derive cross-model comparison stats from per-model benchmark aggregates.
 *
 * @param {Array<object>} modelResults - Outputs of runBenchmarkForModel.
 * @returns {{bestOverall: string, fastestModel: string, mostAccurate: string,
 *   byDifficulty: Record<string, {model: string, successRate: number}>}}
 */
function generateComparison(modelResults) {
    // Rank by success rate (descending) for best-overall / most-accurate.
    const sorted = [...modelResults].sort((a, b) => b.successRate - a.successRate);
    // BUGFIX: a model whose every task errored has avgLatencyMs === 0 and used
    // to sort to the front as "fastest". Exclude zero-latency models when at
    // least one model produced a real latency.
    const withLatency = modelResults.filter(m => m.avgLatencyMs > 0);
    const fastest = (withLatency.length > 0 ? withLatency : [...modelResults])
        .sort((a, b) => a.avgLatencyMs - b.avgLatencyMs);
    const byDifficulty = {};
    for (const diff of ['easy', 'medium', 'hard']) {
        let bestModel = '';
        let bestRate = 0;
        for (const modelResult of modelResults) {
            // Collect this model's results for tasks of the current difficulty.
            const diffTasks = modelResult.results.filter(r => {
                const task = BENCHMARK_TASKS.find(t => t.id === r.taskId);
                return task?.difficulty === diff;
            });
            if (diffTasks.length > 0) {
                const rate = (diffTasks.filter(t => t.success).length / diffTasks.length) * 100;
                if (rate > bestRate) {
                    bestRate = rate;
                    bestModel = modelResult.modelName;
                }
            }
        }
        byDifficulty[diff] = { model: bestModel, successRate: bestRate };
    }
    return {
        bestOverall: sorted[0]?.modelName || 'N/A',
        fastestModel: fastest[0]?.modelName || 'N/A',
        mostAccurate: sorted[0]?.modelName || 'N/A',
        byDifficulty,
    };
}
|
|
636
|
+
/**
 * Render the benchmark report object as a markdown document.
 *
 * Sections: executive summary table, cross-model comparison, per-model
 * detailed results, and (when present) the UAM memory impact analysis.
 *
 * @param {object} report - { timestamp, models, comparison, memoryComparison? }.
 * @returns {string} Complete markdown document.
 */
function generateMarkdownReport(report) {
    const lines = [
        '# Model Integration Benchmark Results',
        '',
        `**Generated:** ${report.timestamp}`,
        `**Models Tested:** ${report.models.map(m => m.modelName).join(', ')}`,
        `**Tasks Run:** ${BENCHMARK_TASKS.length}`,
        '',
        '---',
        '',
        '## Executive Summary',
        '',
        '| Model | Success Rate | Avg Latency | Total Tokens |',
        '|-------|--------------|-------------|--------------|',
    ];
    for (const model of report.models) {
        lines.push(`| ${model.modelName} | ${model.successRate.toFixed(1)}% | ${model.avgLatencyMs}ms | ${model.totalTokens} |`);
    }
    lines.push('', '---', '', '## Comparison', '');
    lines.push(`- **Best Overall:** ${report.comparison.bestOverall}`);
    lines.push(`- **Fastest Model:** ${report.comparison.fastestModel}`);
    lines.push(`- **Most Accurate:** ${report.comparison.mostAccurate}`);
    lines.push('', '### By Difficulty', '');
    lines.push('| Difficulty | Best Model | Success Rate |');
    lines.push('|------------|------------|--------------|');
    for (const [diff, data] of Object.entries(report.comparison.byDifficulty)) {
        lines.push(`| ${diff} | ${data.model} | ${data.successRate.toFixed(1)}% |`);
    }
    lines.push('', '---', '', '## Detailed Results', '');
    for (const model of report.models) {
        lines.push(`### ${model.modelName}`, '');
        lines.push('| Task | Difficulty | Success | Latency | Patterns Matched |');
        lines.push('|------|------------|---------|---------|------------------|');
        for (const result of model.results) {
            const task = BENCHMARK_TASKS.find(t => t.id === result.taskId);
            const status = result.success ? '✓' : '✗';
            const patterns = `${result.matchedPatterns.length}/${task?.expectedPatterns.length || 0}`;
            lines.push(`| ${task?.name || result.taskId} | ${task?.difficulty || 'N/A'} | ${status} | ${result.latencyMs}ms | ${patterns} |`);
        }
        lines.push('');
    }
    // Add memory comparison section if available
    if (report.memoryComparison) {
        lines.push('---', '', '## UAM Memory Impact Analysis', '');
        lines.push('### Success Rate Comparison', '');
        lines.push('| Model | Without Memory | With Memory | Improvement |');
        lines.push('|-------|----------------|-------------|-------------|');
        for (const withMem of report.memoryComparison.withMemory) {
            const without = report.memoryComparison.withoutMemory.find(r => r.modelId === withMem.modelId);
            const imp = report.memoryComparison.improvement[withMem.modelId];
            if (without && imp) {
                const sign = imp.successDelta >= 0 ? '+' : '';
                lines.push(`| ${withMem.modelName} | ${without.successRate.toFixed(1)}% | ${withMem.successRate.toFixed(1)}% | ${sign}${imp.successDelta.toFixed(1)}% |`);
            }
        }
        lines.push('', '### Latency Comparison', '');
        lines.push('| Model | Without Memory | With Memory | Speed Ratio |');
        lines.push('|-------|----------------|-------------|-------------|');
        for (const withMem of report.memoryComparison.withMemory) {
            const without = report.memoryComparison.withoutMemory.find(r => r.modelId === withMem.modelId);
            const imp = report.memoryComparison.improvement[withMem.modelId];
            if (without && imp) {
                const speedLabel = imp.speedupRatio > 1 ? `${imp.speedupRatio.toFixed(2)}x faster` :
                    imp.speedupRatio < 1 ? `${(1 / imp.speedupRatio).toFixed(2)}x slower` : 'same';
                lines.push(`| ${withMem.modelName} | ${without.avgLatencyMs}ms | ${withMem.avgLatencyMs}ms | ${speedLabel} |`);
            }
        }
        lines.push('', '### Key Findings', '');
        // Find best improvement
        const improvements = Object.entries(report.memoryComparison.improvement);
        if (improvements.length > 0) {
            const bestImprovement = improvements.reduce((a, b) => a[1].successDelta > b[1].successDelta ? a : b);
            // BUGFIX: the old code gated this lookup on the unrelated
            // BENCHMARK_TASKS.length and could print "undefined"; fall back to
            // 'N/A' when the model id cannot be resolved.
            const bestModel = report.memoryComparison.withMemory.find(m => m.modelId === bestImprovement[0])?.modelName ?? 'N/A';
            // BUGFIX: only prefix '+' for non-negative deltas so a regression
            // renders as "-2.0%" rather than "+-2.0%".
            const bestSign = bestImprovement[1].successDelta >= 0 ? '+' : '';
            lines.push(`- **Best Memory Benefit:** ${bestModel} (${bestSign}${bestImprovement[1].successDelta.toFixed(1)}% success rate)`);
            const avgImprovement = improvements.reduce((sum, [, imp]) => sum + imp.successDelta, 0) / improvements.length;
            const avgSign = avgImprovement >= 0 ? '+' : '';
            lines.push(`- **Average Improvement:** ${avgSign}${avgImprovement.toFixed(1)}% success rate across all models`);
            lines.push('', '### Interpretation', '');
            lines.push('UAM memory context injection provides models with:');
            lines.push('- Project structure knowledge (file locations, patterns)');
            lines.push('- Coding standards (JSDoc, error handling, async patterns)');
            lines.push('- Common gotchas and lessons learned from previous sessions');
            lines.push('- Design pattern templates (singleton, strategy, factory)');
        }
    }
    lines.push('', '---', '', '**Report Generated by UAM Model Integration Benchmark**');
    return lines.join('\n');
}
|
|
724
|
+
// ============================================================================
// Parallel Execution Utilities
// ============================================================================
/**
 * Run multiple model benchmarks in parallel with configurable concurrency.
 *
 * A shared work queue is drained by up to `concurrency` worker loops, so at
 * most `concurrency` models are being benchmarked at any one time.
 *
 * @param {object} client - DroidExecClient instance used for all API calls
 * @param {Array<{id: string, name: string}>} models - Models to benchmark
 * @param {Array} tasks - Benchmark tasks passed through to each model run
 * @param {boolean} withMemory - Whether UAM memory context is injected
 * @param {number} concurrency - Maximum number of concurrent model runs
 * @returns {Promise<Array>} Per-model results, in the same order as `models`
 */
async function runModelsInParallel(client, models, tasks, withMemory, concurrency) {
    const results = [];
    const queue = [...models];
    // Each worker iteratively drains the shared queue. (The previous version
    // recursed per model, growing the await chain with the queue length; a
    // loop is the idiomatic equivalent with identical scheduling behavior.)
    const worker = async () => {
        for (let model = queue.shift(); model; model = queue.shift()) {
            results.push(await runBenchmarkForModel(client, model, tasks, withMemory));
        }
    };
    // Start one worker per slot, capped by the number of models.
    const workerCount = Math.min(concurrency, models.length);
    const workers = [];
    for (let i = 0; i < workerCount; i++) {
        workers.push(worker());
    }
    await Promise.all(workers);
    // Workers complete in nondeterministic order; restore input model order.
    return models.map(m => results.find(r => r.modelId === m.id)).filter(Boolean);
}
|
|
753
|
+
/**
 * Run the full model integration benchmark across the configured models.
 *
 * Supports two call styles for backward compatibility:
 *   runModelBenchmark(apiKey, modelIds, compareMemory, parallelModels)
 *   runModelBenchmark({ apiKey, modelIds, compareMemory, parallelModels })
 *
 * Phase 1 benchmarks every selected model without UAM memory; when
 * `compareMemory` is enabled, Phase 2 sets up UAM and re-runs every model
 * with memory context injected, then computes per-model deltas. A markdown
 * report is written to MODEL_BENCHMARK_RESULTS.md next to the package root.
 *
 * @param {string|object} [apiKeyOrOptions] - API key string, or options object
 * @param {string[]} [modelIds] - Subset of MODELS ids to test (default: all)
 * @param {boolean} [compareMemory=true] - Also run the with-memory phase
 * @param {number} [parallelModels=1] - Max models benchmarked concurrently
 * @returns {Promise<object>} Report: { timestamp, models, comparison, memoryComparison? }
 * @throws {Error} When no API key is available or no valid models are selected
 */
export async function runModelBenchmark(apiKeyOrOptions, modelIds, compareMemory = true, parallelModels = 1) {
    // Handle both old signature and new options object
    let key;
    let models;
    let compare;
    let parallel;
    if (typeof apiKeyOrOptions === 'object' && apiKeyOrOptions !== null) {
        key = apiKeyOrOptions.apiKey;
        models = apiKeyOrOptions.modelIds;
        compare = apiKeyOrOptions.compareMemory ?? true;
        parallel = apiKeyOrOptions.parallelModels ?? 1;
    }
    else {
        key = apiKeyOrOptions;
        models = modelIds;
        compare = compareMemory;
        parallel = parallelModels;
    }
    // Fall back to environment variables when no explicit key was passed.
    key = key || process.env.FACTORY_API_KEY || process.env.DROID_API_KEY;
    if (!key) {
        throw new Error('FACTORY_API_KEY or DROID_API_KEY not provided and not found in environment');
    }
    const client = new DroidExecClient(key, 'medium');
    const modelsToTest = models
        ? MODELS.filter(m => models.includes(m.id))
        : MODELS;
    if (modelsToTest.length === 0) {
        throw new Error('No valid models specified');
    }
    // Determine effective parallelism (never more workers than models).
    const effectiveParallel = Math.min(parallel, modelsToTest.length);
    const isParallel = effectiveParallel > 1;
    console.log('\n' + '='.repeat(60));
    console.log(' UAM MODEL INTEGRATION BENCHMARK');
    console.log('='.repeat(60));
    console.log(`\nModels: ${modelsToTest.map(m => m.name).join(', ')}`);
    console.log(`Tasks: ${BENCHMARK_TASKS.length}`);
    console.log(`Memory Comparison: ${compare ? 'ENABLED' : 'DISABLED'}`);
    console.log(`Parallel Models: ${effectiveParallel}${isParallel ? ' (ENABLED)' : ' (sequential)'}`);
    let withoutMemoryResults = [];
    let withMemoryResults = [];
    // Phase 1: baseline runs without UAM memory context.
    console.log('\n' + '█'.repeat(60));
    console.log(` PHASE 1: WITHOUT UAM MEMORY${isParallel ? ' (PARALLEL)' : ''}`);
    console.log('█'.repeat(60));
    if (isParallel) {
        console.log(`\n Running ${modelsToTest.length} models with concurrency=${effectiveParallel}...\n`);
        withoutMemoryResults = await runModelsInParallel(client, modelsToTest, BENCHMARK_TASKS, false, effectiveParallel);
    }
    else {
        for (const model of modelsToTest) {
            const result = await runBenchmarkForModel(client, model, BENCHMARK_TASKS, false);
            withoutMemoryResults.push(result);
        }
    }
    // Phase 2 (optional): re-run every model with UAM memory injected.
    if (compare) {
        console.log('\n' + '█'.repeat(60));
        console.log(` PHASE 2: WITH UAM MEMORY${isParallel ? ' (PARALLEL)' : ''}`);
        console.log('█'.repeat(60));
        // Setup UAM before running with-memory tests
        console.log('\n--- Setting up UAM (init, analyze, generate, memory start, prepopulate) ---');
        const uamSetup = await setupUAM(true);
        if (uamSetup.errors.length > 0) {
            console.log('\nUAM Setup warnings:');
            uamSetup.errors.forEach(e => console.log(` - ${e}`));
        }
        console.log(`\nUAM Status:`);
        console.log(` Initialized: ${uamSetup.initialized ? '✓' : '✗'}`);
        console.log(` Memory Started: ${uamSetup.memoryStarted ? '✓' : '✗'}`);
        console.log(` Memory Prepopulated: ${uamSetup.memoryPrepopulated ? '✓' : '✗'}`);
        console.log(` CLAUDE.md Loaded: ${uamSetup.claudeMdLoaded ? '✓' : '✗'}`);
        // Clear cached context to force reload with fresh memory
        cachedMemoryContext = null;
        // Log memory context size
        const memoryContext = getUAMMemoryContext();
        console.log(` Memory Context Size: ${memoryContext.length} chars\n`);
        if (isParallel) {
            console.log(` Running ${modelsToTest.length} models with concurrency=${effectiveParallel}...\n`);
            withMemoryResults = await runModelsInParallel(client, modelsToTest, BENCHMARK_TASKS, true, effectiveParallel);
        }
        else {
            for (const model of modelsToTest) {
                const result = await runBenchmarkForModel(client, model, BENCHMARK_TASKS, true);
                withMemoryResults.push(result);
            }
        }
    }
    // Calculate memory improvement for each model
    const improvement = {};
    if (compare) {
        for (const model of modelsToTest) {
            const without = withoutMemoryResults.find(r => r.modelId === model.id);
            const withMem = withMemoryResults.find(r => r.modelId === model.id);
            if (without && withMem) {
                improvement[model.id] = {
                    successDelta: withMem.successRate - without.successRate,
                    // Guard BOTH denominators: the original only checked
                    // without.avgLatencyMs, so a 0ms with-memory average (e.g.
                    // every call failed instantly) produced an Infinity ratio
                    // that leaked into the report via .toFixed(2).
                    speedupRatio: without.avgLatencyMs > 0 && withMem.avgLatencyMs > 0
                        ? without.avgLatencyMs / withMem.avgLatencyMs
                        : 1,
                };
            }
        }
    }
    // Use with-memory results as primary if available, otherwise without
    const primaryResults = compare && withMemoryResults.length > 0
        ? withMemoryResults
        : withoutMemoryResults;
    const report = {
        timestamp: new Date().toISOString(),
        models: primaryResults,
        comparison: generateComparison(primaryResults),
        memoryComparison: compare ? {
            withMemory: withMemoryResults,
            withoutMemory: withoutMemoryResults,
            improvement,
        } : undefined,
    };
    // Generate and save markdown report
    const markdown = generateMarkdownReport(report);
    const reportPath = join(__dirname, '../../MODEL_BENCHMARK_RESULTS.md');
    writeFileSync(reportPath, markdown);
    console.log(`\nReport saved to: ${reportPath}`);
    // Print summary
    console.log('\n' + '='.repeat(60));
    console.log(' BENCHMARK COMPLETE');
    console.log('='.repeat(60));
    if (compare) {
        console.log('\n--- Without Memory ---');
        for (const model of withoutMemoryResults) {
            console.log(` ${model.modelName}: ${model.successRate.toFixed(1)}% success, ${model.avgLatencyMs}ms avg`);
        }
        console.log('\n--- With UAM Memory ---');
        for (const model of withMemoryResults) {
            console.log(` ${model.modelName}: ${model.successRate.toFixed(1)}% success, ${model.avgLatencyMs}ms avg`);
        }
        console.log('\n--- Memory Improvement ---');
        for (const [modelId, imp] of Object.entries(improvement)) {
            const model = modelsToTest.find(m => m.id === modelId);
            const sign = imp.successDelta >= 0 ? '+' : '';
            console.log(` ${model?.name}: ${sign}${imp.successDelta.toFixed(1)}% success, ${imp.speedupRatio.toFixed(2)}x speed`);
        }
    }
    else {
        console.log('\nSummary:');
        for (const model of primaryResults) {
            console.log(` ${model.modelName}: ${model.successRate.toFixed(1)}% success, ${model.avgLatencyMs}ms avg`);
        }
    }
    console.log(`\nBest Overall: ${report.comparison.bestOverall}`);
    return report;
}
|
|
903
|
+
// CLI entry point: when this file is executed directly (not imported), load
// environment variables from the package-root .env file and run the benchmark
// with default settings.
if (process.argv[1]?.includes('model-integration')) {
    // Minimal .env loader (avoids a runtime dotenv dependency).
    const envPath = join(__dirname, '../../.env');
    if (existsSync(envPath)) {
        const envContent = readFileSync(envPath, 'utf-8');
        for (const rawLine of envContent.split('\n')) {
            const line = rawLine.trim();
            // Skip blank lines and comments; the previous parser registered a
            // variable literally named "#KEY" for commented-out entries.
            if (!line || line.startsWith('#')) {
                continue;
            }
            const [key, ...valueParts] = line.split('=');
            if (key && valueParts.length > 0) {
                // Rejoin in case the value itself contains '=' and strip
                // optional surrounding quotes, matching common dotenv syntax.
                const value = valueParts.join('=').trim().replace(/^(['"])(.*)\1$/, '$2');
                process.env[key.trim()] = value;
            }
        }
    }
    runModelBenchmark()
        .then(() => process.exit(0))
        .catch(err => {
            console.error('Benchmark failed:', err);
            process.exit(1);
        });
}
|
|
922
|
+
export { MODELS, BENCHMARK_TASKS, setupUAM, loadUAMMemoryContext };
|
|
923
|
+
//# sourceMappingURL=model-integration.js.map
|