universal-agent-memory 6.1.1 → 6.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generators/claude-md.js +1 -8
- package/dist/generators/claude-md.js.map +1 -1
- package/package.json +4 -10
- package/templates/CLAUDE.template.md +303 -100
- package/scripts/README.md +0 -161
- package/scripts/generate-comparison-report.ts +0 -461
- package/scripts/install-desktop.sh +0 -105
- package/scripts/install-web.sh +0 -73
- package/scripts/run-full-benchmark.sh +0 -413
- package/scripts/run-hybrid-adaptive-tbench.sh +0 -252
- package/scripts/run-terminal-bench.sh +0 -302
- package/scripts/run-uam-benchmark.sh +0 -72
- package/scripts/setup.sh +0 -337
|
@@ -1,461 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env npx tsx
|
|
2
|
-
/**
|
|
3
|
-
* Terminal-Bench 2.0 Comparison Report Generator
|
|
4
|
-
*
|
|
5
|
-
* Parses Harbor result.json files from baseline and UAM benchmark runs,
|
|
6
|
-
* computes per-model deltas, category breakdowns, and task-level diffs.
|
|
7
|
-
*
|
|
8
|
-
* Usage:
|
|
9
|
-
* npx tsx scripts/generate-comparison-report.ts \
|
|
10
|
-
* --baseline benchmark-results/baseline_opus45_<ts> \
|
|
11
|
-
* --uam benchmark-results/uam_opus45_<ts> \
|
|
12
|
-
* --baseline benchmark-results/baseline_gpt52_<ts> \
|
|
13
|
-
* --uam benchmark-results/uam_gpt52_<ts> \
|
|
14
|
-
* --output benchmark-results/FULL_COMPARISON_<ts>.md \
|
|
15
|
-
* --timestamp <ts>
|
|
16
|
-
*/
|
|
17
|
-
|
|
18
|
-
import { readFileSync, writeFileSync, existsSync, readdirSync } from 'fs';
|
|
19
|
-
import { join, basename } from 'path';
|
|
20
|
-
|
|
21
|
-
// ============================================================================
|
|
22
|
-
// Types
|
|
23
|
-
// ============================================================================
|
|
24
|
-
|
|
25
|
-
/**
 * Shape of a Harbor `result.json` file — only the fields this script reads.
 */
interface HarborResult {
  id: string;
  started_at: string;
  finished_at: string | null;
  n_total_trials: number;
  stats: {
    n_trials: number;
    n_errors: number;
    // Keyed by eval name, e.g. "agent__model__dataset" (see extractModelFromEvalKey).
    evals: Record<string, {
      n_trials: number;
      n_errors: number;
      metrics: Array<{ mean: number }>;
      reward_stats: {
        // Maps a reward value ("1.0" = pass, "0.0" = fail) to the trial ids
        // that received that reward.
        reward: Record<string, string[]>;
      };
    }>;
  };
}

/** Pass/fail outcome of a single benchmark trial. */
interface TaskStatus {
  taskName: string; // task portion of the trial id (text before the first "__")
  passed: boolean;
  trialId: string; // full Harbor trial id
}

/** Aggregated outcome of one benchmark run (one result directory). */
interface RunSummary {
  jobName: string; // basename of the result directory
  model: string; // resolved model name (from eval key or job name)
  config: 'baseline' | 'uam';
  totalTrials: number;
  errors: number;
  passed: TaskStatus[];
  failed: TaskStatus[];
  passRate: number; // percentage in [0, 100]
}

/** Baseline-vs-UAM comparison for a single model. */
interface ModelComparison {
  model: string;
  baseline: RunSummary | null; // null when no baseline run was found for the model
  uam: RunSummary | null; // null when no UAM run was found for the model
  uamWins: string[]; // tasks that pass only with UAM
  baselineWins: string[]; // tasks that pass only without UAM
  bothPass: string[];
  bothFail: string[];
  delta: number; // uam passRate minus baseline passRate, percentage points
}
|
|
71
|
-
|
|
72
|
-
// ============================================================================
|
|
73
|
-
// Parse CLI args
|
|
74
|
-
// ============================================================================
|
|
75
|
-
|
|
76
|
-
function parseArgs(): { baselineDirs: string[]; uamDirs: string[]; output: string; timestamp: string } {
|
|
77
|
-
const args = process.argv.slice(2);
|
|
78
|
-
const baselineDirs: string[] = [];
|
|
79
|
-
const uamDirs: string[] = [];
|
|
80
|
-
let output = '';
|
|
81
|
-
let timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
82
|
-
|
|
83
|
-
for (let i = 0; i < args.length; i++) {
|
|
84
|
-
switch (args[i]) {
|
|
85
|
-
case '--baseline': baselineDirs.push(args[++i]); break;
|
|
86
|
-
case '--uam': uamDirs.push(args[++i]); break;
|
|
87
|
-
case '--output': output = args[++i]; break;
|
|
88
|
-
case '--timestamp': timestamp = args[++i]; break;
|
|
89
|
-
case '--help':
|
|
90
|
-
console.log('Usage: npx tsx generate-comparison-report.ts --baseline <dir> --uam <dir> [--output <file>] [--timestamp <ts>]');
|
|
91
|
-
process.exit(0);
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
if (baselineDirs.length === 0 && uamDirs.length === 0) {
|
|
96
|
-
console.error('Error: Provide at least one --baseline or --uam directory');
|
|
97
|
-
process.exit(1);
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
if (!output) {
|
|
101
|
-
output = `benchmark-results/FULL_COMPARISON_${timestamp}.md`;
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
return { baselineDirs, uamDirs, output, timestamp };
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
// ============================================================================
|
|
108
|
-
// Parse Harbor results
|
|
109
|
-
// ============================================================================
|
|
110
|
-
|
|
111
|
-
function extractModelFromJobName(jobName: string): string {
|
|
112
|
-
// Job names follow pattern: (baseline|uam)_<model_short>_<timestamp>
|
|
113
|
-
// e.g. baseline_opus45_20260213_120000, uam_gpt52_20260213_120000
|
|
114
|
-
// Also handles legacy names like uam_v200_optb_full89, opus45_baseline_no_uam
|
|
115
|
-
const modelAliases: Record<string, string> = {
|
|
116
|
-
opus45: 'claude-opus-4-5',
|
|
117
|
-
opus_4_5: 'claude-opus-4-5',
|
|
118
|
-
'claude-opus': 'claude-opus-4-5',
|
|
119
|
-
gpt52: 'gpt-5.2-codex',
|
|
120
|
-
'gpt-5': 'gpt-5.2-codex',
|
|
121
|
-
glm47: 'glm-4.7',
|
|
122
|
-
'glm-4': 'glm-4.7',
|
|
123
|
-
};
|
|
124
|
-
|
|
125
|
-
for (const [alias, fullName] of Object.entries(modelAliases)) {
|
|
126
|
-
if (jobName.includes(alias)) return fullName;
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
// For UAM version runs without model in name, default to Opus 4.5 (most common)
|
|
130
|
-
if (/^uam_v\d+/.test(jobName)) return 'claude-opus-4-5';
|
|
131
|
-
|
|
132
|
-
return 'unknown';
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
function extractModelFromEvalKey(evalKey: string): string {
|
|
136
|
-
// Format can be: agent__model__dataset (3 parts) or agent__dataset (2 parts)
|
|
137
|
-
const parts = evalKey.split('__');
|
|
138
|
-
if (parts.length >= 3) return parts[1];
|
|
139
|
-
return '';
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
function parseResultDir(dir: string, config: 'baseline' | 'uam'): RunSummary | null {
|
|
143
|
-
const resultPath = join(dir, 'result.json');
|
|
144
|
-
if (!existsSync(resultPath)) {
|
|
145
|
-
console.warn(` Warning: ${resultPath} not found`);
|
|
146
|
-
return null;
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
const data: HarborResult = JSON.parse(readFileSync(resultPath, 'utf-8'));
|
|
150
|
-
const jobName = basename(dir);
|
|
151
|
-
|
|
152
|
-
const evalKeys = Object.keys(data.stats.evals);
|
|
153
|
-
if (evalKeys.length === 0) {
|
|
154
|
-
console.warn(` Warning: No evals in ${resultPath}`);
|
|
155
|
-
return null;
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
const evalKey = evalKeys[0];
|
|
159
|
-
// Try model from eval key first, fall back to job name
|
|
160
|
-
const model = extractModelFromEvalKey(evalKey) || extractModelFromJobName(jobName);
|
|
161
|
-
const evalData = data.stats.evals[evalKey];
|
|
162
|
-
|
|
163
|
-
const rewards = evalData.reward_stats?.reward || {};
|
|
164
|
-
const passedTrials = rewards['1.0'] || [];
|
|
165
|
-
const failedTrials = rewards['0.0'] || [];
|
|
166
|
-
|
|
167
|
-
const passed: TaskStatus[] = passedTrials.map((t: string) => ({
|
|
168
|
-
taskName: t.split('__')[0],
|
|
169
|
-
passed: true,
|
|
170
|
-
trialId: t,
|
|
171
|
-
}));
|
|
172
|
-
|
|
173
|
-
const failed: TaskStatus[] = failedTrials.map((t: string) => ({
|
|
174
|
-
taskName: t.split('__')[0],
|
|
175
|
-
passed: false,
|
|
176
|
-
trialId: t,
|
|
177
|
-
}));
|
|
178
|
-
|
|
179
|
-
const total = passed.length + failed.length;
|
|
180
|
-
const passRate = total > 0 ? (passed.length / total) * 100 : 0;
|
|
181
|
-
|
|
182
|
-
return {
|
|
183
|
-
jobName,
|
|
184
|
-
model,
|
|
185
|
-
config,
|
|
186
|
-
totalTrials: data.stats.n_trials,
|
|
187
|
-
errors: data.stats.n_errors,
|
|
188
|
-
passed,
|
|
189
|
-
failed,
|
|
190
|
-
passRate,
|
|
191
|
-
};
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
function extractTaskNames(tasks: TaskStatus[]): Set<string> {
|
|
195
|
-
return new Set(tasks.map(t => t.taskName));
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
// ============================================================================
|
|
199
|
-
// Build comparisons
|
|
200
|
-
// ============================================================================
|
|
201
|
-
|
|
202
|
-
function buildModelComparison(baseline: RunSummary | null, uam: RunSummary | null): ModelComparison {
|
|
203
|
-
const model = baseline?.model || uam?.model || 'unknown';
|
|
204
|
-
|
|
205
|
-
const bPassed = baseline ? extractTaskNames(baseline.passed) : new Set<string>();
|
|
206
|
-
const bFailed = baseline ? extractTaskNames(baseline.failed) : new Set<string>();
|
|
207
|
-
const uPassed = uam ? extractTaskNames(uam.passed) : new Set<string>();
|
|
208
|
-
const uFailed = uam ? extractTaskNames(uam.failed) : new Set<string>();
|
|
209
|
-
|
|
210
|
-
const uamWins = [...uPassed].filter(t => !bPassed.has(t)).sort();
|
|
211
|
-
const baselineWins = [...bPassed].filter(t => !uPassed.has(t)).sort();
|
|
212
|
-
const bothPass = [...bPassed].filter(t => uPassed.has(t)).sort();
|
|
213
|
-
const bothFail = [...bFailed].filter(t => uFailed.has(t)).sort();
|
|
214
|
-
|
|
215
|
-
const bRate = baseline?.passRate || 0;
|
|
216
|
-
const uRate = uam?.passRate || 0;
|
|
217
|
-
const delta = uRate - bRate;
|
|
218
|
-
|
|
219
|
-
return { model, baseline, uam, uamWins, baselineWins, bothPass, bothFail, delta };
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
// ============================================================================
|
|
223
|
-
// Binomial test (approximate)
|
|
224
|
-
// ============================================================================
|
|
225
|
-
|
|
226
|
-
function binomialPValue(wins: number, losses: number): string {
|
|
227
|
-
const n = wins + losses;
|
|
228
|
-
if (n === 0) return 'N/A';
|
|
229
|
-
// Simple sign test approximation
|
|
230
|
-
const p = Math.min(wins, losses);
|
|
231
|
-
// Use normal approximation for binomial test
|
|
232
|
-
const expected = n / 2;
|
|
233
|
-
const stddev = Math.sqrt(n * 0.25);
|
|
234
|
-
if (stddev === 0) return 'N/A';
|
|
235
|
-
const z = Math.abs(p - expected) / stddev;
|
|
236
|
-
// Rough 2-sided p-value from z-score
|
|
237
|
-
if (z < 1.645) return '>0.10';
|
|
238
|
-
if (z < 1.96) return '<0.10';
|
|
239
|
-
if (z < 2.576) return '<0.05';
|
|
240
|
-
return '<0.01';
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
// ============================================================================
|
|
244
|
-
// Generate markdown report
|
|
245
|
-
// ============================================================================
|
|
246
|
-
|
|
247
|
-
/**
 * Render the full markdown comparison report.
 *
 * Sections: header metadata, an executive-summary table, per-model detail
 * (win/loss lists and a full task table), a cross-model analysis when more
 * than one model is present, and a methodology footer.
 *
 * @param comparisons one entry per model; caller is expected to sort them
 * @param timestamp   benchmark run id embedded in the header
 * @returns the report as a single markdown string (no trailing newline)
 */
function generateReport(
  comparisons: ModelComparison[],
  timestamp: string,
): string {
  const lines: string[] = [];

  // Report header metadata.
  lines.push('# Terminal-Bench 2.0 Full Comparison: UAM v3.1.0 vs Baseline');
  lines.push('');
  lines.push(`**Generated:** ${new Date().toISOString()}`);
  lines.push(`**Dataset:** Terminal-Bench 2.0 (89 tasks)`);
  lines.push(`**UAM Version:** 3.1.0`);
  lines.push(`**Benchmark ID:** ${timestamp}`);
  lines.push('');

  // Executive summary
  lines.push('## Executive Summary');
  lines.push('');
  lines.push('| Model | Baseline | UAM | Delta | UAM Wins | Baseline Wins | p-value |');
  lines.push('|-------|----------|-----|-------|----------|---------------|---------|');

  for (const c of comparisons) {
    // Each side renders as "rate% (passed/total)"; 'N/A' when that run is missing.
    const bRate = c.baseline ? `${c.baseline.passRate.toFixed(1)}% (${c.baseline.passed.length}/${c.baseline.passed.length + c.baseline.failed.length})` : 'N/A';
    const uRate = c.uam ? `${c.uam.passRate.toFixed(1)}% (${c.uam.passed.length}/${c.uam.passed.length + c.uam.failed.length})` : 'N/A';
    const delta = c.baseline && c.uam ? `${c.delta >= 0 ? '+' : ''}${c.delta.toFixed(1)}%` : 'N/A';
    const pval = binomialPValue(c.uamWins.length, c.baselineWins.length);
    lines.push(`| ${c.model} | ${bRate} | ${uRate} | **${delta}** | ${c.uamWins.length} | ${c.baselineWins.length} | ${pval} |`);
  }

  lines.push('');

  // Aggregate stats across every model.
  const totalUamWins = comparisons.reduce((s, c) => s + c.uamWins.length, 0);
  const totalBaselineWins = comparisons.reduce((s, c) => s + c.baselineWins.length, 0);
  const netTasks = totalUamWins - totalBaselineWins;

  lines.push(`**Across all models:** UAM wins ${totalUamWins} tasks, Baseline wins ${totalBaselineWins} tasks, Net: ${netTasks >= 0 ? '+' : ''}${netTasks} tasks for UAM.`);
  lines.push('');

  // Per-model detailed sections
  for (const c of comparisons) {
    lines.push(`---`);
    lines.push('');
    lines.push(`## ${c.model}`);
    lines.push('');

    if (c.baseline) {
      lines.push(`- **Baseline:** ${c.baseline.passRate.toFixed(1)}% (${c.baseline.passed.length} passed, ${c.baseline.failed.length} failed, ${c.baseline.errors} errors)`);
    }
    if (c.uam) {
      lines.push(`- **UAM:** ${c.uam.passRate.toFixed(1)}% (${c.uam.passed.length} passed, ${c.uam.failed.length} failed, ${c.uam.errors} errors)`);
    }
    if (c.baseline && c.uam) {
      lines.push(`- **Net Delta:** ${c.delta >= 0 ? '+' : ''}${c.delta.toFixed(1)}% (${c.uamWins.length - c.baselineWins.length >= 0 ? '+' : ''}${c.uamWins.length - c.baselineWins.length} tasks)`);
    }
    lines.push('');

    // UAM wins
    if (c.uamWins.length > 0) {
      lines.push('### Tasks UAM Wins (pass with UAM, fail without)');
      lines.push('');
      for (const t of c.uamWins) {
        lines.push(`- \`${t}\``);
      }
      lines.push('');
    }

    // Baseline wins
    if (c.baselineWins.length > 0) {
      lines.push('### Tasks Baseline Wins (pass without UAM, fail with)');
      lines.push('');
      for (const t of c.baselineWins) {
        lines.push(`- \`${t}\``);
      }
      lines.push('');
    }

    // Full task-level diff table — only when both sides ran.
    if (c.baseline && c.uam) {
      const allTasks = new Set([
        ...c.baseline.passed.map(t => t.taskName),
        ...c.baseline.failed.map(t => t.taskName),
        ...c.uam.passed.map(t => t.taskName),
        ...c.uam.failed.map(t => t.taskName),
      ]);

      const bPassSet = extractTaskNames(c.baseline.passed);
      const uPassSet = extractTaskNames(c.uam.passed);

      lines.push('### Full Task Comparison');
      lines.push('');
      lines.push('| Task | Baseline | UAM | Delta |');
      lines.push('|------|----------|-----|-------|');

      for (const t of [...allTasks].sort()) {
        const bStatus = bPassSet.has(t) ? 'PASS' : 'FAIL';
        const uStatus = uPassSet.has(t) ? 'PASS' : 'FAIL';
        let delta = '=';
        if (bStatus === 'FAIL' && uStatus === 'PASS') delta = '**+UAM**';
        if (bStatus === 'PASS' && uStatus === 'FAIL') delta = '**-UAM**';
        lines.push(`| ${t} | ${bStatus} | ${uStatus} | ${delta} |`);
      }
      lines.push('');
    }
  }

  // Cross-model analysis
  if (comparisons.length > 1) {
    lines.push('---');
    lines.push('');
    lines.push('## Cross-Model Analysis');
    lines.push('');

    // Which tasks does UAM help consistently across models?
    const uamWinSets = comparisons.map(c => new Set(c.uamWins));
    const baselineWinSets = comparisons.map(c => new Set(c.baselineWins));

    if (uamWinSets.length >= 2) {
      // Intersection across all models' win sets, seeded from the first set.
      const consistentUamWins = [...uamWinSets[0]].filter(t => uamWinSets.every(s => s.has(t)));
      const consistentBaselineWins = [...baselineWinSets[0]].filter(t => baselineWinSets.every(s => s.has(t)));

      if (consistentUamWins.length > 0) {
        lines.push(`**Tasks where UAM helps across ALL models:** ${consistentUamWins.join(', ')}`);
        lines.push('');
      }
      if (consistentBaselineWins.length > 0) {
        lines.push(`**Tasks where UAM hurts across ALL models:** ${consistentBaselineWins.join(', ')}`);
        lines.push('');
      }
    }

    // Which model benefits most from UAM?
    // NOTE: the literal "1." on every item is intentional — markdown
    // renumbers repeated "1." items into an ordered list.
    const sorted = [...comparisons].sort((a, b) => b.delta - a.delta);
    lines.push('**Model benefit ranking (most to least improvement from UAM):**');
    lines.push('');
    for (const c of sorted) {
      lines.push(`1. **${c.model}**: ${c.delta >= 0 ? '+' : ''}${c.delta.toFixed(1)}% (${c.uamWins.length} wins, ${c.baselineWins.length} losses)`);
    }
    lines.push('');
  }

  // Methodology
  lines.push('---');
  lines.push('');
  lines.push('## Methodology');
  lines.push('');
  lines.push('- **Baseline:** `harbor run` with `--ak "system_prompt="` to clear UAM context');
  lines.push('- **UAM:** `harbor run` with default CLAUDE.md and UAM memory system active');
  lines.push('- **Dataset:** Terminal-Bench 2.0 (89 tasks across systems, ML, security, algorithms)');
  lines.push('- **Scoring:** Binary pass/fail per task based on Harbor reward (1.0 = pass, 0.0 = fail)');
  lines.push('- **Statistical test:** Sign test on UAM-wins vs Baseline-wins (binomial, 2-sided)');
  lines.push('');
  lines.push('---');
  lines.push(`*Report generated by \`scripts/generate-comparison-report.ts\` at ${new Date().toISOString()}*`);

  return lines.join('\n');
}
|
|
403
|
-
|
|
404
|
-
// ============================================================================
|
|
405
|
-
// Main
|
|
406
|
-
// ============================================================================
|
|
407
|
-
|
|
408
|
-
function main(): void {
|
|
409
|
-
const { baselineDirs, uamDirs, output, timestamp } = parseArgs();
|
|
410
|
-
|
|
411
|
-
console.log('Parsing benchmark results...');
|
|
412
|
-
|
|
413
|
-
const baselineRuns: RunSummary[] = [];
|
|
414
|
-
const uamRuns: RunSummary[] = [];
|
|
415
|
-
|
|
416
|
-
for (const dir of baselineDirs) {
|
|
417
|
-
const run = parseResultDir(dir, 'baseline');
|
|
418
|
-
if (run) {
|
|
419
|
-
baselineRuns.push(run);
|
|
420
|
-
console.log(` Baseline: ${run.model} - ${run.passRate.toFixed(1)}% (${run.passed.length}/${run.passed.length + run.failed.length})`);
|
|
421
|
-
}
|
|
422
|
-
}
|
|
423
|
-
|
|
424
|
-
for (const dir of uamDirs) {
|
|
425
|
-
const run = parseResultDir(dir, 'uam');
|
|
426
|
-
if (run) {
|
|
427
|
-
uamRuns.push(run);
|
|
428
|
-
console.log(` UAM: ${run.model} - ${run.passRate.toFixed(1)}% (${run.passed.length}/${run.passed.length + run.failed.length})`);
|
|
429
|
-
}
|
|
430
|
-
}
|
|
431
|
-
|
|
432
|
-
// Match baseline and UAM runs by model
|
|
433
|
-
const modelSet = new Set([
|
|
434
|
-
...baselineRuns.map(r => r.model),
|
|
435
|
-
...uamRuns.map(r => r.model),
|
|
436
|
-
]);
|
|
437
|
-
|
|
438
|
-
const comparisons: ModelComparison[] = [];
|
|
439
|
-
|
|
440
|
-
for (const model of modelSet) {
|
|
441
|
-
const baseline = baselineRuns.find(r => r.model === model) || null;
|
|
442
|
-
const uam = uamRuns.find(r => r.model === model) || null;
|
|
443
|
-
comparisons.push(buildModelComparison(baseline, uam));
|
|
444
|
-
}
|
|
445
|
-
|
|
446
|
-
// Sort by model name for consistent output
|
|
447
|
-
comparisons.sort((a, b) => a.model.localeCompare(b.model));
|
|
448
|
-
|
|
449
|
-
// Generate report
|
|
450
|
-
const report = generateReport(comparisons, timestamp);
|
|
451
|
-
writeFileSync(output, report + '\n');
|
|
452
|
-
console.log(`\nReport written to: ${output}`);
|
|
453
|
-
console.log(`Models compared: ${comparisons.length}`);
|
|
454
|
-
|
|
455
|
-
for (const c of comparisons) {
|
|
456
|
-
const sym = c.delta >= 0 ? '+' : '';
|
|
457
|
-
console.log(` ${c.model}: ${sym}${c.delta.toFixed(1)}% (UAM wins ${c.uamWins.length}, Baseline wins ${c.baselineWins.length})`);
|
|
458
|
-
}
|
|
459
|
-
}
|
|
460
|
-
|
|
461
|
-
main();
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
# Desktop installer for universal-agent-memory: verifies Node/npm (and
# optionally Docker), installs the CLI from npm (falling back to a GitHub
# clone + npm link), then prints setup guidance.
set -euo pipefail

# Colors
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

REPO_URL="https://github.com/DammianMiller/universal-agent-memory"

echo -e "${GREEN}Universal Agent Memory - Desktop Installation${NC}"
echo "============================================"
echo ""

# Check for Node.js
if ! command -v node &> /dev/null; then
  echo -e "${RED}Error: Node.js is not installed${NC}"
  echo "Please install Node.js 18+ from https://nodejs.org/"
  exit 1
fi

# Major version only, e.g. "v18.19.0" -> 18.
NODE_VERSION=$(node -v | cut -d'v' -f2 | cut -d'.' -f1)
if [ "$NODE_VERSION" -lt 18 ]; then
  echo -e "${RED}Error: Node.js 18+ required (you have $(node -v))${NC}"
  exit 1
fi

echo -e "${GREEN}✓${NC} Node.js $(node -v) detected"

# Check for npm
if ! command -v npm &> /dev/null; then
  echo -e "${RED}Error: npm is not installed${NC}"
  exit 1
fi

echo -e "${GREEN}✓${NC} npm $(npm -v) detected"

# Check for Docker (optional — enables a local Qdrant backend)
if command -v docker &> /dev/null; then
  echo -e "${GREEN}✓${NC} Docker detected - local Qdrant available"
  DOCKER_AVAILABLE=true
else
  echo -e "${YELLOW}⚠${NC} Docker not found - will use cloud backends only"
  DOCKER_AVAILABLE=false
fi

# Install the CLI globally
echo ""
echo "Installing universal-agent-memory..."

# Try npm install first, fall back to git clone if package not published yet
if npm install -g universal-agent-memory 2>/dev/null; then
  echo -e "${GREEN}✓${NC} Installed from npm registry"
else
  echo -e "${YELLOW}Package not yet on npm, installing from GitHub...${NC}"

  # Install to user's local directory
  INSTALL_DIR="${HOME}/.universal-agent-memory"

  # Remove old installation if exists
  if [ -d "$INSTALL_DIR" ]; then
    echo "Removing previous installation..."
    rm -rf "$INSTALL_DIR"
  fi

  # Clone and install
  git clone --depth 1 "$REPO_URL.git" "$INSTALL_DIR"
  cd "$INSTALL_DIR"
  npm install --production=false
  npm run build
  npm link

  echo -e "${GREEN}✓${NC} Installed from GitHub to $INSTALL_DIR"
fi

echo ""
echo -e "${GREEN}Installation complete!${NC}"
echo ""
# Single coherent next-steps list. (The previous version printed a complete
# 1-3 list and then a second, conflicting "2."/"3." block after it.)
echo "Next steps:"
echo "  1. Initialize UAM in your project:"
echo "     $ cd /path/to/your/project"
echo "     $ uam init"
echo ""
if [ "$DOCKER_AVAILABLE" = true ]; then
  echo "  2. Start local memory services (optional):"
  echo "     $ uam memory start"
  echo ""
  echo "     Or use cloud backends:"
else
  echo "  2. Configure cloud memory backends:"
fi
echo "     - GitHub:       export GITHUB_TOKEN=your_token"
echo "     - Qdrant Cloud: export QDRANT_API_KEY=your_key && export QDRANT_URL=your_url"
echo ""
echo "  3. Review the generated CLAUDE.md (regenerate anytime with: uam generate)"
echo ""
echo "  4. Start working - your AI assistant will follow the workflows!"
echo ""
echo "Documentation: ${REPO_URL}#readme"
|
package/scripts/install-web.sh
DELETED
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
# Web-platform installer for universal-agent-memory: verifies Node/npm,
# installs the CLI from npm (falling back to a GitHub clone + npm link),
# and initializes the current project with `uam init --web --interactive`.
# Strict mode, matching install-desktop.sh.
set -euo pipefail

REPO_URL="https://github.com/DammianMiller/universal-agent-memory"

echo "🚀 Universal Agent Memory - Web Platform Setup"
echo ""

# Check for required tools
if ! command -v node &> /dev/null; then
  echo "❌ Node.js is required. Install from https://nodejs.org"
  exit 1
fi

if ! command -v npm &> /dev/null; then
  echo "❌ npm is required. Install from https://nodejs.org"
  exit 1
fi

echo "✅ Node.js $(node -v) detected"
echo "✅ npm $(npm -v) detected"

# Install CLI globally
echo ""
echo "📦 Installing universal-agent-memory CLI..."

# Try npm install first, fall back to git clone if package not published yet
if npm install -g universal-agent-memory 2>/dev/null; then
  echo "✅ Installed from npm registry"
else
  echo "⚠️ Package not yet on npm, installing from GitHub..."

  # Install to user's local directory
  INSTALL_DIR="${HOME}/.universal-agent-memory"

  # Remove old installation if exists
  if [ -d "$INSTALL_DIR" ]; then
    echo "Removing previous installation..."
    rm -rf "$INSTALL_DIR"
  fi

  # Clone and install
  git clone --depth 1 "$REPO_URL.git" "$INSTALL_DIR"
  cd "$INSTALL_DIR"
  npm install --production=false
  npm run build
  npm link

  echo "✅ Installed from GitHub to $INSTALL_DIR"
fi

# Initialize in current directory
echo ""
echo "⚙️ Initializing project..."
uam init --web --interactive

echo ""
echo "✅ Setup complete!"
echo ""
# The project was already initialized above, so do NOT tell the user to run
# `uam init` again (the previous version did).
echo "Next steps:"
echo "  1. Review the generated CLAUDE.md"
echo ""
echo "  2. Start working - your AI assistant will follow the workflows!"
echo ""
echo "Optional: Set up cloud memory backends"
echo "  export GITHUB_TOKEN=your_token"
echo "  export QDRANT_API_KEY=your_key"
echo "  export QDRANT_URL=your_url"
echo ""
echo "Documentation: ${REPO_URL}#readme"
|