universal-agent-memory 6.1.1 → 6.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scripts/README.md DELETED
@@ -1,161 +0,0 @@
1
- # Setup Scripts
2
-
3
- This directory contains automated setup and installation scripts for UAM.
4
-
5
- ## Scripts
6
-
7
- ### `setup.sh` - Complete Setup
8
-
9
- ```bash
10
- npm run setup
11
- ```
12
-
13
- Performs a comprehensive setup including:
14
-
15
- - ✅ Dependency checking (Node.js, npm, git, npx)
16
- - ✅ Optional dependency recommendations (Docker, Python, pre-commit)
17
- - ✅ npm install (if node_modules missing)
18
- - ✅ TypeScript build
19
- - ✅ Git hooks configuration:
20
- - `pre-commit` - Secrets detection, linting
21
- - `commit-msg` - Conventional commits validation
22
- - `pre-push` - Test execution before push
23
- - ✅ GitHub PR template (if gh CLI available)
24
-
25
- ### `install-web.sh` - Web Platform Setup
26
-
27
- ```bash
28
- npm run install:web
29
- ```
30
-
31
- Installs UAM for web platform usage (claude.ai, Factory.AI):
32
-
33
- - Installs CLI globally or from GitHub
34
- - Initializes web platform configuration
35
- - Sets up for web-based AI assistants
36
-
37
- ### `install-desktop.sh` - Desktop Setup
38
-
39
- ```bash
40
- npm run install:desktop
41
- ```
42
-
43
- Installs UAM for desktop usage:
44
-
45
- - Installs CLI globally or from GitHub
46
- - Detects Docker for local Qdrant
47
- - Initializes desktop platform configuration
48
- - Provides setup guidance
49
-
50
- ## Usage
51
-
52
- ### Quick Setup
53
-
54
- ```bash
55
- # Install UAM globally
56
- npm install -g universal-agent-memory
57
-
58
- # Run comprehensive setup
59
- npm run setup
60
-
61
- # Initialize in your project
62
- uam init
63
- ```
64
-
65
- ### Platform-Specific Setup
66
-
67
- ```bash
68
- # For web platforms (claude.ai, Factory.AI)
69
- npm run install:web
70
-
71
- # For desktop (Claude Code, opencode)
72
- npm run install:desktop
73
- ```
74
-
75
- ## Git Hooks
76
-
77
- The `setup.sh` script configures three git hooks:
78
-
79
- ### Pre-commit Hook
80
-
81
- - **Purpose**: Prevent secrets from being committed
82
- - **Checks**:
83
- - Scans for API keys, passwords, tokens in code
84
- - Runs linter with zero warnings allowed
85
- - **Bypass**: `git commit --no-verify`
86
-
87
- ### Commit-msg Hook
88
-
89
- - **Purpose**: Enforce conventional commits format
90
- - **Validates**: `type(scope): description` format
91
- - **Types**: feat, fix, docs, style, refactor, test, chore, perf, ci, build, revert
92
- - **Bypass**: Confirm with 'y' when prompted
93
-
94
- ### Pre-push Hook
95
-
96
- - **Purpose**: Ensure tests pass before pushing
97
- - **Runs**: `npm test`
98
- - **Bypass**: None (tests must pass)
99
-
100
- ## Troubleshooting
101
-
102
- ### Hooks not executing
103
-
104
- ```bash
105
- # Make hooks executable
106
- chmod +x .git/hooks/*
107
-
108
- # Verify hooks exist
109
- ls -la .git/hooks/ | grep -v sample
110
- ```
111
-
112
- ### Setup script fails
113
-
114
- ```bash
115
- # Check Node.js version
116
- node --version # Should be >= 18.0.0
117
-
118
- # Check npm
119
- npm --version
120
-
121
- # Clear and reinstall
122
- rm -rf node_modules package-lock.json
123
- npm install
124
- ```
125
-
126
- ### Manual hook installation
127
-
128
- If automatic setup fails, manually create hooks:
129
-
130
- ```bash
131
- # Pre-commit
132
- cat > .git/hooks/pre-commit << 'EOF'
133
- #!/bin/bash
134
- npm run lint -- --max-warnings=0
135
- exit $?
136
- EOF
137
- chmod +x .git/hooks/pre-commit
138
-
139
- # Commit-msg
140
- cat > .git/hooks/commit-msg << 'EOF'
141
- #!/bin/bash
142
- # Conventional commits validation
143
- exit 0
144
- EOF
145
- chmod +x .git/hooks/commit-msg
146
- ```
147
-
148
- ## Best Practices
149
-
150
- 1. **Always run `npm run setup`** after cloning or updating UAM
151
- 2. **Review generated hooks** before committing
152
- 3. **Keep hooks in sync** with project requirements
153
- 4. **Document custom hooks** in project README
154
- 5. **Test hooks** with `git commit --no-verify` first
155
-
156
- ## Security Notes
157
-
158
- - Git hooks run locally and cannot access remote repositories
159
- - Pre-commit hook only scans TypeScript/JavaScript/JSON files
160
- - Secrets detection is best-effort (not exhaustive)
161
- - Always use environment variables for sensitive data
@@ -1,461 +0,0 @@
1
- #!/usr/bin/env npx tsx
2
- /**
3
- * Terminal-Bench 2.0 Comparison Report Generator
4
- *
5
- * Parses Harbor result.json files from baseline and UAM benchmark runs,
6
- * computes per-model deltas, category breakdowns, and task-level diffs.
7
- *
8
- * Usage:
9
- * npx tsx scripts/generate-comparison-report.ts \
10
- * --baseline benchmark-results/baseline_opus45_<ts> \
11
- * --uam benchmark-results/uam_opus45_<ts> \
12
- * --baseline benchmark-results/baseline_gpt52_<ts> \
13
- * --uam benchmark-results/uam_gpt52_<ts> \
14
- * --output benchmark-results/FULL_COMPARISON_<ts>.md \
15
- * --timestamp <ts>
16
- */
17
-
18
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from 'fs';
import { basename, dirname, join } from 'path';
20
-
21
- // ============================================================================
22
- // Types
23
- // ============================================================================
24
-
25
/**
 * Subset of a Harbor `result.json` document that this script reads.
 */
interface HarborResult {
  id: string;
  started_at: string;
  finished_at: string | null;
  n_total_trials: number;
  stats: {
    n_trials: number;
    n_errors: number;
    /** Per-eval statistics, keyed by eval key (`agent__model__dataset` or `agent__dataset`). */
    evals: Record<string, {
      n_trials: number;
      n_errors: number;
      metrics: Array<{ mean: number }>;
      /**
       * Trial ids grouped by reward value (the script reads the `'1.0'` and
       * `'0.0'` buckets). Optional because the parser tolerates its absence.
       */
      reward_stats?: {
        reward: Record<string, string[]>;
      };
    }>;
  };
}

/** Outcome of a single trial, reduced to the task it belongs to. */
interface TaskStatus {
  /** Task name: the part of the trial id before the first `__`. */
  taskName: string;
  passed: boolean;
  /** Full trial id as it appears in the reward buckets. */
  trialId: string;
}

/** Parsed summary of one benchmark run directory. */
interface RunSummary {
  /** Base name of the run directory. */
  jobName: string;
  model: string;
  config: 'baseline' | 'uam';
  totalTrials: number;
  errors: number;
  passed: TaskStatus[];
  failed: TaskStatus[];
  /** Percentage (0-100) of passed trials among passed + failed. */
  passRate: number;
}

/** Baseline-vs-UAM comparison for a single model. */
interface ModelComparison {
  model: string;
  baseline: RunSummary | null;
  uam: RunSummary | null;
  /** Task names that pass with UAM but not with the baseline. */
  uamWins: string[];
  /** Task names that pass with the baseline but not with UAM. */
  baselineWins: string[];
  bothPass: string[];
  bothFail: string[];
  /** UAM pass rate minus baseline pass rate, in percentage points. */
  delta: number;
}
71
-
72
- // ============================================================================
73
- // Parse CLI args
74
- // ============================================================================
75
-
76
/**
 * Parse the command-line flags found in `process.argv`.
 *
 * Recognized flags: `--baseline <dir>` and `--uam <dir>` (both repeatable),
 * `--output <file>`, `--timestamp <ts>` and `--help`.
 *
 * Exits the process with status 1 when a flag is missing its value or when no
 * run directory was supplied, and with status 0 after printing `--help`.
 * Unrecognized arguments are reported with a warning and skipped.
 *
 * @returns The collected directories, the output path (defaulting to
 *   `benchmark-results/FULL_COMPARISON_<timestamp>.md`) and the timestamp
 *   (defaulting to the current ISO time with `:` and `.` replaced by `-`).
 */
function parseArgs(): { baselineDirs: string[]; uamDirs: string[]; output: string; timestamp: string } {
  const args = process.argv.slice(2);
  const baselineDirs: string[] = [];
  const uamDirs: string[] = [];
  let output = '';
  let timestamp = new Date().toISOString().replace(/[:.]/g, '-');

  // Read the value that follows a flag. A missing value (end of input or the
  // next flag) is a usage error rather than an `undefined` that would crash
  // later when used as a path.
  const takeValue = (flag: string, index: number): string => {
    const value = args[index];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Error: ${flag} requires a value`);
      return process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    const flag = args[i];
    switch (flag) {
      case '--baseline':
        baselineDirs.push(takeValue(flag, ++i));
        break;
      case '--uam':
        uamDirs.push(takeValue(flag, ++i));
        break;
      case '--output':
        output = takeValue(flag, ++i);
        break;
      case '--timestamp':
        timestamp = takeValue(flag, ++i);
        break;
      case '--help':
        console.log('Usage: npx tsx generate-comparison-report.ts --baseline <dir> --uam <dir> [--output <file>] [--timestamp <ts>]');
        return process.exit(0);
      default:
        console.warn(`Warning: ignoring unrecognized argument: ${flag}`);
        break;
    }
  }

  if (baselineDirs.length === 0 && uamDirs.length === 0) {
    console.error('Error: Provide at least one --baseline or --uam directory');
    process.exit(1);
  }

  if (!output) {
    output = `benchmark-results/FULL_COMPARISON_${timestamp}.md`;
  }

  return { baselineDirs, uamDirs, output, timestamp };
}
106
-
107
- // ============================================================================
108
- // Parse Harbor results
109
- // ============================================================================
110
-
111
/**
 * Derive the full model name from a benchmark job name.
 *
 * Job names normally look like `(baseline|uam)_<model_short>_<timestamp>`,
 * e.g. `baseline_opus45_20260213_120000` or `uam_gpt52_20260213_120000`.
 * Legacy names such as `uam_v200_optb_full89` or `opus45_baseline_no_uam`
 * are handled as well.
 *
 * @param jobName Name of the run directory.
 * @returns The full model name of the first alias contained in `jobName`;
 *   `'claude-opus-4-5'` for `uam_v<digits>` names without a model alias;
 *   `'unknown'` otherwise.
 */
function extractModelFromJobName(jobName: string): string {
  // Checked in order; the first alias found anywhere in the job name wins.
  const aliasTable: ReadonlyArray<readonly [string, string]> = [
    ['opus45', 'claude-opus-4-5'],
    ['opus_4_5', 'claude-opus-4-5'],
    ['claude-opus', 'claude-opus-4-5'],
    ['gpt52', 'gpt-5.2-codex'],
    ['gpt-5', 'gpt-5.2-codex'],
    ['glm47', 'glm-4.7'],
    ['glm-4', 'glm-4.7'],
  ];

  const match = aliasTable.find(([alias]) => jobName.includes(alias));
  if (match !== undefined) {
    return match[1];
  }

  // UAM version runs carry no model in their name; Opus 4.5 is the default
  // because it is the most common.
  const isUamVersionRun = /^uam_v\d+/.test(jobName);
  return isUamVersionRun ? 'claude-opus-4-5' : 'unknown';
}
134
-
135
/**
 * Pull the model name out of a Harbor eval key.
 *
 * Eval keys are `__`-separated and come either as `agent__model__dataset`
 * (three parts) or as `agent__dataset` (two parts).
 *
 * @param evalKey Key of an entry in `stats.evals`.
 * @returns The second segment when the key has at least three segments,
 *   otherwise an empty string.
 */
function extractModelFromEvalKey(evalKey: string): string {
  const segments = evalKey.split('__');
  if (segments.length < 3) {
    return '';
  }
  return segments[1];
}
141
-
142
/**
 * Load `<dir>/result.json` and summarize it as a {@link RunSummary}.
 *
 * Only the first eval in `stats.evals` is summarized. Trials listed under the
 * `'1.0'` reward bucket count as passed and those under `'0.0'` as failed;
 * the task name is the part of the trial id before the first `__`.
 *
 * @param dir Run directory containing `result.json`.
 * @param config Whether the run is the baseline or the UAM configuration.
 * @returns The summary, or `null` (after printing a warning) when the file is
 *   missing, cannot be read or parsed, or contains no evals.
 */
function parseResultDir(dir: string, config: 'baseline' | 'uam'): RunSummary | null {
  const resultPath = join(dir, 'result.json');
  if (!existsSync(resultPath)) {
    console.warn(` Warning: ${resultPath} not found`);
    return null;
  }

  // A corrupt or unreadable file is handled like a missing one, so a single
  // bad run directory does not abort the whole report.
  let data: HarborResult;
  try {
    data = JSON.parse(readFileSync(resultPath, 'utf-8')) as HarborResult;
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(` Warning: Could not parse ${resultPath}: ${reason}`);
    return null;
  }

  const jobName = basename(dir);

  // Guard against truncated documents that lack `stats` or `stats.evals`.
  const evals = data?.stats?.evals ?? {};
  const evalKeys = Object.keys(evals);
  if (evalKeys.length === 0) {
    console.warn(` Warning: No evals in ${resultPath}`);
    return null;
  }
  if (evalKeys.length > 1) {
    console.warn(` Warning: ${resultPath} has ${evalKeys.length} evals; only "${evalKeys[0]}" is used`);
  }

  const evalKey = evalKeys[0];
  // Prefer the model embedded in the eval key; fall back to the job name.
  const model = extractModelFromEvalKey(evalKey) || extractModelFromJobName(jobName);
  const evalData = evals[evalKey];

  const rewards = evalData?.reward_stats?.reward ?? {};
  const passedTrials = rewards['1.0'] ?? [];
  const failedTrials = rewards['0.0'] ?? [];

  const toStatus = (isPass: boolean) => (trialId: string): TaskStatus => ({
    taskName: trialId.split('__')[0],
    passed: isPass,
    trialId,
  });

  const passed = passedTrials.map(toStatus(true));
  const failed = failedTrials.map(toStatus(false));

  const total = passed.length + failed.length;
  const passRate = total > 0 ? (passed.length / total) * 100 : 0;

  return {
    jobName,
    model,
    config,
    totalTrials: data.stats.n_trials,
    errors: data.stats.n_errors,
    passed,
    failed,
    passRate,
  };
}
193
-
194
/**
 * Collect the distinct task names found in a list of task statuses.
 *
 * @param tasks Task statuses to scan.
 * @returns A set holding every `taskName` once, in first-seen order.
 */
function extractTaskNames(tasks: TaskStatus[]): Set<string> {
  const names = new Set<string>();
  for (const status of tasks) {
    names.add(status.taskName);
  }
  return names;
}
197
-
198
- // ============================================================================
199
- // Build comparisons
200
- // ============================================================================
201
-
202
/**
 * Compare the baseline and UAM runs of one model at task level.
 *
 * When both runs are present, a task is a UAM win if it passed with UAM but
 * is not among the baseline passes (and vice versa for baseline wins), and
 * `delta` is the UAM pass rate minus the baseline pass rate.
 *
 * When only one run is present there is nothing to compare against, so no
 * wins are reported and `delta` is 0. Otherwise every pass of the lone run
 * would be counted as a "win" and would distort win totals and the sign test.
 *
 * @param baseline Baseline run summary, or `null` when unavailable.
 * @param uam UAM run summary, or `null` when unavailable.
 * @returns The comparison; all task-name lists are sorted.
 */
function buildModelComparison(baseline: RunSummary | null, uam: RunSummary | null): ModelComparison {
  const model = baseline?.model || uam?.model || 'unknown';

  const namesOf = (tasks: TaskStatus[] | undefined): Set<string> =>
    new Set((tasks ?? []).map(t => t.taskName));

  const bPassed = namesOf(baseline?.passed);
  const bFailed = namesOf(baseline?.failed);
  const uPassed = namesOf(uam?.passed);
  const uFailed = namesOf(uam?.failed);

  const paired = baseline !== null && uam !== null;

  const uamWins = paired ? [...uPassed].filter(t => !bPassed.has(t)).sort() : [];
  const baselineWins = paired ? [...bPassed].filter(t => !uPassed.has(t)).sort() : [];
  // The intersections below are naturally empty when one side is missing.
  const bothPass = [...bPassed].filter(t => uPassed.has(t)).sort();
  const bothFail = [...bFailed].filter(t => uFailed.has(t)).sort();

  const delta = baseline && uam ? uam.passRate - baseline.passRate : 0;

  return { model, baseline, uam, uamWins, baselineWins, bothPass, bothFail, delta };
}
221
-
222
- // ============================================================================
223
- // Binomial test (approximate)
224
- // ============================================================================
225
-
226
/**
 * Two-sided sign test on discordant task outcomes, reported as a bucket.
 *
 * Under the null hypothesis each discordant task is equally likely to favor
 * either side, so the smaller of the two counts follows Binomial(n, 0.5).
 * The p-value is computed exactly as twice the lower tail (capped at 1).
 * The exact form matters because the number of discordant tasks is usually
 * small, where a normal approximation overstates significance.
 *
 * @param wins Number of tasks favoring one side.
 * @param losses Number of tasks favoring the other side.
 * @returns `'N/A'` when there are no discordant tasks, otherwise one of
 *   `'>0.10'`, `'<0.10'`, `'<0.05'`, `'<0.01'`.
 */
function binomialPValue(wins: number, losses: number): string {
  const n = wins + losses;
  if (!(n > 0)) return 'N/A';

  const smaller = Math.min(wins, losses);

  // Accumulate P(X <= smaller). Terms are tracked in log space so that
  // 0.5^n cannot underflow to zero and wipe out the whole sum for large n.
  let logTerm = n * Math.log(0.5); // log of C(n, 0) * 0.5^n
  let lowerTail = Math.exp(logTerm);
  for (let k = 1; k <= smaller; k++) {
    logTerm += Math.log((n - k + 1) / k); // C(n, k) = C(n, k-1) * (n-k+1) / k
    lowerTail += Math.exp(logTerm);
  }

  const pValue = Math.min(1, 2 * lowerTail);

  if (pValue >= 0.10) return '>0.10';
  if (pValue >= 0.05) return '<0.10';
  if (pValue >= 0.01) return '<0.05';
  return '<0.01';
}
242
-
243
- // ============================================================================
244
- // Generate markdown report
245
- // ============================================================================
246
-
247
/** Prefix non-negative numbers with `+`; optionally fix the number of decimals. */
function formatSigned(value: number, digits?: number): string {
  const text = digits === undefined ? String(value) : value.toFixed(digits);
  return `${value >= 0 ? '+' : ''}${text}`;
}

/** Render a run as `NN.N% (passed/total)` for the summary table. */
function formatPassRate(run: RunSummary): string {
  const total = run.passed.length + run.failed.length;
  return `${run.passRate.toFixed(1)}% (${run.passed.length}/${total})`;
}

/** Build the executive-summary table and the aggregate win line. */
function renderExecutiveSummary(comparisons: ModelComparison[]): string[] {
  const lines: string[] = [
    '## Executive Summary',
    '',
    '| Model | Baseline | UAM | Delta | UAM Wins | Baseline Wins | p-value |',
    '|-------|----------|-----|-------|----------|---------------|---------|',
  ];

  let totalUamWins = 0;
  let totalBaselineWins = 0;

  for (const c of comparisons) {
    const bRate = c.baseline ? formatPassRate(c.baseline) : 'N/A';
    const uRate = c.uam ? formatPassRate(c.uam) : 'N/A';

    if (c.baseline && c.uam) {
      totalUamWins += c.uamWins.length;
      totalBaselineWins += c.baselineWins.length;
      const pval = binomialPValue(c.uamWins.length, c.baselineWins.length);
      lines.push(`| ${c.model} | ${bRate} | ${uRate} | **${formatSigned(c.delta, 1)}%** | ${c.uamWins.length} | ${c.baselineWins.length} | ${pval} |`);
    } else {
      // With only one run there is no paired outcome, so wins and p-value
      // are not applicable.
      lines.push(`| ${c.model} | ${bRate} | ${uRate} | **N/A** | N/A | N/A | N/A |`);
    }
  }

  lines.push('');

  const netTasks = totalUamWins - totalBaselineWins;
  lines.push(`**Across all models:** UAM wins ${totalUamWins} tasks, Baseline wins ${totalBaselineWins} tasks, Net: ${formatSigned(netTasks)} tasks for UAM.`);
  lines.push('');

  return lines;
}

/** Build the detailed section (headline stats, win lists, task table) for one model. */
function renderModelSection(c: ModelComparison): string[] {
  const lines: string[] = ['---', '', `## ${c.model}`, ''];

  if (c.baseline) {
    lines.push(`- **Baseline:** ${c.baseline.passRate.toFixed(1)}% (${c.baseline.passed.length} passed, ${c.baseline.failed.length} failed, ${c.baseline.errors} errors)`);
  }
  if (c.uam) {
    lines.push(`- **UAM:** ${c.uam.passRate.toFixed(1)}% (${c.uam.passed.length} passed, ${c.uam.failed.length} failed, ${c.uam.errors} errors)`);
  }

  // Everything below compares the two runs, so it needs both of them.
  if (!c.baseline || !c.uam) {
    lines.push('');
    return lines;
  }

  const netWins = c.uamWins.length - c.baselineWins.length;
  lines.push(`- **Net Delta:** ${formatSigned(c.delta, 1)}% (${formatSigned(netWins)} tasks)`);
  lines.push('');

  if (c.uamWins.length > 0) {
    lines.push('### Tasks UAM Wins (pass with UAM, fail without)');
    lines.push('');
    for (const t of c.uamWins) {
      lines.push(`- \`${t}\``);
    }
    lines.push('');
  }

  if (c.baselineWins.length > 0) {
    lines.push('### Tasks Baseline Wins (pass without UAM, fail with)');
    lines.push('');
    for (const t of c.baselineWins) {
      lines.push(`- \`${t}\``);
    }
    lines.push('');
  }

  // Full task-level diff table over every task seen in either run.
  const allTasks = new Set([
    ...c.baseline.passed.map(t => t.taskName),
    ...c.baseline.failed.map(t => t.taskName),
    ...c.uam.passed.map(t => t.taskName),
    ...c.uam.failed.map(t => t.taskName),
  ]);
  const bPassSet = new Set(c.baseline.passed.map(t => t.taskName));
  const uPassSet = new Set(c.uam.passed.map(t => t.taskName));

  lines.push('### Full Task Comparison');
  lines.push('');
  lines.push('| Task | Baseline | UAM | Delta |');
  lines.push('|------|----------|-----|-------|');

  for (const t of [...allTasks].sort()) {
    const bStatus = bPassSet.has(t) ? 'PASS' : 'FAIL';
    const uStatus = uPassSet.has(t) ? 'PASS' : 'FAIL';
    let delta = '=';
    if (bStatus === 'FAIL' && uStatus === 'PASS') delta = '**+UAM**';
    if (bStatus === 'PASS' && uStatus === 'FAIL') delta = '**-UAM**';
    lines.push(`| ${t} | ${bStatus} | ${uStatus} | ${delta} |`);
  }
  lines.push('');

  return lines;
}

/** Build the cross-model section from comparisons that have both runs. */
function renderCrossModelAnalysis(paired: ModelComparison[]): string[] {
  if (paired.length < 2) return [];

  const lines: string[] = ['---', '', '## Cross-Model Analysis', ''];

  // Tasks on which UAM consistently helps (or hurts) for every compared model.
  const uamWinSets = paired.map(c => new Set(c.uamWins));
  const baselineWinSets = paired.map(c => new Set(c.baselineWins));
  const consistentUamWins = [...uamWinSets[0]].filter(t => uamWinSets.every(s => s.has(t)));
  const consistentBaselineWins = [...baselineWinSets[0]].filter(t => baselineWinSets.every(s => s.has(t)));

  if (consistentUamWins.length > 0) {
    lines.push(`**Tasks where UAM helps across ALL models:** ${consistentUamWins.join(', ')}`);
    lines.push('');
  }
  if (consistentBaselineWins.length > 0) {
    lines.push(`**Tasks where UAM hurts across ALL models:** ${consistentBaselineWins.join(', ')}`);
    lines.push('');
  }

  // Rank on a copy so the caller's ordering is left untouched.
  const ranked = [...paired].sort((a, b) => b.delta - a.delta);
  lines.push('**Model benefit ranking (most to least improvement from UAM):**');
  lines.push('');
  for (const c of ranked) {
    // Every item uses "1."; Markdown renumbers ordered lists when rendering.
    lines.push(`1. **${c.model}**: ${formatSigned(c.delta, 1)}% (${c.uamWins.length} wins, ${c.baselineWins.length} losses)`);
  }
  lines.push('');

  return lines;
}

/** Build the static methodology section and the report footer. */
function renderMethodology(): string[] {
  return [
    '---',
    '',
    '## Methodology',
    '',
    '- **Baseline:** `harbor run` with `--ak "system_prompt="` to clear UAM context',
    '- **UAM:** `harbor run` with default CLAUDE.md and UAM memory system active',
    '- **Dataset:** Terminal-Bench 2.0 (89 tasks across systems, ML, security, algorithms)',
    '- **Scoring:** Binary pass/fail per task based on Harbor reward (1.0 = pass, 0.0 = fail)',
    '- **Statistical test:** Sign test on UAM-wins vs Baseline-wins (binomial, 2-sided)',
    '',
    '---',
    `*Report generated by \`scripts/generate-comparison-report.ts\` at ${new Date().toISOString()}*`,
  ];
}

/**
 * Render the full Markdown comparison report.
 *
 * Sections, in order: title block, executive summary, one detailed section
 * per model, cross-model analysis (only when at least two models have both a
 * baseline and a UAM run), and methodology.
 *
 * Models for which only one of the two runs is available are still listed
 * with their pass rate, but they contribute no wins, p-value or ranking
 * entry, because those figures are only meaningful for paired runs.
 *
 * @param comparisons Per-model comparisons, in the order they should appear.
 * @param timestamp Benchmark id printed in the title block.
 * @returns The report as a single string with `\n` line endings and no
 *   trailing newline.
 */
function generateReport(
  comparisons: ModelComparison[],
  timestamp: string,
): string {
  const lines: string[] = [
    '# Terminal-Bench 2.0 Full Comparison: UAM v3.1.0 vs Baseline',
    '',
    `**Generated:** ${new Date().toISOString()}`,
    `**Dataset:** Terminal-Bench 2.0 (89 tasks)`,
    `**UAM Version:** 3.1.0`,
    `**Benchmark ID:** ${timestamp}`,
    '',
  ];

  lines.push(...renderExecutiveSummary(comparisons));

  for (const c of comparisons) {
    lines.push(...renderModelSection(c));
  }

  const paired = comparisons.filter(c => c.baseline !== null && c.uam !== null);
  lines.push(...renderCrossModelAnalysis(paired));

  lines.push(...renderMethodology());

  return lines.join('\n');
}
403
-
404
- // ============================================================================
405
- // Main
406
- // ============================================================================
407
-
408
/**
 * Script entry point: parse the CLI arguments, load every run directory,
 * pair baseline and UAM runs by model, write the Markdown report and print a
 * short per-model summary.
 *
 * Only the first run per model and configuration is compared; additional
 * runs for the same model are reported with a warning. The output file's
 * parent directory is created when it does not exist yet.
 */
function main(): void {
  const { baselineDirs, uamDirs, output, timestamp } = parseArgs();

  console.log('Parsing benchmark results...');

  // Load all parsable runs of one configuration, echoing each pass rate.
  const loadRuns = (dirs: string[], config: 'baseline' | 'uam', label: string): RunSummary[] => {
    const runs: RunSummary[] = [];
    for (const dir of dirs) {
      const run = parseResultDir(dir, config);
      if (run) {
        runs.push(run);
        console.log(` ${label}: ${run.model} - ${run.passRate.toFixed(1)}% (${run.passed.length}/${run.passed.length + run.failed.length})`);
      }
    }
    return runs;
  };

  const baselineRuns = loadRuns(baselineDirs, 'baseline', 'Baseline');
  const uamRuns = loadRuns(uamDirs, 'uam', 'UAM');

  // Match baseline and UAM runs by model
  const modelSet = new Set([
    ...baselineRuns.map(r => r.model),
    ...uamRuns.map(r => r.model),
  ]);

  // Pick the first run for a model and say so when others are being ignored.
  const pickRun = (runs: RunSummary[], model: string, label: string): RunSummary | null => {
    const matches = runs.filter(r => r.model === model);
    if (matches.length > 1) {
      const ignored = matches.slice(1).map(r => r.jobName).join(', ');
      console.warn(` Warning: multiple ${label} runs for ${model}; using ${matches[0].jobName}, ignoring ${ignored}`);
    }
    return matches[0] ?? null;
  };

  const comparisons: ModelComparison[] = [];
  for (const model of modelSet) {
    const baseline = pickRun(baselineRuns, model, 'baseline');
    const uam = pickRun(uamRuns, model, 'UAM');
    comparisons.push(buildModelComparison(baseline, uam));
  }

  // Sort by model name for consistent output
  comparisons.sort((a, b) => a.model.localeCompare(b.model));

  // Generate report; make sure the target directory exists so the write does
  // not fail with ENOENT (the default output lives under benchmark-results/).
  const report = generateReport(comparisons, timestamp);
  mkdirSync(dirname(output), { recursive: true });
  writeFileSync(output, report + '\n');
  console.log(`\nReport written to: ${output}`);
  console.log(`Models compared: ${comparisons.length}`);

  for (const c of comparisons) {
    const sym = c.delta >= 0 ? '+' : '';
    console.log(` ${c.model}: ${sym}${c.delta.toFixed(1)}% (UAM wins ${c.uamWins.length}, Baseline wins ${c.baselineWins.length})`);
  }
}

main();