universal-agent-memory 6.1.1 → 6.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/tool-calls.d.ts +16 -0
- package/dist/cli/tool-calls.d.ts.map +1 -0
- package/dist/cli/tool-calls.js +374 -0
- package/dist/cli/tool-calls.js.map +1 -0
- package/dist/generators/claude-md.js +1 -8
- package/dist/generators/claude-md.js.map +1 -1
- package/package.json +5 -10
- package/templates/CLAUDE.template.md +303 -100
- package/tools/agents/README.md +224 -0
- package/tools/agents/benchmarks/benchmark_memory_systems.py +637 -0
- package/tools/agents/benchmarks/results/benchmark_20260106_064817.json +170 -0
- package/tools/agents/benchmarks/results/benchmark_20260106_064817.md +51 -0
- package/tools/agents/config/chat_template.jinja +172 -0
- package/tools/agents/scripts/fix_qwen_chat_template.py +314 -0
- package/tools/agents/scripts/memory_migration.py +518 -0
- package/tools/agents/scripts/migrate_memory_to_qdrant.py +113 -0
- package/tools/agents/scripts/query_memory.py +189 -0
- package/tools/agents/scripts/qwen_tool_call_test.py +419 -0
- package/tools/agents/scripts/qwen_tool_call_wrapper.py +517 -0
- package/tools/agents/scripts/start-services.sh +96 -0
- package/scripts/README.md +0 -161
- package/scripts/generate-comparison-report.ts +0 -461
- package/scripts/install-desktop.sh +0 -105
- package/scripts/install-web.sh +0 -73
- package/scripts/run-full-benchmark.sh +0 -413
- package/scripts/run-hybrid-adaptive-tbench.sh +0 -252
- package/scripts/run-terminal-bench.sh +0 -302
- package/scripts/run-uam-benchmark.sh +0 -72
- package/scripts/setup.sh +0 -337
package/scripts/README.md
DELETED
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
# Setup Scripts
|
|
2
|
-
|
|
3
|
-
This directory contains automated setup and installation scripts for UAM.
|
|
4
|
-
|
|
5
|
-
## Scripts
|
|
6
|
-
|
|
7
|
-
### `setup.sh` - Complete Setup
|
|
8
|
-
|
|
9
|
-
```bash
|
|
10
|
-
npm run setup
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
Performs a comprehensive setup including:
|
|
14
|
-
|
|
15
|
-
- ✅ Dependency checking (Node.js, npm, git, npx)
|
|
16
|
-
- ✅ Optional dependency recommendations (Docker, Python, pre-commit)
|
|
17
|
-
- ✅ npm install (if node_modules missing)
|
|
18
|
-
- ✅ TypeScript build
|
|
19
|
-
- ✅ Git hooks configuration:
|
|
20
|
-
- `pre-commit` - Secrets detection, linting
|
|
21
|
-
- `commit-msg` - Conventional commits validation
|
|
22
|
-
- `pre-push` - Test execution before push
|
|
23
|
-
- ✅ GitHub PR template (if gh CLI available)
|
|
24
|
-
|
|
25
|
-
### `install-web.sh` - Web Platform Setup
|
|
26
|
-
|
|
27
|
-
```bash
|
|
28
|
-
npm run install:web
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
Installs UAM for web platform usage (claude.ai, Factory.AI):
|
|
32
|
-
|
|
33
|
-
- Installs CLI globally or from GitHub
|
|
34
|
-
- Initializes web platform configuration
|
|
35
|
-
- Sets up for web-based AI assistants
|
|
36
|
-
|
|
37
|
-
### `install-desktop.sh` - Desktop Setup
|
|
38
|
-
|
|
39
|
-
```bash
|
|
40
|
-
npm run install:desktop
|
|
41
|
-
```
|
|
42
|
-
|
|
43
|
-
Installs UAM for desktop usage:
|
|
44
|
-
|
|
45
|
-
- Installs CLI globally or from GitHub
|
|
46
|
-
- Detects Docker for local Qdrant
|
|
47
|
-
- Initializes desktop platform configuration
|
|
48
|
-
- Provides setup guidance
|
|
49
|
-
|
|
50
|
-
## Usage
|
|
51
|
-
|
|
52
|
-
### Quick Setup
|
|
53
|
-
|
|
54
|
-
```bash
|
|
55
|
-
# Install UAM globally
|
|
56
|
-
npm install -g universal-agent-memory
|
|
57
|
-
|
|
58
|
-
# Run comprehensive setup
|
|
59
|
-
npm run setup
|
|
60
|
-
|
|
61
|
-
# Initialize in your project
|
|
62
|
-
uam init
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
### Platform-Specific Setup
|
|
66
|
-
|
|
67
|
-
```bash
|
|
68
|
-
# For web platforms (claude.ai, Factory.AI)
|
|
69
|
-
npm run install:web
|
|
70
|
-
|
|
71
|
-
# For desktop (Claude Code, opencode)
|
|
72
|
-
npm run install:desktop
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
## Git Hooks
|
|
76
|
-
|
|
77
|
-
The `setup.sh` script configures three git hooks:
|
|
78
|
-
|
|
79
|
-
### Pre-commit Hook
|
|
80
|
-
|
|
81
|
-
- **Purpose**: Prevent secrets from being committed
|
|
82
|
-
- **Checks**:
|
|
83
|
-
- Scans for API keys, passwords, tokens in code
|
|
84
|
-
- Runs linter with zero warnings allowed
|
|
85
|
-
- **Bypass**: `git commit --no-verify`
|
|
86
|
-
|
|
87
|
-
### Commit-msg Hook
|
|
88
|
-
|
|
89
|
-
- **Purpose**: Enforce conventional commits format
|
|
90
|
-
- **Validates**: `type(scope): description` format
|
|
91
|
-
- **Types**: feat, fix, docs, style, refactor, test, chore, perf, ci, build, revert
|
|
92
|
-
- **Bypass**: Confirm with 'y' when prompted
|
|
93
|
-
|
|
94
|
-
### Pre-push Hook
|
|
95
|
-
|
|
96
|
-
- **Purpose**: Ensure tests pass before pushing
|
|
97
|
-
- **Runs**: `npm test`
|
|
98
|
-
- **Bypass**: `git push --no-verify` (not recommended — tests should pass before pushing)
|
|
99
|
-
|
|
100
|
-
## Troubleshooting
|
|
101
|
-
|
|
102
|
-
### Hooks not executing
|
|
103
|
-
|
|
104
|
-
```bash
|
|
105
|
-
# Make hooks executable
|
|
106
|
-
chmod +x .git/hooks/*
|
|
107
|
-
|
|
108
|
-
# Verify hooks exist
|
|
109
|
-
ls -la .git/hooks/ | grep -v sample
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
### Setup script fails
|
|
113
|
-
|
|
114
|
-
```bash
|
|
115
|
-
# Check Node.js version
|
|
116
|
-
node --version # Should be >= 18.0.0
|
|
117
|
-
|
|
118
|
-
# Check npm
|
|
119
|
-
npm --version
|
|
120
|
-
|
|
121
|
-
# Clear and reinstall
|
|
122
|
-
rm -rf node_modules package-lock.json
|
|
123
|
-
npm install
|
|
124
|
-
```
|
|
125
|
-
|
|
126
|
-
### Manual hook installation
|
|
127
|
-
|
|
128
|
-
If automatic setup fails, manually create hooks:
|
|
129
|
-
|
|
130
|
-
```bash
|
|
131
|
-
# Pre-commit
|
|
132
|
-
cat > .git/hooks/pre-commit << 'EOF'
|
|
133
|
-
#!/bin/bash
|
|
134
|
-
npm run lint -- --max-warnings=0
|
|
135
|
-
exit $?
|
|
136
|
-
EOF
|
|
137
|
-
chmod +x .git/hooks/pre-commit
|
|
138
|
-
|
|
139
|
-
# Commit-msg
|
|
140
|
-
cat > .git/hooks/commit-msg << 'EOF'
|
|
141
|
-
#!/bin/bash
|
|
142
|
-
# Conventional commits validation
|
|
143
|
-
exit 0
|
|
144
|
-
EOF
|
|
145
|
-
chmod +x .git/hooks/commit-msg
|
|
146
|
-
```
|
|
147
|
-
|
|
148
|
-
## Best Practices
|
|
149
|
-
|
|
150
|
-
1. **Always run `npm run setup`** after cloning or updating UAM
|
|
151
|
-
2. **Review generated hooks** before committing
|
|
152
|
-
3. **Keep hooks in sync** with project requirements
|
|
153
|
-
4. **Document custom hooks** in project README
|
|
154
|
-
5. **Test hooks** with a trial commit; use `git commit --no-verify` only when you need to bypass them deliberately
|
|
155
|
-
|
|
156
|
-
## Security Notes
|
|
157
|
-
|
|
158
|
-
- Git hooks run locally and are not shared via the remote repository (each clone must install its own)
|
|
159
|
-
- Pre-commit hook only scans TypeScript/JavaScript/JSON files
|
|
160
|
-
- Secrets detection is best-effort (not exhaustive)
|
|
161
|
-
- Always use environment variables for sensitive data
|
|
@@ -1,461 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env npx tsx
|
|
2
|
-
/**
|
|
3
|
-
* Terminal-Bench 2.0 Comparison Report Generator
|
|
4
|
-
*
|
|
5
|
-
* Parses Harbor result.json files from baseline and UAM benchmark runs,
|
|
6
|
-
* computes per-model deltas, category breakdowns, and task-level diffs.
|
|
7
|
-
*
|
|
8
|
-
* Usage:
|
|
9
|
-
* npx tsx scripts/generate-comparison-report.ts \
|
|
10
|
-
* --baseline benchmark-results/baseline_opus45_<ts> \
|
|
11
|
-
* --uam benchmark-results/uam_opus45_<ts> \
|
|
12
|
-
* --baseline benchmark-results/baseline_gpt52_<ts> \
|
|
13
|
-
* --uam benchmark-results/uam_gpt52_<ts> \
|
|
14
|
-
* --output benchmark-results/FULL_COMPARISON_<ts>.md \
|
|
15
|
-
* --timestamp <ts>
|
|
16
|
-
*/
|
|
17
|
-
|
|
18
|
-
import { readFileSync, writeFileSync, existsSync, readdirSync } from 'fs';
|
|
19
|
-
import { join, basename } from 'path';
|
|
20
|
-
|
|
21
|
-
// ============================================================================
|
|
22
|
-
// Types
|
|
23
|
-
// ============================================================================
|
|
24
|
-
|
|
25
|
-
/**
 * Shape of a Harbor `result.json` file — only the fields this script reads.
 */
interface HarborResult {
  id: string;
  started_at: string;
  finished_at: string | null; // null presumably means the run did not finish — TODO confirm
  n_total_trials: number;
  stats: {
    n_trials: number;
    n_errors: number;
    // Keyed by eval id; parseResultDir splits keys on '__'
    // ("agent__model__dataset" or "agent__dataset").
    evals: Record<string, {
      n_trials: number;
      n_errors: number;
      metrics: Array<{ mean: number }>;
      reward_stats: {
        // Maps a reward value (e.g. "1.0", "0.0") to the trial ids that scored it.
        reward: Record<string, string[]>;
      };
    }>;
  };
}

/** Pass/fail outcome of a single benchmark trial. */
interface TaskStatus {
  // Task id — the first '__'-separated segment of the trial id.
  taskName: string;
  passed: boolean;
  trialId: string;
}

/** Aggregated outcome of one benchmark run (one result directory). */
interface RunSummary {
  jobName: string; // basename of the result directory
  model: string; // canonical model id (see extractModelFromJobName)
  config: 'baseline' | 'uam';
  totalTrials: number;
  errors: number;
  passed: TaskStatus[];
  failed: TaskStatus[];
  passRate: number; // percentage: (passed / (passed + failed)) * 100
}

/** Baseline-vs-UAM comparison for a single model. */
interface ModelComparison {
  model: string;
  baseline: RunSummary | null; // null when only a UAM run was supplied
  uam: RunSummary | null; // null when only a baseline run was supplied
  uamWins: string[]; // tasks that pass only with UAM
  baselineWins: string[]; // tasks that pass only without UAM
  bothPass: string[];
  bothFail: string[];
  delta: number; // UAM passRate minus baseline passRate, in percentage points
}
|
|
71
|
-
|
|
72
|
-
// ============================================================================
|
|
73
|
-
// Parse CLI args
|
|
74
|
-
// ============================================================================
|
|
75
|
-
|
|
76
|
-
function parseArgs(): { baselineDirs: string[]; uamDirs: string[]; output: string; timestamp: string } {
|
|
77
|
-
const args = process.argv.slice(2);
|
|
78
|
-
const baselineDirs: string[] = [];
|
|
79
|
-
const uamDirs: string[] = [];
|
|
80
|
-
let output = '';
|
|
81
|
-
let timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
82
|
-
|
|
83
|
-
for (let i = 0; i < args.length; i++) {
|
|
84
|
-
switch (args[i]) {
|
|
85
|
-
case '--baseline': baselineDirs.push(args[++i]); break;
|
|
86
|
-
case '--uam': uamDirs.push(args[++i]); break;
|
|
87
|
-
case '--output': output = args[++i]; break;
|
|
88
|
-
case '--timestamp': timestamp = args[++i]; break;
|
|
89
|
-
case '--help':
|
|
90
|
-
console.log('Usage: npx tsx generate-comparison-report.ts --baseline <dir> --uam <dir> [--output <file>] [--timestamp <ts>]');
|
|
91
|
-
process.exit(0);
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
if (baselineDirs.length === 0 && uamDirs.length === 0) {
|
|
96
|
-
console.error('Error: Provide at least one --baseline or --uam directory');
|
|
97
|
-
process.exit(1);
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
if (!output) {
|
|
101
|
-
output = `benchmark-results/FULL_COMPARISON_${timestamp}.md`;
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
return { baselineDirs, uamDirs, output, timestamp };
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
// ============================================================================
|
|
108
|
-
// Parse Harbor results
|
|
109
|
-
// ============================================================================
|
|
110
|
-
|
|
111
|
-
function extractModelFromJobName(jobName: string): string {
|
|
112
|
-
// Job names follow pattern: (baseline|uam)_<model_short>_<timestamp>
|
|
113
|
-
// e.g. baseline_opus45_20260213_120000, uam_gpt52_20260213_120000
|
|
114
|
-
// Also handles legacy names like uam_v200_optb_full89, opus45_baseline_no_uam
|
|
115
|
-
const modelAliases: Record<string, string> = {
|
|
116
|
-
opus45: 'claude-opus-4-5',
|
|
117
|
-
opus_4_5: 'claude-opus-4-5',
|
|
118
|
-
'claude-opus': 'claude-opus-4-5',
|
|
119
|
-
gpt52: 'gpt-5.2-codex',
|
|
120
|
-
'gpt-5': 'gpt-5.2-codex',
|
|
121
|
-
glm47: 'glm-4.7',
|
|
122
|
-
'glm-4': 'glm-4.7',
|
|
123
|
-
};
|
|
124
|
-
|
|
125
|
-
for (const [alias, fullName] of Object.entries(modelAliases)) {
|
|
126
|
-
if (jobName.includes(alias)) return fullName;
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
// For UAM version runs without model in name, default to Opus 4.5 (most common)
|
|
130
|
-
if (/^uam_v\d+/.test(jobName)) return 'claude-opus-4-5';
|
|
131
|
-
|
|
132
|
-
return 'unknown';
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
function extractModelFromEvalKey(evalKey: string): string {
|
|
136
|
-
// Format can be: agent__model__dataset (3 parts) or agent__dataset (2 parts)
|
|
137
|
-
const parts = evalKey.split('__');
|
|
138
|
-
if (parts.length >= 3) return parts[1];
|
|
139
|
-
return '';
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
function parseResultDir(dir: string, config: 'baseline' | 'uam'): RunSummary | null {
|
|
143
|
-
const resultPath = join(dir, 'result.json');
|
|
144
|
-
if (!existsSync(resultPath)) {
|
|
145
|
-
console.warn(` Warning: ${resultPath} not found`);
|
|
146
|
-
return null;
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
const data: HarborResult = JSON.parse(readFileSync(resultPath, 'utf-8'));
|
|
150
|
-
const jobName = basename(dir);
|
|
151
|
-
|
|
152
|
-
const evalKeys = Object.keys(data.stats.evals);
|
|
153
|
-
if (evalKeys.length === 0) {
|
|
154
|
-
console.warn(` Warning: No evals in ${resultPath}`);
|
|
155
|
-
return null;
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
const evalKey = evalKeys[0];
|
|
159
|
-
// Try model from eval key first, fall back to job name
|
|
160
|
-
const model = extractModelFromEvalKey(evalKey) || extractModelFromJobName(jobName);
|
|
161
|
-
const evalData = data.stats.evals[evalKey];
|
|
162
|
-
|
|
163
|
-
const rewards = evalData.reward_stats?.reward || {};
|
|
164
|
-
const passedTrials = rewards['1.0'] || [];
|
|
165
|
-
const failedTrials = rewards['0.0'] || [];
|
|
166
|
-
|
|
167
|
-
const passed: TaskStatus[] = passedTrials.map((t: string) => ({
|
|
168
|
-
taskName: t.split('__')[0],
|
|
169
|
-
passed: true,
|
|
170
|
-
trialId: t,
|
|
171
|
-
}));
|
|
172
|
-
|
|
173
|
-
const failed: TaskStatus[] = failedTrials.map((t: string) => ({
|
|
174
|
-
taskName: t.split('__')[0],
|
|
175
|
-
passed: false,
|
|
176
|
-
trialId: t,
|
|
177
|
-
}));
|
|
178
|
-
|
|
179
|
-
const total = passed.length + failed.length;
|
|
180
|
-
const passRate = total > 0 ? (passed.length / total) * 100 : 0;
|
|
181
|
-
|
|
182
|
-
return {
|
|
183
|
-
jobName,
|
|
184
|
-
model,
|
|
185
|
-
config,
|
|
186
|
-
totalTrials: data.stats.n_trials,
|
|
187
|
-
errors: data.stats.n_errors,
|
|
188
|
-
passed,
|
|
189
|
-
failed,
|
|
190
|
-
passRate,
|
|
191
|
-
};
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
function extractTaskNames(tasks: TaskStatus[]): Set<string> {
|
|
195
|
-
return new Set(tasks.map(t => t.taskName));
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
// ============================================================================
|
|
199
|
-
// Build comparisons
|
|
200
|
-
// ============================================================================
|
|
201
|
-
|
|
202
|
-
function buildModelComparison(baseline: RunSummary | null, uam: RunSummary | null): ModelComparison {
|
|
203
|
-
const model = baseline?.model || uam?.model || 'unknown';
|
|
204
|
-
|
|
205
|
-
const bPassed = baseline ? extractTaskNames(baseline.passed) : new Set<string>();
|
|
206
|
-
const bFailed = baseline ? extractTaskNames(baseline.failed) : new Set<string>();
|
|
207
|
-
const uPassed = uam ? extractTaskNames(uam.passed) : new Set<string>();
|
|
208
|
-
const uFailed = uam ? extractTaskNames(uam.failed) : new Set<string>();
|
|
209
|
-
|
|
210
|
-
const uamWins = [...uPassed].filter(t => !bPassed.has(t)).sort();
|
|
211
|
-
const baselineWins = [...bPassed].filter(t => !uPassed.has(t)).sort();
|
|
212
|
-
const bothPass = [...bPassed].filter(t => uPassed.has(t)).sort();
|
|
213
|
-
const bothFail = [...bFailed].filter(t => uFailed.has(t)).sort();
|
|
214
|
-
|
|
215
|
-
const bRate = baseline?.passRate || 0;
|
|
216
|
-
const uRate = uam?.passRate || 0;
|
|
217
|
-
const delta = uRate - bRate;
|
|
218
|
-
|
|
219
|
-
return { model, baseline, uam, uamWins, baselineWins, bothPass, bothFail, delta };
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
// ============================================================================
|
|
223
|
-
// Binomial test (approximate)
|
|
224
|
-
// ============================================================================
|
|
225
|
-
|
|
226
|
-
function binomialPValue(wins: number, losses: number): string {
|
|
227
|
-
const n = wins + losses;
|
|
228
|
-
if (n === 0) return 'N/A';
|
|
229
|
-
// Simple sign test approximation
|
|
230
|
-
const p = Math.min(wins, losses);
|
|
231
|
-
// Use normal approximation for binomial test
|
|
232
|
-
const expected = n / 2;
|
|
233
|
-
const stddev = Math.sqrt(n * 0.25);
|
|
234
|
-
if (stddev === 0) return 'N/A';
|
|
235
|
-
const z = Math.abs(p - expected) / stddev;
|
|
236
|
-
// Rough 2-sided p-value from z-score
|
|
237
|
-
if (z < 1.645) return '>0.10';
|
|
238
|
-
if (z < 1.96) return '<0.10';
|
|
239
|
-
if (z < 2.576) return '<0.05';
|
|
240
|
-
return '<0.01';
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
// ============================================================================
|
|
244
|
-
// Generate markdown report
|
|
245
|
-
// ============================================================================
|
|
246
|
-
|
|
247
|
-
/**
 * Renders the full markdown comparison report: header, executive summary
 * table, per-model sections (win/loss lists plus a task-level diff table),
 * cross-model analysis (when more than one model is present), and a
 * methodology appendix.
 *
 * @param comparisons One entry per model, as built by buildModelComparison.
 * @param timestamp   Benchmark run identifier embedded in the header.
 * @returns The complete report as one '\n'-joined markdown string.
 */
function generateReport(
  comparisons: ModelComparison[],
  timestamp: string,
): string {
  const lines: string[] = [];

  lines.push('# Terminal-Bench 2.0 Full Comparison: UAM v3.1.0 vs Baseline');
  lines.push('');
  lines.push(`**Generated:** ${new Date().toISOString()}`);
  lines.push(`**Dataset:** Terminal-Bench 2.0 (89 tasks)`);
  lines.push(`**UAM Version:** 3.1.0`);
  lines.push(`**Benchmark ID:** ${timestamp}`);
  lines.push('');

  // Executive summary
  lines.push('## Executive Summary');
  lines.push('');
  lines.push('| Model | Baseline | UAM | Delta | UAM Wins | Baseline Wins | p-value |');
  lines.push('|-------|----------|-----|-------|----------|---------------|---------|');

  for (const c of comparisons) {
    // Either side may be missing (one-sided run); render 'N/A' cells then.
    const bRate = c.baseline ? `${c.baseline.passRate.toFixed(1)}% (${c.baseline.passed.length}/${c.baseline.passed.length + c.baseline.failed.length})` : 'N/A';
    const uRate = c.uam ? `${c.uam.passRate.toFixed(1)}% (${c.uam.passed.length}/${c.uam.passed.length + c.uam.failed.length})` : 'N/A';
    const delta = c.baseline && c.uam ? `${c.delta >= 0 ? '+' : ''}${c.delta.toFixed(1)}%` : 'N/A';
    const pval = binomialPValue(c.uamWins.length, c.baselineWins.length);
    lines.push(`| ${c.model} | ${bRate} | ${uRate} | **${delta}** | ${c.uamWins.length} | ${c.baselineWins.length} | ${pval} |`);
  }

  lines.push('');

  // Aggregate stats
  const totalUamWins = comparisons.reduce((s, c) => s + c.uamWins.length, 0);
  const totalBaselineWins = comparisons.reduce((s, c) => s + c.baselineWins.length, 0);
  const netTasks = totalUamWins - totalBaselineWins;

  lines.push(`**Across all models:** UAM wins ${totalUamWins} tasks, Baseline wins ${totalBaselineWins} tasks, Net: ${netTasks >= 0 ? '+' : ''}${netTasks} tasks for UAM.`);
  lines.push('');

  // Per-model detailed sections
  for (const c of comparisons) {
    lines.push(`---`);
    lines.push('');
    lines.push(`## ${c.model}`);
    lines.push('');

    if (c.baseline) {
      lines.push(`- **Baseline:** ${c.baseline.passRate.toFixed(1)}% (${c.baseline.passed.length} passed, ${c.baseline.failed.length} failed, ${c.baseline.errors} errors)`);
    }
    if (c.uam) {
      lines.push(`- **UAM:** ${c.uam.passRate.toFixed(1)}% (${c.uam.passed.length} passed, ${c.uam.failed.length} failed, ${c.uam.errors} errors)`);
    }
    if (c.baseline && c.uam) {
      lines.push(`- **Net Delta:** ${c.delta >= 0 ? '+' : ''}${c.delta.toFixed(1)}% (${c.uamWins.length - c.baselineWins.length >= 0 ? '+' : ''}${c.uamWins.length - c.baselineWins.length} tasks)`);
    }
    lines.push('');

    // UAM wins
    if (c.uamWins.length > 0) {
      lines.push('### Tasks UAM Wins (pass with UAM, fail without)');
      lines.push('');
      for (const t of c.uamWins) {
        lines.push(`- \`${t}\``);
      }
      lines.push('');
    }

    // Baseline wins
    if (c.baselineWins.length > 0) {
      lines.push('### Tasks Baseline Wins (pass without UAM, fail with)');
      lines.push('');
      for (const t of c.baselineWins) {
        lines.push(`- \`${t}\``);
      }
      lines.push('');
    }

    // Full task-level diff table — only meaningful when both sides exist.
    if (c.baseline && c.uam) {
      const allTasks = new Set([
        ...c.baseline.passed.map(t => t.taskName),
        ...c.baseline.failed.map(t => t.taskName),
        ...c.uam.passed.map(t => t.taskName),
        ...c.uam.failed.map(t => t.taskName),
      ]);

      const bPassSet = extractTaskNames(c.baseline.passed);
      const uPassSet = extractTaskNames(c.uam.passed);

      lines.push('### Full Task Comparison');
      lines.push('');
      lines.push('| Task | Baseline | UAM | Delta |');
      lines.push('|------|----------|-----|-------|');

      for (const t of [...allTasks].sort()) {
        const bStatus = bPassSet.has(t) ? 'PASS' : 'FAIL';
        const uStatus = uPassSet.has(t) ? 'PASS' : 'FAIL';
        let delta = '=';
        if (bStatus === 'FAIL' && uStatus === 'PASS') delta = '**+UAM**';
        if (bStatus === 'PASS' && uStatus === 'FAIL') delta = '**-UAM**';
        lines.push(`| ${t} | ${bStatus} | ${uStatus} | ${delta} |`);
      }
      lines.push('');
    }
  }

  // Cross-model analysis
  if (comparisons.length > 1) {
    lines.push('---');
    lines.push('');
    lines.push('## Cross-Model Analysis');
    lines.push('');

    // Which tasks does UAM help consistently across models?
    const uamWinSets = comparisons.map(c => new Set(c.uamWins));
    const baselineWinSets = comparisons.map(c => new Set(c.baselineWins));

    if (uamWinSets.length >= 2) {
      // Intersection: tasks present in EVERY model's win (or loss) set.
      const consistentUamWins = [...uamWinSets[0]].filter(t => uamWinSets.every(s => s.has(t)));
      const consistentBaselineWins = [...baselineWinSets[0]].filter(t => baselineWinSets.every(s => s.has(t)));

      if (consistentUamWins.length > 0) {
        lines.push(`**Tasks where UAM helps across ALL models:** ${consistentUamWins.join(', ')}`);
        lines.push('');
      }
      if (consistentBaselineWins.length > 0) {
        lines.push(`**Tasks where UAM hurts across ALL models:** ${consistentBaselineWins.join(', ')}`);
        lines.push('');
      }
    }

    // Which model benefits most from UAM?
    const sorted = [...comparisons].sort((a, b) => b.delta - a.delta);
    lines.push('**Model benefit ranking (most to least improvement from UAM):**');
    lines.push('');
    for (const c of sorted) {
      // Literal '1.' on every item: markdown renderers auto-number ordered lists.
      lines.push(`1. **${c.model}**: ${c.delta >= 0 ? '+' : ''}${c.delta.toFixed(1)}% (${c.uamWins.length} wins, ${c.baselineWins.length} losses)`);
    }
    lines.push('');
  }

  // Methodology
  lines.push('---');
  lines.push('');
  lines.push('## Methodology');
  lines.push('');
  lines.push('- **Baseline:** `harbor run` with `--ak "system_prompt="` to clear UAM context');
  lines.push('- **UAM:** `harbor run` with default CLAUDE.md and UAM memory system active');
  lines.push('- **Dataset:** Terminal-Bench 2.0 (89 tasks across systems, ML, security, algorithms)');
  lines.push('- **Scoring:** Binary pass/fail per task based on Harbor reward (1.0 = pass, 0.0 = fail)');
  lines.push('- **Statistical test:** Sign test on UAM-wins vs Baseline-wins (binomial, 2-sided)');
  lines.push('');
  lines.push('---');
  lines.push(`*Report generated by \`scripts/generate-comparison-report.ts\` at ${new Date().toISOString()}*`);

  return lines.join('\n');
}
|
|
403
|
-
|
|
404
|
-
// ============================================================================
|
|
405
|
-
// Main
|
|
406
|
-
// ============================================================================
|
|
407
|
-
|
|
408
|
-
/**
 * Entry point: parses CLI flags, loads every baseline/UAM result directory,
 * pairs runs by canonical model name, writes the markdown report to the
 * requested output path, and prints a per-model delta summary to stdout.
 */
function main(): void {
  const { baselineDirs, uamDirs, output, timestamp } = parseArgs();

  console.log('Parsing benchmark results...');

  const baselineRuns: RunSummary[] = [];
  const uamRuns: RunSummary[] = [];

  // Directories that fail to parse are skipped (parseResultDir warns).
  for (const dir of baselineDirs) {
    const run = parseResultDir(dir, 'baseline');
    if (run) {
      baselineRuns.push(run);
      console.log(` Baseline: ${run.model} - ${run.passRate.toFixed(1)}% (${run.passed.length}/${run.passed.length + run.failed.length})`);
    }
  }

  for (const dir of uamDirs) {
    const run = parseResultDir(dir, 'uam');
    if (run) {
      uamRuns.push(run);
      console.log(` UAM: ${run.model} - ${run.passRate.toFixed(1)}% (${run.passed.length}/${run.passed.length + run.failed.length})`);
    }
  }

  // Match baseline and UAM runs by model
  const modelSet = new Set([
    ...baselineRuns.map(r => r.model),
    ...uamRuns.map(r => r.model),
  ]);

  const comparisons: ModelComparison[] = [];

  // Note: only the FIRST run per model on each side is paired; extra runs
  // for the same model are ignored by find().
  for (const model of modelSet) {
    const baseline = baselineRuns.find(r => r.model === model) || null;
    const uam = uamRuns.find(r => r.model === model) || null;
    comparisons.push(buildModelComparison(baseline, uam));
  }

  // Sort by model name for consistent output
  comparisons.sort((a, b) => a.model.localeCompare(b.model));

  // Generate report
  const report = generateReport(comparisons, timestamp);
  writeFileSync(output, report + '\n');
  console.log(`\nReport written to: ${output}`);
  console.log(`Models compared: ${comparisons.length}`);

  for (const c of comparisons) {
    const sym = c.delta >= 0 ? '+' : '';
    console.log(` ${c.model}: ${sym}${c.delta.toFixed(1)}% (UAM wins ${c.uamWins.length}, Baseline wins ${c.baselineWins.length})`);
  }
}
|
|
460
|
-
|
|
461
|
-
main();
|