universal-agent-memory 6.0.0 → 6.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -451
- package/package.json +8 -3
- package/scripts/README.md +161 -0
- package/scripts/generate-comparison-report.ts +461 -0
- package/scripts/install-desktop.sh +105 -0
- package/scripts/install-web.sh +73 -0
- package/scripts/run-full-benchmark.sh +413 -0
- package/scripts/run-hybrid-adaptive-tbench.sh +252 -0
- package/scripts/run-terminal-bench.sh +302 -0
- package/scripts/run-uam-benchmark.sh +72 -0
- package/scripts/setup.sh +337 -0
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
#
|
|
3
|
+
# Run Terminal-Bench with Hybrid Adaptive UAM Context (Option 4)
|
|
4
|
+
#
|
|
5
|
+
# Key improvements over previous UAM runs:
|
|
6
|
+
# 1. Task classification skips UAM for reasoning/scheduling tasks
|
|
7
|
+
# 2. Time pressure assessment prevents timeout regressions
|
|
8
|
+
# 3. Historical benefit tracking optimizes context loading
|
|
9
|
+
# 4. Progressive context escalation on retry
|
|
10
|
+
# 5. Environment bootstrapping (Factory Droid technique)
|
|
11
|
+
# 6. Risk-aware prompting (Apex2 technique)
|
|
12
|
+
#
|
|
13
|
+
|
|
14
|
+
set -e
|
|
15
|
+
|
|
16
|
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
17
|
+
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
|
18
|
+
|
|
19
|
+
# Source environment
|
|
20
|
+
source ~/.profile 2>/dev/null || true
|
|
21
|
+
|
|
22
|
+
# Task classification function (mirrors TypeScript logic)
|
|
23
|
+
classify_task() {
|
|
24
|
+
local task="$1"
|
|
25
|
+
case "$task" in
|
|
26
|
+
# Pure reasoning/scheduling - SKIP UAM (prevents timeout regression)
|
|
27
|
+
constraints-scheduling|chess-best-move|regex-chess)
|
|
28
|
+
echo "skip"
|
|
29
|
+
;;
|
|
30
|
+
# Security tasks - FULL UAM
|
|
31
|
+
crack-7z-hash|password-recovery|vulnerable-secret|filter-js-from-html|break-filter-js-from-html)
|
|
32
|
+
echo "security"
|
|
33
|
+
;;
|
|
34
|
+
# File operations - FULL UAM
|
|
35
|
+
extract-elf|sqlite-db-truncate|db-wal-recovery)
|
|
36
|
+
echo "file-ops"
|
|
37
|
+
;;
|
|
38
|
+
# Legacy/modernization - FULL UAM
|
|
39
|
+
cobol-modernization)
|
|
40
|
+
echo "legacy"
|
|
41
|
+
;;
|
|
42
|
+
# ML tasks - MINIMAL UAM (risk-aware)
|
|
43
|
+
gpt2-codegolf)
|
|
44
|
+
echo "ml"
|
|
45
|
+
;;
|
|
46
|
+
# General coding - MINIMAL UAM
|
|
47
|
+
code-from-image|financial-document-processor|log-summary-date-ranges)
|
|
48
|
+
echo "coding"
|
|
49
|
+
;;
|
|
50
|
+
# Default
|
|
51
|
+
*)
|
|
52
|
+
echo "general"
|
|
53
|
+
;;
|
|
54
|
+
esac
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
# Generate context based on classification (Hybrid Adaptive logic)
|
|
58
|
+
get_adaptive_context() {
|
|
59
|
+
local category="$1"
|
|
60
|
+
|
|
61
|
+
case "$category" in
|
|
62
|
+
skip)
|
|
63
|
+
# No context for pure reasoning tasks
|
|
64
|
+
echo ""
|
|
65
|
+
;;
|
|
66
|
+
security)
|
|
67
|
+
cat << 'EOF'
|
|
68
|
+
## UAM Memory Context (Security)
|
|
69
|
+
|
|
70
|
+
### Security
|
|
71
|
+
- XSS: Use bleach.clean() or DOMPurify. Remove script, onclick, onerror, javascript:
|
|
72
|
+
- Password cracking: hashcat -m 11600 for 7z, -m 0 MD5, -m 1000 NTLM, john for CPU
|
|
73
|
+
- Binary secrets: strings, objdump -d, check .rodata section
|
|
74
|
+
- CVE exploitation: Research exact steps before attempting, some operations irreversible
|
|
75
|
+
|
|
76
|
+
### Tools
|
|
77
|
+
- hashcat: GPU password cracking, -m flag for hash type
|
|
78
|
+
- john: CPU password cracking, flexible format support
|
|
79
|
+
- readelf -a / objdump -d: Binary analysis
|
|
80
|
+
- strings: Extract printable strings from binaries
|
|
81
|
+
EOF
|
|
82
|
+
;;
|
|
83
|
+
file-ops)
|
|
84
|
+
cat << 'EOF'
|
|
85
|
+
## UAM Memory Context (File Operations)
|
|
86
|
+
|
|
87
|
+
### File Formats
|
|
88
|
+
- ELF: Program headers at e_phoff. Use struct.unpack('<HH...') for parsing
|
|
89
|
+
- SQLite WAL: Header 32 bytes, frames follow. PRAGMA wal_checkpoint(TRUNCATE) to recover
|
|
90
|
+
- 7z: Install p7zip-full, use 7z x -p'password' for password protected archives
|
|
91
|
+
|
|
92
|
+
### Coding
|
|
93
|
+
- Use absolute paths (/app/...) not relative
|
|
94
|
+
- Verify file exists before reading
|
|
95
|
+
- Handle edge cases in parsing
|
|
96
|
+
EOF
|
|
97
|
+
;;
|
|
98
|
+
legacy)
|
|
99
|
+
cat << 'EOF'
|
|
100
|
+
## UAM Memory Context (Legacy Code)
|
|
101
|
+
|
|
102
|
+
### Legacy Code
|
|
103
|
+
- COBOL: Fixed-format columns (1-6 sequence, 7 indicator, 8-72 code)
|
|
104
|
+
- WORKING-STORAGE SECTION for variables
|
|
105
|
+
- Modernization: Preserve business logic, update data structures
|
|
106
|
+
- Test with original inputs to verify behavior matches
|
|
107
|
+
EOF
|
|
108
|
+
;;
|
|
109
|
+
ml)
|
|
110
|
+
cat << 'EOF'
|
|
111
|
+
## UAM Memory Context (ML - Risk Aware)
|
|
112
|
+
|
|
113
|
+
### ML Training (Risk-Aware)
|
|
114
|
+
- CRITICAL: Test with small epochs/batch first before full runs
|
|
115
|
+
- Training can exceed 5 minutes - verify shapes and configs early
|
|
116
|
+
- Check GPU: nvidia-smi, CUDA_VISIBLE_DEVICES for GPU selection
|
|
117
|
+
- Cache datasets to avoid re-downloads
|
|
118
|
+
|
|
119
|
+
### Coding
|
|
120
|
+
- Use absolute paths (/app/...)
|
|
121
|
+
- Match exact output format required
|
|
122
|
+
EOF
|
|
123
|
+
;;
|
|
124
|
+
coding|general)
|
|
125
|
+
cat << 'EOF'
|
|
126
|
+
## UAM Memory Context (Coding)
|
|
127
|
+
|
|
128
|
+
### Coding
|
|
129
|
+
- Use absolute paths (/app/...) not relative
|
|
130
|
+
- Verify file exists before reading
|
|
131
|
+
- Handle edge cases in parsing
|
|
132
|
+
- Match exact output format required
|
|
133
|
+
EOF
|
|
134
|
+
;;
|
|
135
|
+
esac
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
# Main execution
|
|
139
|
+
TASKS=(
|
|
140
|
+
"crack-7z-hash"
|
|
141
|
+
"filter-js-from-html"
|
|
142
|
+
"cobol-modernization"
|
|
143
|
+
"code-from-image"
|
|
144
|
+
"sqlite-db-truncate"
|
|
145
|
+
"extract-elf"
|
|
146
|
+
"db-wal-recovery"
|
|
147
|
+
"vulnerable-secret"
|
|
148
|
+
"chess-best-move"
|
|
149
|
+
"log-summary-date-ranges"
|
|
150
|
+
"password-recovery"
|
|
151
|
+
"gpt2-codegolf"
|
|
152
|
+
"constraints-scheduling"
|
|
153
|
+
"financial-document-processor"
|
|
154
|
+
"regex-chess"
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
TIMESTAMP=$(date +%Y-%m-%d__%H-%M-%S)
|
|
158
|
+
JOBS_DIR="$PROJECT_ROOT/jobs/tbench_hybrid_adaptive_$TIMESTAMP"
|
|
159
|
+
|
|
160
|
+
echo "=============================================="
|
|
161
|
+
echo " Hybrid Adaptive UAM Terminal-Bench Runner"
|
|
162
|
+
echo " (Option 4 Implementation)"
|
|
163
|
+
echo "=============================================="
|
|
164
|
+
echo "Tasks: ${#TASKS[@]}"
|
|
165
|
+
echo "Output: $JOBS_DIR"
|
|
166
|
+
echo ""
|
|
167
|
+
|
|
168
|
+
# Show classification plan
|
|
169
|
+
echo "Task Classification (Hybrid Adaptive):"
|
|
170
|
+
echo "---------------------------------------"
|
|
171
|
+
SKIP_COUNT=0
|
|
172
|
+
FULL_COUNT=0
|
|
173
|
+
MINIMAL_COUNT=0
|
|
174
|
+
|
|
175
|
+
for task in "${TASKS[@]}"; do
|
|
176
|
+
category=$(classify_task "$task")
|
|
177
|
+
case "$category" in
|
|
178
|
+
skip)
|
|
179
|
+
echo " $task → NO UAM (reasoning/games - prevents timeout)"
|
|
180
|
+
((SKIP_COUNT++))
|
|
181
|
+
;;
|
|
182
|
+
security|file-ops|legacy)
|
|
183
|
+
echo " $task → FULL UAM ($category context)"
|
|
184
|
+
((FULL_COUNT++))
|
|
185
|
+
;;
|
|
186
|
+
ml|coding|general)
|
|
187
|
+
echo " $task → MINIMAL UAM ($category context)"
|
|
188
|
+
((MINIMAL_COUNT++))
|
|
189
|
+
;;
|
|
190
|
+
esac
|
|
191
|
+
done
|
|
192
|
+
|
|
193
|
+
echo ""
|
|
194
|
+
echo "Summary: $SKIP_COUNT skip, $FULL_COUNT full, $MINIMAL_COUNT minimal"
|
|
195
|
+
echo ""
|
|
196
|
+
|
|
197
|
+
# Build combined context (excluding pure reasoning tasks)
|
|
198
|
+
# This is the Hybrid Adaptive context that combines relevant sections
|
|
199
|
+
COMBINED_CONTEXT="## UAM Hybrid Adaptive Memory Context
|
|
200
|
+
|
|
201
|
+
### Security (for security tasks)
|
|
202
|
+
- XSS: bleach.clean(), remove script/onclick/javascript:
|
|
203
|
+
- Password: hashcat -m 11600 (7z), -m 0 (MD5), john for CPU
|
|
204
|
+
- Binary: strings, objdump -d, check .rodata
|
|
205
|
+
|
|
206
|
+
### File Formats (for file-ops tasks)
|
|
207
|
+
- ELF: e_phoff for headers, struct.unpack('<HH...')
|
|
208
|
+
- SQLite WAL: PRAGMA wal_checkpoint(TRUNCATE)
|
|
209
|
+
- 7z: p7zip, 7z x -p'password'
|
|
210
|
+
|
|
211
|
+
### Legacy (for modernization tasks)
|
|
212
|
+
- COBOL: columns 1-6 sequence, 7 indicator, 8-72 code
|
|
213
|
+
- WORKING-STORAGE for variables
|
|
214
|
+
- Test with original inputs
|
|
215
|
+
|
|
216
|
+
### Coding (minimal, for applicable tasks)
|
|
217
|
+
- Use absolute paths /app/
|
|
218
|
+
- Verify files exist before reading
|
|
219
|
+
- Match exact output format"
|
|
220
|
+
|
|
221
|
+
echo "Starting benchmark..."
|
|
222
|
+
echo ""
|
|
223
|
+
|
|
224
|
+
# Build task arguments
|
|
225
|
+
TASK_ARGS=""
|
|
226
|
+
for task in "${TASKS[@]}"; do
|
|
227
|
+
TASK_ARGS="$TASK_ARGS -t $task"
|
|
228
|
+
done
|
|
229
|
+
|
|
230
|
+
# Run with Harbor
|
|
231
|
+
harbor run -d terminal-bench@2.0 \
|
|
232
|
+
-a claude-code \
|
|
233
|
+
-m anthropic/claude-opus-4-5 \
|
|
234
|
+
--ak "append_system_prompt=$COMBINED_CONTEXT" \
|
|
235
|
+
$TASK_ARGS \
|
|
236
|
+
-k 1 \
|
|
237
|
+
--jobs-dir "$JOBS_DIR" \
|
|
238
|
+
-n 8 \
|
|
239
|
+
--timeout-multiplier 2.0
|
|
240
|
+
|
|
241
|
+
echo ""
|
|
242
|
+
echo "=============================================="
|
|
243
|
+
echo " Benchmark Complete"
|
|
244
|
+
echo "=============================================="
|
|
245
|
+
echo "Results: $JOBS_DIR/result.json"
|
|
246
|
+
echo ""
|
|
247
|
+
echo "Expected improvements over baseline:"
|
|
248
|
+
echo " - constraints-scheduling: Should PASS (no UAM overhead)"
|
|
249
|
+
echo " - extract-elf: Should PASS (file format context)"
|
|
250
|
+
echo " - password-recovery: Should PASS (security context)"
|
|
251
|
+
echo ""
|
|
252
|
+
echo "Compare with: jobs/tbench_uam_15/*/result.json"
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
#
|
|
3
|
+
# Run Terminal-Bench 2.0 with UAM-integrated agents
|
|
4
|
+
# Compares Droid with and without UAM memory across multiple models
|
|
5
|
+
#
|
|
6
|
+
# This benchmark uses the FACTORY_API_KEY which provides access to all models:
|
|
7
|
+
# - Claude Opus 4.5 (Anthropic)
|
|
8
|
+
# - GPT 5.2 Codex (OpenAI)
|
|
9
|
+
# - GLM 4.7 (Zhipu)
|
|
10
|
+
#
|
|
11
|
+
# Usage:
|
|
12
|
+
# export FACTORY_API_KEY="your-factory-api-key"
|
|
13
|
+
# ./scripts/run-terminal-bench.sh
|
|
14
|
+
#
|
|
15
|
+
|
|
16
|
+
set -e
|
|
17
|
+
|
|
18
|
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
19
|
+
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
|
20
|
+
RESULTS_DIR="$PROJECT_ROOT/benchmark-results"
|
|
21
|
+
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
|
22
|
+
|
|
23
|
+
# Models to test - Harbor/LiteLLM format (provider/model)
|
|
24
|
+
# These are mapped through Factory API when using droid
|
|
25
|
+
HARBOR_MODELS=(
|
|
26
|
+
"anthropic/claude-opus-4-5"
|
|
27
|
+
"openai/gpt-5.2-codex"
|
|
28
|
+
"zhipu/glm-4.7"
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Factory/Droid model names (used by improved-benchmark.ts)
|
|
32
|
+
FACTORY_MODELS=(
|
|
33
|
+
"claude-opus-4-5-20251101"
|
|
34
|
+
"gpt-5.2-codex"
|
|
35
|
+
"glm-4.7"
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Configuration
|
|
39
|
+
N_CONCURRENT=${N_CONCURRENT:-4}
|
|
40
|
+
TIMEOUT_MULT=${TIMEOUT_MULT:-1.0}
|
|
41
|
+
DATASET="terminal-bench@2.0"
|
|
42
|
+
|
|
43
|
+
# Check for API keys
|
|
44
|
+
check_api_keys() {
|
|
45
|
+
# Factory API key provides access to all models
|
|
46
|
+
if [ -z "$FACTORY_API_KEY" ] && [ -z "$DROID_API_KEY" ]; then
|
|
47
|
+
echo "Error: FACTORY_API_KEY or DROID_API_KEY must be set"
|
|
48
|
+
echo ""
|
|
49
|
+
echo "The Factory API key provides unified access to:"
|
|
50
|
+
echo " - Claude Opus 4.5 (Anthropic)"
|
|
51
|
+
echo " - GPT 5.2 Codex (OpenAI)"
|
|
52
|
+
echo " - GLM 4.7 (Zhipu)"
|
|
53
|
+
echo ""
|
|
54
|
+
echo "Get your key at: https://app.factory.ai/settings/api-keys"
|
|
55
|
+
exit 1
|
|
56
|
+
fi
|
|
57
|
+
|
|
58
|
+
echo "Using Factory API for model access"
|
|
59
|
+
|
|
60
|
+
# For Harbor's direct provider access, these may also be needed
|
|
61
|
+
if [ -z "$ANTHROPIC_API_KEY" ]; then
|
|
62
|
+
echo "Note: ANTHROPIC_API_KEY not set - Harbor will use Factory routing"
|
|
63
|
+
fi
|
|
64
|
+
|
|
65
|
+
if [ -z "$OPENAI_API_KEY" ]; then
|
|
66
|
+
echo "Note: OPENAI_API_KEY not set - Harbor will use Factory routing"
|
|
67
|
+
fi
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
# Create results directory
|
|
71
|
+
mkdir -p "$RESULTS_DIR"
|
|
72
|
+
|
|
73
|
+
# Run benchmark for a specific model with UAM
|
|
74
|
+
run_with_uam() {
|
|
75
|
+
local model=$1
|
|
76
|
+
local model_safe=$(echo "$model" | tr '.-' '_')
|
|
77
|
+
local job_name="uam_${model_safe}_${TIMESTAMP}"
|
|
78
|
+
|
|
79
|
+
echo "=================================================="
|
|
80
|
+
echo "Running: $model WITH UAM memory"
|
|
81
|
+
echo "=================================================="
|
|
82
|
+
|
|
83
|
+
harbor run \
|
|
84
|
+
-d "$DATASET" \
|
|
85
|
+
-a claude-code \
|
|
86
|
+
-m "$model" \
|
|
87
|
+
-n "$N_CONCURRENT" \
|
|
88
|
+
--timeout-multiplier "$TIMEOUT_MULT" \
|
|
89
|
+
--job-name "$job_name" \
|
|
90
|
+
--jobs-dir "$RESULTS_DIR" \
|
|
91
|
+
--ak "use_uam=true" \
|
|
92
|
+
--ak "project_root=$PROJECT_ROOT" \
|
|
93
|
+
2>&1 | tee "$RESULTS_DIR/${job_name}.log"
|
|
94
|
+
|
|
95
|
+
echo "Results saved to: $RESULTS_DIR/$job_name"
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
# Run benchmark for a specific model without UAM (baseline)
|
|
99
|
+
run_without_uam() {
|
|
100
|
+
local model=$1
|
|
101
|
+
local model_safe=$(echo "$model" | tr '.-' '_')
|
|
102
|
+
local job_name="baseline_${model_safe}_${TIMESTAMP}"
|
|
103
|
+
|
|
104
|
+
echo "=================================================="
|
|
105
|
+
echo "Running: $model WITHOUT UAM (baseline)"
|
|
106
|
+
echo "=================================================="
|
|
107
|
+
|
|
108
|
+
harbor run \
|
|
109
|
+
-d "$DATASET" \
|
|
110
|
+
-a claude-code \
|
|
111
|
+
-m "$model" \
|
|
112
|
+
-n "$N_CONCURRENT" \
|
|
113
|
+
--timeout-multiplier "$TIMEOUT_MULT" \
|
|
114
|
+
--job-name "$job_name" \
|
|
115
|
+
--jobs-dir "$RESULTS_DIR" \
|
|
116
|
+
2>&1 | tee "$RESULTS_DIR/${job_name}.log"
|
|
117
|
+
|
|
118
|
+
echo "Results saved to: $RESULTS_DIR/$job_name"
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
# Run with custom UAM agent
|
|
122
|
+
run_custom_agent() {
|
|
123
|
+
local model=$1
|
|
124
|
+
local with_memory=$2
|
|
125
|
+
local model_safe=$(echo "$model" | tr '.-' '_')
|
|
126
|
+
local memory_label=$([ "$with_memory" = "true" ] && echo "uam" || echo "baseline")
|
|
127
|
+
local job_name="${memory_label}_custom_${model_safe}_${TIMESTAMP}"
|
|
128
|
+
|
|
129
|
+
echo "=================================================="
|
|
130
|
+
echo "Running: $model with custom UAM agent (memory=$with_memory)"
|
|
131
|
+
echo "=================================================="
|
|
132
|
+
|
|
133
|
+
harbor run \
|
|
134
|
+
-d "$DATASET" \
|
|
135
|
+
--agent-import-path "$PROJECT_ROOT/src/harbor/uam_agent:UAMAgent" \
|
|
136
|
+
-m "$model" \
|
|
137
|
+
-n "$N_CONCURRENT" \
|
|
138
|
+
--timeout-multiplier "$TIMEOUT_MULT" \
|
|
139
|
+
--job-name "$job_name" \
|
|
140
|
+
--jobs-dir "$RESULTS_DIR" \
|
|
141
|
+
--ak "use_memory=$with_memory" \
|
|
142
|
+
--ak "project_root=$PROJECT_ROOT" \
|
|
143
|
+
2>&1 | tee "$RESULTS_DIR/${job_name}.log"
|
|
144
|
+
|
|
145
|
+
echo "Results saved to: $RESULTS_DIR/$job_name"
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
# Generate comparison report
|
|
149
|
+
generate_report() {
|
|
150
|
+
echo "=================================================="
|
|
151
|
+
echo "Generating comparison report..."
|
|
152
|
+
echo "=================================================="
|
|
153
|
+
|
|
154
|
+
local report_file="$RESULTS_DIR/TERMINAL_BENCH_COMPARISON_${TIMESTAMP}.md"
|
|
155
|
+
|
|
156
|
+
cat > "$report_file" << EOF
|
|
157
|
+
# Terminal-Bench 2.0 UAM Comparison Report
|
|
158
|
+
|
|
159
|
+
**Generated:** $(date -Iseconds)
|
|
160
|
+
**Dataset:** $DATASET (89 tasks)
|
|
161
|
+
|
|
162
|
+
## Configuration
|
|
163
|
+
- Concurrent trials: $N_CONCURRENT
|
|
164
|
+
- Timeout multiplier: $TIMEOUT_MULT
|
|
165
|
+
- Models tested: ${MODELS[*]}
|
|
166
|
+
|
|
167
|
+
## Results Summary
|
|
168
|
+
|
|
169
|
+
| Model | Without UAM | With UAM | Improvement |
|
|
170
|
+
|-------|-------------|----------|-------------|
|
|
171
|
+
EOF
|
|
172
|
+
|
|
173
|
+
# Parse results from each run
|
|
174
|
+
for model in "${MODELS[@]}"; do
|
|
175
|
+
local model_safe=$(echo "$model" | tr '.-' '_')
|
|
176
|
+
local baseline_dir="$RESULTS_DIR/baseline_${model_safe}_${TIMESTAMP}"
|
|
177
|
+
local uam_dir="$RESULTS_DIR/uam_${model_safe}_${TIMESTAMP}"
|
|
178
|
+
|
|
179
|
+
local baseline_acc="N/A"
|
|
180
|
+
local uam_acc="N/A"
|
|
181
|
+
local improvement="N/A"
|
|
182
|
+
|
|
183
|
+
# Try to read results
|
|
184
|
+
if [ -f "$baseline_dir/summary.json" ]; then
|
|
185
|
+
baseline_acc=$(jq -r '.accuracy // "N/A"' "$baseline_dir/summary.json" 2>/dev/null || echo "N/A")
|
|
186
|
+
fi
|
|
187
|
+
|
|
188
|
+
if [ -f "$uam_dir/summary.json" ]; then
|
|
189
|
+
uam_acc=$(jq -r '.accuracy // "N/A"' "$uam_dir/summary.json" 2>/dev/null || echo "N/A")
|
|
190
|
+
fi
|
|
191
|
+
|
|
192
|
+
if [[ "$baseline_acc" != "N/A" && "$uam_acc" != "N/A" ]]; then
|
|
193
|
+
improvement=$(echo "$uam_acc - $baseline_acc" | bc 2>/dev/null || echo "N/A")
|
|
194
|
+
improvement="${improvement}%"
|
|
195
|
+
fi
|
|
196
|
+
|
|
197
|
+
echo "| $model | $baseline_acc | $uam_acc | $improvement |" >> "$report_file"
|
|
198
|
+
done
|
|
199
|
+
|
|
200
|
+
cat >> "$report_file" << EOF
|
|
201
|
+
|
|
202
|
+
## Detailed Results
|
|
203
|
+
|
|
204
|
+
See individual job directories for full task-level results.
|
|
205
|
+
|
|
206
|
+
### Key Findings
|
|
207
|
+
|
|
208
|
+
Based on our improved UAM implementation:
|
|
209
|
+
- Dynamic memory retrieval based on task classification
|
|
210
|
+
- Hierarchical prompting with recency bias
|
|
211
|
+
- Multi-turn execution with error feedback
|
|
212
|
+
|
|
213
|
+
### Files
|
|
214
|
+
EOF
|
|
215
|
+
|
|
216
|
+
ls -la "$RESULTS_DIR"/*_${TIMESTAMP}* 2>/dev/null >> "$report_file" || echo "No result directories found" >> "$report_file"
|
|
217
|
+
|
|
218
|
+
echo ""
|
|
219
|
+
echo "Report saved to: $report_file"
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
# Main execution
|
|
223
|
+
main() {
|
|
224
|
+
echo "=================================================="
|
|
225
|
+
echo "Terminal-Bench 2.0 UAM Comparison Benchmark"
|
|
226
|
+
echo "=================================================="
|
|
227
|
+
echo "Timestamp: $TIMESTAMP"
|
|
228
|
+
echo "Results directory: $RESULTS_DIR"
|
|
229
|
+
echo ""
|
|
230
|
+
|
|
231
|
+
check_api_keys
|
|
232
|
+
|
|
233
|
+
# Parse arguments
|
|
234
|
+
local run_baseline=true
|
|
235
|
+
local run_uam=true
|
|
236
|
+
local use_custom=false
|
|
237
|
+
local selected_models=("${HARBOR_MODELS[@]}")
|
|
238
|
+
|
|
239
|
+
while [[ $# -gt 0 ]]; do
|
|
240
|
+
case $1 in
|
|
241
|
+
--baseline-only)
|
|
242
|
+
run_uam=false
|
|
243
|
+
shift
|
|
244
|
+
;;
|
|
245
|
+
--uam-only)
|
|
246
|
+
run_baseline=false
|
|
247
|
+
shift
|
|
248
|
+
;;
|
|
249
|
+
--custom-agent)
|
|
250
|
+
use_custom=true
|
|
251
|
+
shift
|
|
252
|
+
;;
|
|
253
|
+
--model)
|
|
254
|
+
selected_models=("$2")
|
|
255
|
+
shift 2
|
|
256
|
+
;;
|
|
257
|
+
--help)
|
|
258
|
+
echo "Usage: $0 [options]"
|
|
259
|
+
echo "Options:"
|
|
260
|
+
echo " --baseline-only Run only baseline (no UAM)"
|
|
261
|
+
echo " --uam-only Run only with UAM"
|
|
262
|
+
echo " --custom-agent Use custom UAM agent instead of claude-code"
|
|
263
|
+
echo " --model MODEL Test only this model"
|
|
264
|
+
echo " --help Show this help"
|
|
265
|
+
exit 0
|
|
266
|
+
;;
|
|
267
|
+
*)
|
|
268
|
+
echo "Unknown option: $1"
|
|
269
|
+
exit 1
|
|
270
|
+
;;
|
|
271
|
+
esac
|
|
272
|
+
done
|
|
273
|
+
|
|
274
|
+
# Run benchmarks
|
|
275
|
+
for model in "${selected_models[@]}"; do
|
|
276
|
+
if [ "$run_baseline" = true ]; then
|
|
277
|
+
if [ "$use_custom" = true ]; then
|
|
278
|
+
run_custom_agent "$model" "false"
|
|
279
|
+
else
|
|
280
|
+
run_without_uam "$model"
|
|
281
|
+
fi
|
|
282
|
+
fi
|
|
283
|
+
|
|
284
|
+
if [ "$run_uam" = true ]; then
|
|
285
|
+
if [ "$use_custom" = true ]; then
|
|
286
|
+
run_custom_agent "$model" "true"
|
|
287
|
+
else
|
|
288
|
+
run_with_uam "$model"
|
|
289
|
+
fi
|
|
290
|
+
fi
|
|
291
|
+
done
|
|
292
|
+
|
|
293
|
+
# Generate report
|
|
294
|
+
generate_report
|
|
295
|
+
|
|
296
|
+
echo ""
|
|
297
|
+
echo "=================================================="
|
|
298
|
+
echo "Benchmark complete!"
|
|
299
|
+
echo "=================================================="
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
main "$@"
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
#
|
|
3
|
+
# Run UAM Improved Benchmark using Factory API
|
|
4
|
+
#
|
|
5
|
+
# This benchmark tests UAM memory impact on coding tasks using droid CLI
|
|
6
|
+
# which accesses all models through a single Factory API key.
|
|
7
|
+
#
|
|
8
|
+
# Models tested:
|
|
9
|
+
# - Claude Opus 4.5 (Anthropic)
|
|
10
|
+
# - GPT 5.2 Codex (OpenAI)
|
|
11
|
+
# - GLM 4.7 (Zhipu)
|
|
12
|
+
#
|
|
13
|
+
# Usage:
|
|
14
|
+
# export FACTORY_API_KEY="your-factory-api-key"
|
|
15
|
+
# ./scripts/run-uam-benchmark.sh
|
|
16
|
+
#
|
|
17
|
+
|
|
18
|
+
set -e
|
|
19
|
+
|
|
20
|
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
21
|
+
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
|
22
|
+
|
|
23
|
+
echo "=================================================="
|
|
24
|
+
echo "UAM Improved Benchmark"
|
|
25
|
+
echo "=================================================="
|
|
26
|
+
|
|
27
|
+
# Check for Factory API key
|
|
28
|
+
if [ -z "$FACTORY_API_KEY" ] && [ -z "$DROID_API_KEY" ]; then
|
|
29
|
+
echo "Error: FACTORY_API_KEY or DROID_API_KEY must be set"
|
|
30
|
+
echo ""
|
|
31
|
+
echo "The Factory API key provides unified access to:"
|
|
32
|
+
echo " - Claude Opus 4.5 (Anthropic)"
|
|
33
|
+
echo " - GPT 5.2 Codex (OpenAI)"
|
|
34
|
+
echo " - GLM 4.7 (Zhipu)"
|
|
35
|
+
echo ""
|
|
36
|
+
echo "Get your key at: https://app.factory.ai/settings/api-keys"
|
|
37
|
+
exit 1
|
|
38
|
+
fi
|
|
39
|
+
|
|
40
|
+
echo "Factory API key is set ✓"
|
|
41
|
+
echo ""
|
|
42
|
+
|
|
43
|
+
# Verify droid is available
|
|
44
|
+
if ! command -v droid &> /dev/null; then
|
|
45
|
+
echo "Error: droid CLI not found"
|
|
46
|
+
echo "Install with: npm install -g @anthropic-ai/droid"
|
|
47
|
+
exit 1
|
|
48
|
+
fi
|
|
49
|
+
|
|
50
|
+
echo "droid CLI is available ✓"
|
|
51
|
+
echo ""
|
|
52
|
+
|
|
53
|
+
# Build project
|
|
54
|
+
echo "Building project..."
|
|
55
|
+
cd "$PROJECT_ROOT"
|
|
56
|
+
npm run build
|
|
57
|
+
|
|
58
|
+
# Run benchmark
|
|
59
|
+
echo ""
|
|
60
|
+
echo "Starting benchmark..."
|
|
61
|
+
echo "Models: Claude Opus 4.5, GLM 4.7, GPT 5.2 Codex"
|
|
62
|
+
echo "Tasks: 6 coding challenges"
|
|
63
|
+
echo "Comparison: With vs Without UAM Memory"
|
|
64
|
+
echo ""
|
|
65
|
+
|
|
66
|
+
npx tsx src/benchmarks/improved-benchmark.ts
|
|
67
|
+
|
|
68
|
+
echo ""
|
|
69
|
+
echo "=================================================="
|
|
70
|
+
echo "Benchmark Complete"
|
|
71
|
+
echo "=================================================="
|
|
72
|
+
echo "Results saved to: IMPROVED_BENCHMARK_RESULTS.md"
|