universal-agent-memory 6.1.0 → 6.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -451
- package/package.json +8 -3
- package/scripts/README.md +161 -0
- package/scripts/generate-comparison-report.ts +461 -0
- package/scripts/install-desktop.sh +105 -0
- package/scripts/install-web.sh +73 -0
- package/scripts/run-full-benchmark.sh +413 -0
- package/scripts/run-hybrid-adaptive-tbench.sh +252 -0
- package/scripts/run-terminal-bench.sh +302 -0
- package/scripts/run-uam-benchmark.sh +72 -0
- package/scripts/setup.sh +337 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!/usr/bin/env bash
#
# Desktop installer for universal-agent-memory.
# Verifies Node.js 18+ and npm, detects Docker (optional, enables local
# Qdrant), then installs the CLI from npm — falling back to a GitHub
# clone + build + `npm link` when the package is not yet published.
set -euo pipefail

# ANSI colors for terminal output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

REPO_URL="https://github.com/DammianMiller/universal-agent-memory"

echo -e "${GREEN}Universal Agent Memory - Desktop Installation${NC}"
echo "============================================"
echo ""

# Check for Node.js
if ! command -v node &> /dev/null; then
  echo -e "${RED}Error: Node.js is not installed${NC}"
  echo "Please install Node.js 18+ from https://nodejs.org/"
  exit 1
fi

# Major version only, e.g. "v18.19.0" -> 18
NODE_VERSION=$(node -v | cut -d'v' -f2 | cut -d'.' -f1)
if [ "$NODE_VERSION" -lt 18 ]; then
  echo -e "${RED}Error: Node.js 18+ required (you have $(node -v))${NC}"
  exit 1
fi

echo -e "${GREEN}✓${NC} Node.js $(node -v) detected"

# Check for npm
if ! command -v npm &> /dev/null; then
  echo -e "${RED}Error: npm is not installed${NC}"
  exit 1
fi

echo -e "${GREEN}✓${NC} npm $(npm -v) detected"

# Check for Docker (optional)
if command -v docker &> /dev/null; then
  echo -e "${GREEN}✓${NC} Docker detected - local Qdrant available"
  DOCKER_AVAILABLE=true
else
  echo -e "${YELLOW}⚠${NC} Docker not found - will use cloud backends only"
  DOCKER_AVAILABLE=false
fi

# Install the CLI globally
echo ""
echo "Installing universal-agent-memory..."

# Try npm install first, fall back to git clone if package not published yet.
# NOTE: 2>/dev/null also hides permission failures; rerun without it to debug.
if npm install -g universal-agent-memory 2>/dev/null; then
  echo -e "${GREEN}✓${NC} Installed from npm registry"
else
  echo -e "${YELLOW}Package not yet on npm, installing from GitHub...${NC}"

  # Install to user's local directory
  INSTALL_DIR="${HOME}/.universal-agent-memory"

  # Remove old installation if exists
  if [ -d "$INSTALL_DIR" ]; then
    echo "Removing previous installation..."
    rm -rf "$INSTALL_DIR"
  fi

  # Clone and install (dev deps are needed for the build step)
  git clone --depth 1 "$REPO_URL.git" "$INSTALL_DIR"
  cd "$INSTALL_DIR"
  npm install --production=false
  npm run build
  npm link

  echo -e "${GREEN}✓${NC} Installed from GitHub to $INSTALL_DIR"
fi

echo ""
echo -e "${GREEN}Installation complete!${NC}"
echo ""
echo "Next steps:"
echo "  1. Initialize UAM in your project:"
echo "     $ cd /path/to/your/project"
echo "     $ uam init"
echo ""
echo "  2. Review the generated CLAUDE.md"
echo ""
echo "  3. Start working - your AI assistant will follow the workflows!"
echo ""

# BUGFIX: these follow-up steps previously restarted the list at "2." and
# repeated "3.", colliding with the numbering above. They now continue at 4/5.
if [ "$DOCKER_AVAILABLE" = true ]; then
  echo "  4. Start local memory services (optional):"
  echo "     $ uam memory start"
  echo ""
  echo "     Or use cloud backends:"
else
  echo "  4. Configure cloud memory backends:"
fi

echo "     - GitHub: export GITHUB_TOKEN=your_token"
echo "     - Qdrant Cloud: export QDRANT_API_KEY=your_key && export QDRANT_URL=your_url"
echo ""
echo "  5. Generate CLAUDE.md for your project:"
echo "     $ uam generate"
echo ""
echo "Documentation: ${REPO_URL}#readme"
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env bash
#
# Web-platform installer for universal-agent-memory.
# Installs the CLI (npm, with a GitHub clone fallback) and then initializes
# the current project in web mode via `uam init --web --interactive`.
#
# Hardened from plain `set -e` to match install-desktop.sh: unset variables
# and mid-pipeline failures now abort the script too.
set -euo pipefail

REPO_URL="https://github.com/DammianMiller/universal-agent-memory"

echo "🚀 Universal Agent Memory - Web Platform Setup"
echo ""

# Check for required tools
if ! command -v node &> /dev/null; then
  echo "❌ Node.js is required. Install from https://nodejs.org"
  exit 1
fi

if ! command -v npm &> /dev/null; then
  echo "❌ npm is required. Install from https://nodejs.org"
  exit 1
fi

echo "✅ Node.js $(node -v) detected"
echo "✅ npm $(npm -v) detected"

# Install CLI globally
echo ""
echo "📦 Installing universal-agent-memory CLI..."

# Try npm install first, fall back to git clone if package not published yet
if npm install -g universal-agent-memory 2>/dev/null; then
  echo "✅ Installed from npm registry"
else
  echo "⚠️ Package not yet on npm, installing from GitHub..."

  # Install to user's local directory
  INSTALL_DIR="${HOME}/.universal-agent-memory"

  # Remove old installation if exists
  if [ -d "$INSTALL_DIR" ]; then
    echo "Removing previous installation..."
    rm -rf "$INSTALL_DIR"
  fi

  # Clone and install (dev deps are needed for the build step)
  git clone --depth 1 "$REPO_URL.git" "$INSTALL_DIR"
  cd "$INSTALL_DIR"
  npm install --production=false
  npm run build
  npm link

  echo "✅ Installed from GitHub to $INSTALL_DIR"
fi

# Initialize in current directory
echo ""
echo "⚙️ Initializing project..."
uam init --web --interactive

echo ""
echo "✅ Setup complete!"
echo ""
# BUGFIX: the old next-steps list told the user to run `uam init` as step 1,
# but this script has already initialized the project above; the list now
# starts at reviewing the generated output.
echo "Next steps:"
echo "  1. Review the generated CLAUDE.md"
echo ""
echo "  2. Start working - your AI assistant will follow the workflows!"
echo ""
echo "Optional: Set up cloud memory backends"
echo "  export GITHUB_TOKEN=your_token"
echo "  export QDRANT_API_KEY=your_key"
echo "  export QDRANT_URL=your_url"
echo ""
echo "Documentation: ${REPO_URL}#readme"
|
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
#!/bin/bash
#
# Full Terminal-Bench 2.0 Benchmark: UAM v3.1.0 vs Baseline
# Runs all 3 models x 2 configs = 6 total benchmark runs
#
# Models: Claude Opus 4.5, GPT 5.2 Codex, GLM 4.7
# Configs: Baseline (no UAM), With UAM
#
# Usage:
#   export FACTORY_API_KEY="your-key"
#   ./scripts/run-full-benchmark.sh [options]
#
# Options:
#   --model <model>       Run only this model (e.g. anthropic/claude-opus-4-5)
#   --baseline-only       Skip UAM runs
#   --uam-only            Skip baseline runs
#   --concurrency <n>     Parallel tasks per run (default: 4)
#   --timeout-mult <f>    Timeout multiplier (default: 2.0)
#   --dry-run             Print commands without executing
#   --resume <timestamp>  Resume a previous run using its timestamp
#   --help                Show help
#

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
RESULTS_DIR="$PROJECT_ROOT/benchmark-results"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# Map each Harbor model identifier to the short tag used in job names.
declare -A MODEL_MAP=(
  ["anthropic/claude-opus-4-5"]="opus45"
  ["openai/gpt-5.2-codex"]="gpt52"
  ["zhipu/glm-4.7"]="glm47"
)

ALL_MODELS=("anthropic/claude-opus-4-5" "openai/gpt-5.2-codex" "zhipu/glm-4.7")

# Tunable defaults — every one of these may be overridden by parse_args.
CONCURRENCY=4
TIMEOUT_MULT=2.0
DATASET="terminal-bench@2.0"
RUN_BASELINE=true
RUN_UAM=true
DRY_RUN=false
SELECTED_MODELS=("${ALL_MODELS[@]}")
RESUME_TS=""

# Per-run bookkeeping for the final summary, keyed "<config>_<model_short>".
declare -A RUN_STATUS
declare -A RUN_JOBS
|
|
53
|
+
|
|
54
|
+
usage() {
|
|
55
|
+
sed -n '2,/^$/p' "$0" | sed 's/^#//' | sed 's/^ //'
|
|
56
|
+
exit 0
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
parse_args() {
  # Consume CLI flags, mutating the script-level configuration globals.
  # Unknown flags abort; --help delegates to usage (which exits 0).
  while [[ $# -gt 0 ]]; do
    case $1 in
      --model)
        SELECTED_MODELS=("$2")
        shift 2
        ;;
      --baseline-only)
        RUN_UAM=false
        shift
        ;;
      --uam-only)
        RUN_BASELINE=false
        shift
        ;;
      --concurrency)
        CONCURRENCY="$2"
        shift 2
        ;;
      --timeout-mult)
        TIMEOUT_MULT="$2"
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --resume)
        # Reusing the old timestamp makes job names line up with prior runs.
        RESUME_TS="$2"
        TIMESTAMP="$2"
        shift 2
        ;;
      --help)
        usage
        ;;
      *)
        echo "Unknown option: $1"
        exit 1
        ;;
    esac
  done
}
|
|
74
|
+
|
|
75
|
+
check_prerequisites() {
  # Abort early unless the harbor CLI and at least one provider API key are
  # available. FIX: diagnostics now go to stderr (they were on stdout).
  if ! command -v harbor &>/dev/null; then
    echo "Error: 'harbor' CLI not found. Install from https://github.com/laude-institute/harbor" >&2
    exit 1
  fi

  # Any one of the three credentials is sufficient.
  if [[ -z "${FACTORY_API_KEY:-}" ]] && [[ -z "${DROID_API_KEY:-}" ]] && [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
    echo "Error: No API key found. Set FACTORY_API_KEY, DROID_API_KEY, or ANTHROPIC_API_KEY" >&2
    echo "Get your Factory key at: https://app.factory.ai/settings/api-keys" >&2
    exit 1
  fi
}
|
|
87
|
+
|
|
88
|
+
log() {
  # Timestamped, color-coded logger: log LEVEL message...
  # Unknown levels produce no output (matches the original case fall-through).
  local level="$1"; shift
  local ts
  ts=$(date +"%H:%M:%S")
  local color=""
  case "$level" in
    INFO)  color='36' ;;
    OK)    color='32' ;;
    WARN)  color='33' ;;
    ERROR) color='31' ;;
    RUN)   color='35' ;;
  esac
  if [[ -n "$color" ]]; then
    echo -e "[$ts] \033[${color}m${level}\033[0m $*"
  fi
}
|
|
100
|
+
|
|
101
|
+
run_harbor() {
  # Execute one benchmark run for (config, model) and record the outcome in
  # the RUN_STATUS / RUN_JOBS associative arrays keyed "<config>_<short>".
  local config_type="$1"  # "baseline" or "uam"
  local model="$2"
  local short="${MODEL_MAP[$model]}"
  local job="${config_type}_${short}_${TIMESTAMP}"
  local key="${config_type}_${short}"
  local log_file="$RESULTS_DIR/${job}.log"

  # Resume mode: a finished job already wrote result.json — don't redo it.
  if [[ -n "$RESUME_TS" && -f "$RESULTS_DIR/${job}/result.json" ]]; then
    log INFO "Skipping $job (already completed)"
    RUN_STATUS[$key]="skipped"
    RUN_JOBS[$key]="$job"
    return 0
  fi

  log RUN "$config_type | $model | job=$job"

  local -a cmd=(
    harbor run
    -d "$DATASET"
    -m "$model"
    -n "$CONCURRENCY"
    --timeout-multiplier "$TIMEOUT_MULT"
    --job-name "$job"
    --jobs-dir "$RESULTS_DIR"
  )

  if [[ "$config_type" == "baseline" ]]; then
    # Baseline: vanilla claude-code agent with no UAM context
    cmd+=(-a claude-code --ak "system_prompt=")
  else
    # UAM: custom agent with classified preamble and pre-execution hooks
    cmd+=(--agent-import-path "uam_harbor.uam_agent:UAMAgent")
  fi

  if [[ "$DRY_RUN" == true ]]; then
    echo " [DRY RUN] ${cmd[*]}"
    RUN_STATUS[$key]="dry-run"
    RUN_JOBS[$key]="$job"
    return 0
  fi

  mkdir -p "$RESULTS_DIR"

  local started ended
  started=$(date +%s)

  # With pipefail on, the tee pipeline reports harbor's own exit status.
  if "${cmd[@]}" 2>&1 | tee "$log_file"; then
    RUN_STATUS[$key]="success"
  else
    RUN_STATUS[$key]="failed"
    log WARN "$job exited with non-zero status"
  fi

  RUN_JOBS[$key]="$job"

  ended=$(date +%s)
  local elapsed=$(( ended - started ))
  log OK "$job completed in $(( elapsed / 3600 ))h $(( (elapsed % 3600) / 60 ))m"
}
|
|
166
|
+
|
|
167
|
+
print_summary() {
  # Render a fixed-width table of every (config, model) run outcome from the
  # RUN_STATUS / RUN_JOBS bookkeeping arrays.
  echo ""
  echo "================================================================"
  echo " BENCHMARK SUMMARY"
  echo "================================================================"
  echo ""
  local fmt=" %-12s %-30s %-10s %s\n"
  printf "$fmt" "Config" "Model" "Status" "Job Name"
  printf "$fmt" "------" "-----" "------" "--------"

  local m cfg
  for m in "${SELECTED_MODELS[@]}"; do
    local short="${MODEL_MAP[$m]}"
    for cfg in baseline uam; do
      local k="${cfg}_${short}"
      printf "$fmt" "$cfg" "$m" "${RUN_STATUS[$k]:-not-run}" "${RUN_JOBS[$k]:-N/A}"
    done
  done

  echo ""
  echo " Results directory: $RESULTS_DIR"
  echo " Timestamp: $TIMESTAMP"
  echo ""
}
|
|
191
|
+
|
|
192
|
+
generate_report() {
  # Produce the markdown comparison report. Prefers the TypeScript generator
  # (generate-comparison-report.ts via npx tsx); falls back to
  # generate_basic_report when the script is missing or fails.
  log INFO "Generating comparison report..."

  local report_script="$SCRIPT_DIR/generate-comparison-report.ts"
  if [[ ! -f "$report_script" ]]; then
    log WARN "Report generator not found at $report_script"
    log INFO "Generating basic summary instead..."
    generate_basic_report
    return
  fi

  local report_output="$RESULTS_DIR/FULL_COMPARISON_${TIMESTAMP}.md"

  # FIX: argv is now built as a bash array instead of a whitespace-split
  # string, so job paths containing spaces survive intact.
  local -a job_args=()
  local model
  for model in "${SELECTED_MODELS[@]}"; do
    local model_short="${MODEL_MAP[$model]}"
    if [[ "$RUN_BASELINE" == true ]]; then
      local bj="${RUN_JOBS[baseline_${model_short}]:-}"
      if [[ -n "$bj" ]]; then
        job_args+=(--baseline "$RESULTS_DIR/$bj")
      fi
    fi
    if [[ "$RUN_UAM" == true ]]; then
      local uj="${RUN_JOBS[uam_${model_short}]:-}"
      if [[ -n "$uj" ]]; then
        job_args+=(--uam "$RESULTS_DIR/$uj")
      fi
    fi
  done

  # ${arr[@]+...} guards empty-array expansion under `set -u` on older bash.
  if npx tsx "$report_script" \
      --output "$report_output" \
      --timestamp "$TIMESTAMP" \
      ${job_args[@]+"${job_args[@]}"} 2>&1; then
    log OK "Report saved to $report_output"
  else
    log WARN "TypeScript report generator failed, falling back to basic report"
    generate_basic_report
  fi
}
|
|
234
|
+
|
|
235
|
+
generate_basic_report() {
  # Fallback reporter: write a markdown summary table plus a per-model delta
  # section, parsing each job's result.json with inline python3. Parsing
  # failures degrade to N/A rows rather than aborting the script.
  local report_file="$RESULTS_DIR/FULL_COMPARISON_${TIMESTAMP}.md"

  cat > "$report_file" << HEADER
# Terminal-Bench 2.0 Full Comparison: UAM v3.1.0 vs Baseline

**Generated:** $(date -Iseconds)
**Dataset:** $DATASET (89 tasks)
**UAM Version:** 3.1.0
**Concurrency:** $CONCURRENCY | **Timeout Multiplier:** $TIMEOUT_MULT

## Results Summary

| Model | Config | Pass Rate | Passed | Failed | Errors |
|-------|--------|-----------|--------|--------|--------|
HEADER

  local model config
  for model in "${SELECTED_MODELS[@]}"; do
    local model_short="${MODEL_MAP[$model]}"
    for config in baseline uam; do
      local key="${config}_${model_short}"
      local job="${RUN_JOBS[$key]:-}"
      local result_file="$RESULTS_DIR/$job/result.json"

      if [[ -n "$job" ]] && [[ -f "$result_file" ]]; then
        local stats
        stats=$(python3 -c "
import json, sys
with open('$result_file') as f:
    d = json.load(f)
evals = d['stats']['evals']
for k, v in evals.items():
    rw = v.get('reward_stats', {}).get('reward', {})
    p = len(rw.get('1.0', []))
    f = len(rw.get('0.0', []))
    total = p + f
    rate = p/total*100 if total > 0 else 0
    err = v.get('n_errors', 0)
    print(f'{rate:.1f}%|{p}|{f}|{err}')
" 2>/dev/null || echo "N/A|N/A|N/A|N/A")

        IFS='|' read -r rate passed failed errors <<< "$stats"
        echo "| $model | $config | $rate | $passed | $failed | $errors |" >> "$report_file"
      else
        echo "| $model | $config | N/A | N/A | N/A | N/A |" >> "$report_file"
      fi
    done
  done

  # Per-model delta section: which tasks flipped between baseline and UAM.
  cat >> "$report_file" << 'DELTAS'

## Per-Model UAM Delta

DELTAS

  for model in "${SELECTED_MODELS[@]}"; do
    local model_short="${MODEL_MAP[$model]}"
    local bj="${RUN_JOBS[baseline_${model_short}]:-}"
    local uj="${RUN_JOBS[uam_${model_short}]:-}"
    local b_result="$RESULTS_DIR/$bj/result.json"
    local u_result="$RESULTS_DIR/$uj/result.json"

    if [[ -f "$b_result" ]] && [[ -f "$u_result" ]]; then
      echo "### $model" >> "$report_file"
      echo "" >> "$report_file"

      python3 -c "
import json
with open('$b_result') as f:
    bd = json.load(f)
with open('$u_result') as f:
    ud = json.load(f)

def get_tasks(data):
    evals = data['stats']['evals']
    for k, v in evals.items():
        rw = v.get('reward_stats', {}).get('reward', {})
        passed = set(t.split('__')[0] for t in rw.get('1.0', []))
        failed = set(t.split('__')[0] for t in rw.get('0.0', []))
        return passed, failed
    return set(), set()

bp, bf = get_tasks(bd)
up, uf = get_tasks(ud)

uam_wins = sorted(up - bp)
baseline_wins = sorted(bp - up)
both_pass = sorted(bp & up)
both_fail = sorted(bf & uf)

b_rate = len(bp)/(len(bp)+len(bf))*100 if (len(bp)+len(bf))>0 else 0
u_rate = len(up)/(len(up)+len(uf))*100 if (len(up)+len(uf))>0 else 0
delta = u_rate - b_rate

print(f'| Metric | Value |')
print(f'|--------|-------|')
print(f'| Baseline pass rate | {b_rate:.1f}% ({len(bp)}/{len(bp)+len(bf)}) |')
print(f'| UAM pass rate | {u_rate:.1f}% ({len(up)}/{len(up)+len(uf)}) |')
print(f'| **Net delta** | **{delta:+.1f}%** ({len(uam_wins)-len(baseline_wins):+d} tasks) |')
print(f'| UAM wins | {len(uam_wins)} tasks |')
print(f'| Baseline wins | {len(baseline_wins)} tasks |')
print(f'| Both pass | {len(both_pass)} tasks |')
print(f'| Both fail | {len(both_fail)} tasks |')
print()

if uam_wins:
    print('**UAM wins:** ' + ', '.join(uam_wins))
    print()
if baseline_wins:
    print('**Baseline wins:** ' + ', '.join(baseline_wins))
    print()
" >> "$report_file" 2>/dev/null || echo "Unable to parse results for $model" >> "$report_file"
      echo "" >> "$report_file"
    fi
  done

  echo "" >> "$report_file"
  echo "---" >> "$report_file"
  echo "*Report generated by \`scripts/run-full-benchmark.sh\` at $(date -Iseconds)*" >> "$report_file"

  log OK "Basic report saved to $report_file"
}
|
|
358
|
+
|
|
359
|
+
# === Main ===
|
|
360
|
+
|
|
361
|
+
main() {
  # Orchestrate the full benchmark: parse flags, validate the environment,
  # run every (config x model) combination, then report and summarize.
  parse_args "$@"

  echo "================================================================"
  echo " Terminal-Bench 2.0 Full Benchmark"
  echo " UAM v3.1.0 vs Baseline | $(date)"
  echo "================================================================"
  echo ""
  echo " Models: ${SELECTED_MODELS[*]}"
  echo " Configs: $([ "$RUN_BASELINE" = true ] && echo "baseline ")$([ "$RUN_UAM" = true ] && echo "uam")"
  echo " Concurrency: $CONCURRENCY"
  echo " Timeout: ${TIMEOUT_MULT}x"
  echo " Results: $RESULTS_DIR"
  echo " Timestamp: $TIMESTAMP"
  echo ""

  check_prerequisites

  # Count planned runs up front so progress lines can read "i/N".
  local total_runs=0
  local run_count=0
  local m
  for m in "${SELECTED_MODELS[@]}"; do
    if [[ "$RUN_BASELINE" == true ]]; then
      total_runs=$(( total_runs + 1 ))
    fi
    if [[ "$RUN_UAM" == true ]]; then
      total_runs=$(( total_runs + 1 ))
    fi
  done

  log INFO "Starting $total_runs benchmark runs..."

  for m in "${SELECTED_MODELS[@]}"; do
    if [[ "$RUN_BASELINE" == true ]]; then
      run_count=$(( run_count + 1 ))
      log INFO "Run $run_count/$total_runs"
      run_harbor "baseline" "$m"
    fi

    if [[ "$RUN_UAM" == true ]]; then
      run_count=$(( run_count + 1 ))
      log INFO "Run $run_count/$total_runs"
      run_harbor "uam" "$m"
    fi
  done

  generate_report
  print_summary

  log OK "All benchmark runs complete."
}
|
|
412
|
+
|
|
413
|
+
# Entry point: run the benchmark pipeline with all CLI arguments.
main "$@"
|