universal-agent-memory 6.1.1 → 6.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generators/claude-md.js +1 -8
- package/dist/generators/claude-md.js.map +1 -1
- package/package.json +4 -10
- package/templates/CLAUDE.template.md +303 -100
- package/scripts/README.md +0 -161
- package/scripts/generate-comparison-report.ts +0 -461
- package/scripts/install-desktop.sh +0 -105
- package/scripts/install-web.sh +0 -73
- package/scripts/run-full-benchmark.sh +0 -413
- package/scripts/run-hybrid-adaptive-tbench.sh +0 -252
- package/scripts/run-terminal-bench.sh +0 -302
- package/scripts/run-uam-benchmark.sh +0 -72
- package/scripts/setup.sh +0 -337
|
@@ -1,413 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
#
|
|
3
|
-
# Full Terminal-Bench 2.0 Benchmark: UAM v3.1.0 vs Baseline
|
|
4
|
-
# Runs all 3 models x 2 configs = 6 total benchmark runs
|
|
5
|
-
#
|
|
6
|
-
# Models: Claude Opus 4.5, GPT 5.2 Codex, GLM 4.7
|
|
7
|
-
# Configs: Baseline (no UAM), With UAM
|
|
8
|
-
#
|
|
9
|
-
# Usage:
|
|
10
|
-
# export FACTORY_API_KEY="your-key"
|
|
11
|
-
# ./scripts/run-full-benchmark.sh [options]
|
|
12
|
-
#
|
|
13
|
-
# Options:
|
|
14
|
-
# --model <model> Run only this model (e.g. anthropic/claude-opus-4-5)
|
|
15
|
-
# --baseline-only Skip UAM runs
|
|
16
|
-
# --uam-only Skip baseline runs
|
|
17
|
-
# --concurrency <n> Parallel tasks per run (default: 4)
|
|
18
|
-
# --timeout-mult <f> Timeout multiplier (default: 2.0)
|
|
19
|
-
# --dry-run Print commands without executing
|
|
20
|
-
# --resume <timestamp> Resume a previous run using its timestamp
|
|
21
|
-
# --help Show help
|
|
22
|
-
#
|
|
23
|
-
|
|
24
|
-
# Strict mode: abort on command failure, unset variables, and failed pipeline stages.
set -euo pipefail

# Paths, resolved relative to the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
RESULTS_DIR="${PROJECT_ROOT}/benchmark-results"

# Run identifier; replaced by the --resume value when resuming a previous run.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"

# Harbor model identifier -> short tag used in job names and status keys.
declare -A MODEL_MAP
MODEL_MAP["anthropic/claude-opus-4-5"]="opus45"
MODEL_MAP["openai/gpt-5.2-codex"]="gpt52"
MODEL_MAP["zhipu/glm-4.7"]="glm47"

# Every supported model, in the order the runs are executed.
ALL_MODELS=(
  "anthropic/claude-opus-4-5"
  "openai/gpt-5.2-codex"
  "zhipu/glm-4.7"
)

# Defaults (each can be overridden from the command line).
DATASET="terminal-bench@2.0"
CONCURRENCY=4
TIMEOUT_MULT=2.0
RUN_BASELINE=true
RUN_UAM=true
DRY_RUN=false
RESUME_TS=""
SELECTED_MODELS=("${ALL_MODELS[@]}")

# Per-run bookkeeping for the final summary, keyed by "<config>_<model tag>".
declare -A RUN_STATUS RUN_JOBS
|
|
53
|
-
|
|
54
|
-
# Print the help text embedded in this script's header comment, then exit 0.
# The text is taken from "$0", line 2 through the first empty line, with the
# leading "#" and one following space stripped from every line.
usage() {
  sed -n \
    -e '2,/^$/{' \
    -e 's/^#//' \
    -e 's/^ //' \
    -e 'p' \
    -e '}' "$0"
  exit 0
}
|
|
58
|
-
|
|
59
|
-
# Parse command-line options into the script's global settings.
#
# Globals read:    MODEL_MAP (to validate --model)
# Globals written: SELECTED_MODELS, RUN_BASELINE, RUN_UAM, CONCURRENCY,
#                  TIMEOUT_MULT, DRY_RUN, RESUME_TS, TIMESTAMP
# Arguments:       the script's command-line arguments ("$@")
# Exits:           1 (message on stderr) for an unknown option, an option that
#                  is missing its value, or a model with no MODEL_MAP entry;
#                  --help prints the usage text and exits via usage().
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Options that take a value must actually be followed by one. Without this
    # check "$2" is an unbound-variable error under `set -u`, and `shift 2`
    # fails without consuming anything otherwise.
    case "$1" in
      --model | --concurrency | --timeout-mult | --resume)
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          exit 1
        fi
        ;;
    esac

    case "$1" in
      --model)
        # Reject unknown models here rather than failing later on the
        # MODEL_MAP lookup in the middle of a run.
        if [[ -z "$2" || -z "${MODEL_MAP[$2]:-}" ]]; then
          echo "Error: unknown model '$2'. Supported models: ${!MODEL_MAP[*]}" >&2
          exit 1
        fi
        SELECTED_MODELS=("$2")
        shift 2
        ;;
      --baseline-only)
        RUN_UAM=false
        shift
        ;;
      --uam-only)
        RUN_BASELINE=false
        shift
        ;;
      --concurrency)
        CONCURRENCY="$2"
        shift 2
        ;;
      --timeout-mult)
        TIMEOUT_MULT="$2"
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --resume)
        # Resuming reuses the earlier run's timestamp so job names match.
        RESUME_TS="$2"
        TIMESTAMP="$2"
        shift 2
        ;;
      --help)
        usage
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}
|
|
74
|
-
|
|
75
|
-
# Verify that the run can start: the `harbor` CLI must be resolvable and at
# least one of FACTORY_API_KEY, DROID_API_KEY or ANTHROPIC_API_KEY must be
# non-empty. Prints an explanation and exits 1 when a requirement is missing.
check_prerequisites() {
  command -v harbor >/dev/null 2>&1 || {
    echo "Error: 'harbor' CLI not found. Install from https://github.com/laude-institute/harbor"
    exit 1
  }

  # The concatenation is non-empty as soon as any one of the keys is set.
  if [[ -n "${FACTORY_API_KEY:-}${DROID_API_KEY:-}${ANTHROPIC_API_KEY:-}" ]]; then
    return 0
  fi

  echo "Error: No API key found. Set FACTORY_API_KEY, DROID_API_KEY, or ANTHROPIC_API_KEY"
  echo "Get your Factory key at: https://app.factory.ai/settings/api-keys"
  exit 1
}
|
|
87
|
-
|
|
88
|
-
# Print a timestamped, colour-tagged log line on stdout.
#
# Arguments: $1  - level: INFO, OK, WARN, ERROR or RUN
#            $2… - message words (joined with spaces)
# Output:    "[HH:MM:SS] <coloured level> <message>"
# An unrecognised level is printed as-is without colour, so a mistyped level
# no longer makes the message disappear.
log() {
  local level="$1"; shift
  local ts color
  ts=$(date +"%H:%M:%S")

  case "$level" in
    INFO) color='\033[36m' ;;
    OK) color='\033[32m' ;;
    WARN) color='\033[33m' ;;
    ERROR) color='\033[31m' ;;
    RUN) color='\033[35m' ;;
    *)
      echo -e "[$ts] $level $*"
      return 0
      ;;
  esac

  # echo -e turns the \033 sequences into real escape characters.
  echo -e "[$ts] ${color}${level}\033[0m $*"
}
|
|
100
|
-
|
|
101
|
-
# Run one Harbor benchmark job for a (config, model) pair and record the result.
#
# Arguments: $1 - config type: "baseline" or "uam"
#            $2 - model identifier (must be a key of MODEL_MAP)
# Globals read:    MODEL_MAP, TIMESTAMP, RESULTS_DIR, RESUME_TS, DATASET,
#                  CONCURRENCY, TIMEOUT_MULT, DRY_RUN
# Globals written: RUN_STATUS[<config>_<tag>] = skipped | dry-run | success | failed
#                  RUN_JOBS[<config>_<tag>]   = job name
# Returns: 1 if the model has no MODEL_MAP entry, otherwise 0. A failing
#          harbor run is recorded as "failed" but does not fail this function.
run_harbor() {
  local config_type="$1" # "baseline" or "uam"
  local model="$2"
  local model_short="${MODEL_MAP[$model]:-}"
  if [[ -z "$model_short" ]]; then
    log ERROR "Unknown model '$model' (no MODEL_MAP entry)"
    return 1
  fi

  local job_name="${config_type}_${model_short}_${TIMESTAMP}"
  local log_file="$RESULTS_DIR/${job_name}.log"
  local run_key="${config_type}_${model_short}"

  # Skip if already completed (resume mode)
  if [[ -n "$RESUME_TS" ]] && [[ -f "$RESULTS_DIR/${job_name}/result.json" ]]; then
    log INFO "Skipping $job_name (already completed)"
    RUN_STATUS[$run_key]="skipped"
    RUN_JOBS[$run_key]="$job_name"
    return 0
  fi

  log RUN "$config_type | $model | job=$job_name"

  local cmd=(
    harbor run
    -d "$DATASET"
    -m "$model"
    -n "$CONCURRENCY"
    --timeout-multiplier "$TIMEOUT_MULT"
    --job-name "$job_name"
    --jobs-dir "$RESULTS_DIR"
  )

  if [[ "$config_type" == "baseline" ]]; then
    # Baseline: vanilla claude-code agent with no UAM context
    cmd+=(-a claude-code --ak "system_prompt=")
  else
    # UAM: custom agent with classified preamble and pre-execution hooks
    cmd+=(--agent-import-path "uam_harbor.uam_agent:UAMAgent")
  fi

  if [[ "$DRY_RUN" == true ]]; then
    echo " [DRY RUN] ${cmd[*]}"
    RUN_STATUS[$run_key]="dry-run"
    RUN_JOBS[$run_key]="$job_name"
    return 0
  fi

  mkdir -p "$RESULTS_DIR"

  local start_time end_time
  start_time=$(date +%s)

  # The pipeline's status reflects harbor (not tee) only because the script
  # runs with `set -o pipefail`.
  if "${cmd[@]}" 2>&1 | tee "$log_file"; then
    RUN_STATUS[$run_key]="success"
  else
    RUN_STATUS[$run_key]="failed"
  fi
  RUN_JOBS[$run_key]="$job_name"

  end_time=$(date +%s)
  local duration=$((end_time - start_time))
  local hours=$((duration / 3600))
  local minutes=$(((duration % 3600) / 60))

  # Report the outcome once; a failed run must not also be logged as "OK".
  if [[ "${RUN_STATUS[$run_key]}" == "success" ]]; then
    log OK "$job_name completed in ${hours}h ${minutes}m"
  else
    log WARN "$job_name exited with non-zero status after ${hours}h ${minutes}m"
  fi
}
|
|
166
|
-
|
|
167
|
-
# Print the end-of-run table: one row per selected model and config
# (baseline, uam) with the recorded status and job name, followed by the
# results directory and the run timestamp.
# Globals read: SELECTED_MODELS, MODEL_MAP, RUN_STATUS, RUN_JOBS, RESULTS_DIR, TIMESTAMP
print_summary() {
  local rule="================================================================"

  printf '\n%s\n%s\n%s\n\n' "$rule" " BENCHMARK SUMMARY" "$rule"

  # printf re-applies the format to each group of four arguments, so this
  # single call emits both the heading row and the underline row.
  printf " %-12s %-30s %-10s %s\n" \
    "Config" "Model" "Status" "Job Name" \
    "------" "-----" "------" "--------"

  for model in "${SELECTED_MODELS[@]}"; do
    local tag="${MODEL_MAP[$model]}"
    for config in baseline uam; do
      local slot="${config}_${tag}"
      # Runs that never happened fall back to "not-run" / "N/A".
      printf " %-12s %-30s %-10s %s\n" \
        "$config" "$model" "${RUN_STATUS[$slot]:-not-run}" "${RUN_JOBS[$slot]:-N/A}"
    done
  done

  printf '\n%s\n%s\n\n' " Results directory: $RESULTS_DIR" " Timestamp: $TIMESTAMP"
}
|
|
191
|
-
|
|
192
|
-
# Produce the comparison report for this run.
#
# Runs the TypeScript generator ($SCRIPT_DIR/generate-comparison-report.ts)
# through `npx tsx`, passing one --baseline/--uam argument per recorded job.
# Falls back to generate_basic_report when the generator script is missing or
# exits non-zero.
# Globals read: SCRIPT_DIR, RESULTS_DIR, TIMESTAMP, SELECTED_MODELS, MODEL_MAP,
#               RUN_BASELINE, RUN_UAM, RUN_JOBS
generate_report() {
  log INFO "Generating comparison report..."

  local report_script="$SCRIPT_DIR/generate-comparison-report.ts"
  if [[ ! -f "$report_script" ]]; then
    log WARN "Report generator not found at $report_script"
    log INFO "Generating basic summary instead..."
    generate_basic_report
    return
  fi

  local report_output="$RESULTS_DIR/FULL_COMPARISON_${TIMESTAMP}.md"

  # Collect the job directories in an array so that paths containing spaces
  # or glob characters reach the generator as single, intact arguments.
  local -a job_args=()
  local model model_short job
  for model in "${SELECTED_MODELS[@]}"; do
    model_short="${MODEL_MAP[$model]}"
    if [[ "$RUN_BASELINE" == true ]]; then
      job="${RUN_JOBS[baseline_${model_short}]:-}"
      if [[ -n "$job" ]]; then
        job_args+=(--baseline "$RESULTS_DIR/$job")
      fi
    fi
    if [[ "$RUN_UAM" == true ]]; then
      job="${RUN_JOBS[uam_${model_short}]:-}"
      if [[ -n "$job" ]]; then
        job_args+=(--uam "$RESULTS_DIR/$job")
      fi
    fi
  done

  # ${arr[@]+...} keeps an empty array from tripping `set -u` on older bash.
  if npx tsx "$report_script" \
    --output "$report_output" \
    --timestamp "$TIMESTAMP" \
    ${job_args[@]+"${job_args[@]}"} 2>&1; then
    log OK "Report saved to $report_output"
  else
    log WARN "TypeScript report generator failed, falling back to basic report"
    generate_basic_report
  fi
}
|
|
234
|
-
|
|
235
|
-
# Helper: print one "rate%|passed|failed|errors" line per eval found in a
# Harbor result.json. The path is passed as an argument (sys.argv) instead of
# being pasted into the Python source, so quotes in the path cannot break or
# inject code.
# Arguments: $1 - path to result.json
_basic_report_stats() {
  python3 - "$1" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    data = json.load(fh)

for _name, ev in data['stats']['evals'].items():
    rw = ev.get('reward_stats', {}).get('reward', {})
    passed = len(rw.get('1.0', []))
    failed = len(rw.get('0.0', []))
    total = passed + failed
    rate = passed / total * 100 if total > 0 else 0
    errors = ev.get('n_errors', 0)
    print(f'{rate:.1f}%|{passed}|{failed}|{errors}')
PY
}

# Helper: print the Markdown delta table comparing a baseline result.json
# against a UAM result.json.
# Arguments: $1 - baseline result.json, $2 - UAM result.json
_basic_report_delta() {
  python3 - "$1" "$2" <<'PY'
import json
import sys


def load(path):
    with open(path) as fh:
        return json.load(fh)


def get_tasks(data):
    # Task names are the part before '__'. Only the first eval entry is used.
    for _name, ev in data['stats']['evals'].items():
        rw = ev.get('reward_stats', {}).get('reward', {})
        passed = set(t.split('__')[0] for t in rw.get('1.0', []))
        failed = set(t.split('__')[0] for t in rw.get('0.0', []))
        return passed, failed
    return set(), set()


bp, bf = get_tasks(load(sys.argv[1]))
up, uf = get_tasks(load(sys.argv[2]))

uam_wins = sorted(up - bp)
baseline_wins = sorted(bp - up)
both_pass = sorted(bp & up)
both_fail = sorted(bf & uf)

b_rate = len(bp)/(len(bp)+len(bf))*100 if (len(bp)+len(bf))>0 else 0
u_rate = len(up)/(len(up)+len(uf))*100 if (len(up)+len(uf))>0 else 0
delta = u_rate - b_rate

print(f'| Metric | Value |')
print(f'|--------|-------|')
print(f'| Baseline pass rate | {b_rate:.1f}% ({len(bp)}/{len(bp)+len(bf)}) |')
print(f'| UAM pass rate | {u_rate:.1f}% ({len(up)}/{len(up)+len(uf)}) |')
print(f'| **Net delta** | **{delta:+.1f}%** ({len(uam_wins)-len(baseline_wins):+d} tasks) |')
print(f'| UAM wins | {len(uam_wins)} tasks |')
print(f'| Baseline wins | {len(baseline_wins)} tasks |')
print(f'| Both pass | {len(both_pass)} tasks |')
print(f'| Both fail | {len(both_fail)} tasks |')
print()

if uam_wins:
    print('**UAM wins:** ' + ', '.join(uam_wins))
    print()
if baseline_wins:
    print('**Baseline wins:** ' + ', '.join(baseline_wins))
    print()
PY
}

# Write a Markdown comparison report without the TypeScript generator.
#
# Output file: $RESULTS_DIR/FULL_COMPARISON_${TIMESTAMP}.md, containing a
# header, one results row per selected model and config, and a per-model
# delta section for models that have both a baseline and a UAM result.json.
# Rows whose result is missing or unparsable are filled with N/A.
# Globals read: RESULTS_DIR, TIMESTAMP, DATASET, CONCURRENCY, TIMEOUT_MULT,
#               SELECTED_MODELS, MODEL_MAP, RUN_JOBS
# Requires: python3 (only when a result.json is actually present).
generate_basic_report() {
  local report_file="$RESULTS_DIR/FULL_COMPARISON_${TIMESTAMP}.md"
  local model model_short config job result_file stats
  local rate passed failed errors
  local bj uj b_result u_result

  # The directory does not exist yet when no benchmark job has written to it.
  mkdir -p "$RESULTS_DIR"

  cat > "$report_file" << HEADER
# Terminal-Bench 2.0 Full Comparison: UAM v3.1.0 vs Baseline

**Generated:** $(date -Iseconds)
**Dataset:** $DATASET (89 tasks)
**UAM Version:** 3.1.0
**Concurrency:** $CONCURRENCY | **Timeout Multiplier:** $TIMEOUT_MULT

## Results Summary

| Model | Config | Pass Rate | Passed | Failed | Errors |
|-------|--------|-----------|--------|--------|--------|
HEADER

  for model in "${SELECTED_MODELS[@]}"; do
    model_short="${MODEL_MAP[$model]}"
    for config in baseline uam; do
      job="${RUN_JOBS[${config}_${model_short}]:-}"
      result_file="$RESULTS_DIR/$job/result.json"
      rate="N/A" passed="N/A" failed="N/A" errors="N/A"

      if [[ -n "$job" && -f "$result_file" ]]; then
        # Parse errors are deliberately swallowed: the row falls back to N/A.
        stats=$(_basic_report_stats "$result_file" 2>/dev/null) || stats=""
        if [[ -n "$stats" ]]; then
          # read consumes only the first line, i.e. the first eval.
          IFS='|' read -r rate passed failed errors <<< "$stats"
        fi
      fi

      echo "| $model | $config | $rate | $passed | $failed | $errors |" >> "$report_file"
    done
  done

  # Add per-model delta section
  cat >> "$report_file" << 'DELTAS'

## Per-Model UAM Delta

DELTAS

  for model in "${SELECTED_MODELS[@]}"; do
    model_short="${MODEL_MAP[$model]}"
    bj="${RUN_JOBS[baseline_${model_short}]:-}"
    uj="${RUN_JOBS[uam_${model_short}]:-}"
    # With an empty job name the path would collapse to
    # "$RESULTS_DIR//result.json", so both jobs are required.
    if [[ -z "$bj" || -z "$uj" ]]; then
      continue
    fi
    b_result="$RESULTS_DIR/$bj/result.json"
    u_result="$RESULTS_DIR/$uj/result.json"

    if [[ -f "$b_result" && -f "$u_result" ]]; then
      echo "### $model" >> "$report_file"
      echo "" >> "$report_file"
      _basic_report_delta "$b_result" "$u_result" >> "$report_file" 2>/dev/null \
        || echo "Unable to parse results for $model" >> "$report_file"
      echo "" >> "$report_file"
    fi
  done

  echo "" >> "$report_file"
  echo "---" >> "$report_file"
  echo "*Report generated by \`scripts/run-full-benchmark.sh\` at $(date -Iseconds)*" >> "$report_file"

  log OK "Basic report saved to $report_file"
}
|
|
358
|
-
|
|
359
|
-
# === Main ===
|
|
360
|
-
|
|
361
|
-
# Entry point: parse options, print the run banner, check prerequisites, run
# every selected model under every enabled config (baseline first, then uam),
# then generate the report and print the summary.
#
# Arguments: the script's command-line arguments ("$@")
# In --dry-run mode the report step is skipped: it would otherwise execute the
# report generator and write into a results directory that a dry run never
# creates.
main() {
  parse_args "$@"

  # Enabled configs, in execution order.
  local -a configs=()
  if [[ "$RUN_BASELINE" == true ]]; then
    configs+=(baseline)
  fi
  if [[ "$RUN_UAM" == true ]]; then
    configs+=(uam)
  fi

  echo "================================================================"
  echo " Terminal-Bench 2.0 Full Benchmark"
  echo " UAM v3.1.0 vs Baseline | $(date)"
  echo "================================================================"
  echo ""
  echo " Models: ${SELECTED_MODELS[*]}"
  echo " Configs: ${configs[*]-}"
  echo " Concurrency: $CONCURRENCY"
  echo " Timeout: ${TIMEOUT_MULT}x"
  echo " Results: $RESULTS_DIR"
  echo " Timestamp: $TIMESTAMP"
  echo ""

  check_prerequisites

  # Run each model x config combination
  local total_runs=$((${#SELECTED_MODELS[@]} * ${#configs[@]}))
  local run_count=0
  local model config

  log INFO "Starting $total_runs benchmark runs..."

  for model in "${SELECTED_MODELS[@]}"; do
    # ${arr[@]+...} keeps an empty array from tripping `set -u` on older bash.
    for config in ${configs[@]+"${configs[@]}"}; do
      run_count=$((run_count + 1))
      log INFO "Run $run_count/$total_runs"
      run_harbor "$config" "$model"
    done
  done

  # Generate report (there are no results to report on after a dry run)
  if [[ "$DRY_RUN" == true ]]; then
    log INFO "Dry run: skipping report generation"
  else
    generate_report
  fi

  # Print summary
  print_summary

  log OK "All benchmark runs complete."
}
|
|
412
|
-
|
|
413
|
-
main "$@"
|
|
@@ -1,252 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
#
|
|
3
|
-
# Run Terminal-Bench with Hybrid Adaptive UAM Context (Option 4)
|
|
4
|
-
#
|
|
5
|
-
# Key improvements over previous UAM runs:
|
|
6
|
-
# 1. Task classification skips UAM for reasoning/scheduling tasks
|
|
7
|
-
# 2. Time pressure assessment prevents timeout regressions
|
|
8
|
-
# 3. Historical benefit tracking optimizes context loading
|
|
9
|
-
# 4. Progressive context escalation on retry
|
|
10
|
-
# 5. Environment bootstrapping (Factory Droid technique)
|
|
11
|
-
# 6. Risk-aware prompting (Apex2 technique)
|
|
12
|
-
#
|
|
13
|
-
|
|
14
|
-
set -e
|
|
15
|
-
|
|
16
|
-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
17
|
-
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
|
18
|
-
|
|
19
|
-
# Source environment
|
|
20
|
-
source ~/.profile 2>/dev/null || true
|
|
21
|
-
|
|
22
|
-
# Task classification function (mirrors TypeScript logic)
|
|
23
|
-
# Map a Terminal-Bench task name to its UAM context category.
# Arguments: $1 - task name
# Output:    one of skip, security, file-ops, legacy, ml, coding, general
#            ("general" for any task that is not listed)
classify_task() {
  local task_name="$1"
  local category="general" # default for unlisted tasks

  case "$task_name" in
    # Pure reasoning/scheduling - SKIP UAM (prevents timeout regression)
    constraints-scheduling | chess-best-move | regex-chess)
      category="skip"
      ;;
    # Security tasks - FULL UAM
    crack-7z-hash | password-recovery | vulnerable-secret | \
      filter-js-from-html | break-filter-js-from-html)
      category="security"
      ;;
    # File operations - FULL UAM
    extract-elf | sqlite-db-truncate | db-wal-recovery)
      category="file-ops"
      ;;
    # Legacy/modernization - FULL UAM
    cobol-modernization)
      category="legacy"
      ;;
    # ML tasks - MINIMAL UAM (risk-aware)
    gpt2-codegolf)
      category="ml"
      ;;
    # General coding - MINIMAL UAM
    code-from-image | financial-document-processor | log-summary-date-ranges)
      category="coding"
      ;;
  esac

  echo "$category"
}
|
|
56
|
-
|
|
57
|
-
# Generate context based on classification (Hybrid Adaptive logic)
|
|
58
|
-
get_adaptive_context() {
|
|
59
|
-
local category="$1"
|
|
60
|
-
|
|
61
|
-
case "$category" in
|
|
62
|
-
skip)
|
|
63
|
-
# No context for pure reasoning tasks
|
|
64
|
-
echo ""
|
|
65
|
-
;;
|
|
66
|
-
security)
|
|
67
|
-
cat << 'EOF'
|
|
68
|
-
## UAM Memory Context (Security)
|
|
69
|
-
|
|
70
|
-
### Security
|
|
71
|
-
- XSS: Use bleach.clean() or DOMPurify. Remove script, onclick, onerror, javascript:
|
|
72
|
-
- Password cracking: hashcat -m 11600 for 7z, -m 0 MD5, -m 1000 NTLM, john for CPU
|
|
73
|
-
- Binary secrets: strings, objdump -d, check .rodata section
|
|
74
|
-
- CVE exploitation: Research exact steps before attempting, some operations irreversible
|
|
75
|
-
|
|
76
|
-
### Tools
|
|
77
|
-
- hashcat: GPU password cracking, -m flag for hash type
|
|
78
|
-
- john: CPU password cracking, flexible format support
|
|
79
|
-
- readelf -a / objdump -d: Binary analysis
|
|
80
|
-
- strings: Extract printable strings from binaries
|
|
81
|
-
EOF
|
|
82
|
-
;;
|
|
83
|
-
file-ops)
|
|
84
|
-
cat << 'EOF'
|
|
85
|
-
## UAM Memory Context (File Operations)
|
|
86
|
-
|
|
87
|
-
### File Formats
|
|
88
|
-
- ELF: Program headers at e_phoff. Use struct.unpack('<HH...') for parsing
|
|
89
|
-
- SQLite WAL: Header 32 bytes, frames follow. PRAGMA wal_checkpoint(TRUNCATE) to recover
|
|
90
|
-
- 7z: Install p7zip-full, use 7z x -p'password' for password protected archives
|
|
91
|
-
|
|
92
|
-
### Coding
|
|
93
|
-
- Use absolute paths (/app/...) not relative
|
|
94
|
-
- Verify file exists before reading
|
|
95
|
-
- Handle edge cases in parsing
|
|
96
|
-
EOF
|
|
97
|
-
;;
|
|
98
|
-
legacy)
|
|
99
|
-
cat << 'EOF'
|
|
100
|
-
## UAM Memory Context (Legacy Code)
|
|
101
|
-
|
|
102
|
-
### Legacy Code
|
|
103
|
-
- COBOL: Fixed-format columns (1-6 sequence, 7 indicator, 8-72 code)
|
|
104
|
-
- WORKING-STORAGE SECTION for variables
|
|
105
|
-
- Modernization: Preserve business logic, update data structures
|
|
106
|
-
- Test with original inputs to verify behavior matches
|
|
107
|
-
EOF
|
|
108
|
-
;;
|
|
109
|
-
ml)
|
|
110
|
-
cat << 'EOF'
|
|
111
|
-
## UAM Memory Context (ML - Risk Aware)
|
|
112
|
-
|
|
113
|
-
### ML Training (Risk-Aware)
|
|
114
|
-
- CRITICAL: Test with small epochs/batch first before full runs
|
|
115
|
-
- Training can exceed 5 minutes - verify shapes and configs early
|
|
116
|
-
- Check GPU: nvidia-smi, CUDA_VISIBLE_DEVICES for GPU selection
|
|
117
|
-
- Cache datasets to avoid re-downloads
|
|
118
|
-
|
|
119
|
-
### Coding
|
|
120
|
-
- Use absolute paths (/app/...)
|
|
121
|
-
- Match exact output format required
|
|
122
|
-
EOF
|
|
123
|
-
;;
|
|
124
|
-
coding|general)
|
|
125
|
-
cat << 'EOF'
|
|
126
|
-
## UAM Memory Context (Coding)
|
|
127
|
-
|
|
128
|
-
### Coding
|
|
129
|
-
- Use absolute paths (/app/...) not relative
|
|
130
|
-
- Verify file exists before reading
|
|
131
|
-
- Handle edge cases in parsing
|
|
132
|
-
- Match exact output format required
|
|
133
|
-
EOF
|
|
134
|
-
;;
|
|
135
|
-
esac
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
# Main execution
|
|
139
|
-
# Tasks included in this benchmark run.
TASKS=(
  "crack-7z-hash"
  "filter-js-from-html"
  "cobol-modernization"
  "code-from-image"
  "sqlite-db-truncate"
  "extract-elf"
  "db-wal-recovery"
  "vulnerable-secret"
  "chess-best-move"
  "log-summary-date-ranges"
  "password-recovery"
  "gpt2-codegolf"
  "constraints-scheduling"
  "financial-document-processor"
  "regex-chess"
)

TIMESTAMP=$(date +%Y-%m-%d__%H-%M-%S)
JOBS_DIR="$PROJECT_ROOT/jobs/tbench_hybrid_adaptive_$TIMESTAMP"

echo "=============================================="
echo " Hybrid Adaptive UAM Terminal-Bench Runner"
echo " (Option 4 Implementation)"
echo "=============================================="
echo "Tasks: ${#TASKS[@]}"
echo "Output: $JOBS_DIR"
echo ""

# Show classification plan
echo "Task Classification (Hybrid Adaptive):"
echo "---------------------------------------"
SKIP_COUNT=0
FULL_COUNT=0
MINIMAL_COUNT=0

for task in "${TASKS[@]}"; do
  category=$(classify_task "$task")
  # Counters use x=$((x + 1)) rather than ((x++)): the latter returns status 1
  # when the old value is 0, which aborts the script under `set -e`.
  case "$category" in
    skip)
      echo " $task → NO UAM (reasoning/games - prevents timeout)"
      SKIP_COUNT=$((SKIP_COUNT + 1))
      ;;
    security | file-ops | legacy)
      echo " $task → FULL UAM ($category context)"
      FULL_COUNT=$((FULL_COUNT + 1))
      ;;
    ml | coding | general)
      echo " $task → MINIMAL UAM ($category context)"
      MINIMAL_COUNT=$((MINIMAL_COUNT + 1))
      ;;
  esac
done

echo ""
echo "Summary: $SKIP_COUNT skip, $FULL_COUNT full, $MINIMAL_COUNT minimal"
echo ""

# Build combined context (excluding pure reasoning tasks)
# This is the Hybrid Adaptive context that combines relevant sections
# NOTE(review): the single harbor invocation below passes this same context
# for every task in TASKS, including those classified as "skip" above, so the
# printed classification plan is informational only — confirm this is intended.
COMBINED_CONTEXT="## UAM Hybrid Adaptive Memory Context

### Security (for security tasks)
- XSS: bleach.clean(), remove script/onclick/javascript:
- Password: hashcat -m 11600 (7z), -m 0 (MD5), john for CPU
- Binary: strings, objdump -d, check .rodata

### File Formats (for file-ops tasks)
- ELF: e_phoff for headers, struct.unpack('<HH...')
- SQLite WAL: PRAGMA wal_checkpoint(TRUNCATE)
- 7z: p7zip, 7z x -p'password'

### Legacy (for modernization tasks)
- COBOL: columns 1-6 sequence, 7 indicator, 8-72 code
- WORKING-STORAGE for variables
- Test with original inputs

### Coding (minimal, for applicable tasks)
- Use absolute paths /app/
- Verify files exist before reading
- Match exact output format"

echo "Starting benchmark..."
echo ""

# Build task arguments as an array so each "-t <task>" pair stays intact
# without relying on unquoted word splitting.
TASK_ARGS=()
for task in "${TASKS[@]}"; do
  TASK_ARGS+=(-t "$task")
done

# Run with Harbor
harbor run -d terminal-bench@2.0 \
  -a claude-code \
  -m anthropic/claude-opus-4-5 \
  --ak "append_system_prompt=$COMBINED_CONTEXT" \
  "${TASK_ARGS[@]}" \
  -k 1 \
  --jobs-dir "$JOBS_DIR" \
  -n 8 \
  --timeout-multiplier 2.0

echo ""
echo "=============================================="
echo " Benchmark Complete"
echo "=============================================="
echo "Results: $JOBS_DIR/result.json"
echo ""
echo "Expected improvements over baseline:"
echo " - constraints-scheduling: Should PASS (no UAM overhead)"
echo " - extract-elf: Should PASS (file format context)"
echo " - password-recovery: Should PASS (security context)"
echo ""
echo "Compare with: jobs/tbench_uam_15/*/result.json"
|