universal-agent-memory 6.1.1 → 6.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,413 +0,0 @@
1
- #!/bin/bash
2
- #
3
- # Full Terminal-Bench 2.0 Benchmark: UAM v3.1.0 vs Baseline
4
- # Runs all 3 models x 2 configs = 6 total benchmark runs
5
- #
6
- # Models: Claude Opus 4.5, GPT 5.2 Codex, GLM 4.7
7
- # Configs: Baseline (no UAM), With UAM
8
- #
9
- # Usage:
10
- # export FACTORY_API_KEY="your-key"
11
- # ./scripts/run-full-benchmark.sh [options]
12
- #
13
- # Options:
14
- # --model <model> Run only this model (e.g. anthropic/claude-opus-4-5)
15
- # --baseline-only Skip UAM runs
16
- # --uam-only Skip baseline runs
17
- # --concurrency <n> Parallel tasks per run (default: 4)
18
- # --timeout-mult <f> Timeout multiplier (default: 2.0)
19
- # --dry-run Print commands without executing
20
- # --resume <timestamp> Resume a previous run using its timestamp
21
- # --help Show help
22
- #
23
-
24
# Strict mode: abort on errors, on unset variables, and on failures inside pipelines.
set -euo pipefail

# The associative arrays declared below need bash 4+. Fail early with a clear
# message instead of a cryptic "declare: -A: invalid option" on older shells.
if (( BASH_VERSINFO[0] < 4 )); then
  echo "Error: bash 4.0 or newer is required (found ${BASH_VERSION})" >&2
  exit 1
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
RESULTS_DIR="$PROJECT_ROOT/benchmark-results"
# Default run identifier; parse_args overwrites it when --resume is given.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# Models in Harbor format, mapped to the short tag used in job names
declare -A MODEL_MAP=(
  ["anthropic/claude-opus-4-5"]="opus45"
  ["openai/gpt-5.2-codex"]="gpt52"
  ["zhipu/glm-4.7"]="glm47"
)

ALL_MODELS=("anthropic/claude-opus-4-5" "openai/gpt-5.2-codex" "zhipu/glm-4.7")

# Defaults
CONCURRENCY=4
TIMEOUT_MULT=2.0
DATASET="terminal-bench@2.0"
RUN_BASELINE=true
RUN_UAM=true
DRY_RUN=false
SELECTED_MODELS=("${ALL_MODELS[@]}")
RESUME_TS=""

# Track run results for summary (keyed by "<config>_<model tag>")
declare -A RUN_STATUS=()
declare -A RUN_JOBS=()
53
-
54
# Print the usage text embedded in this script's header comment, then exit 0.
# Emits line 2 through the first blank line of the script file, stripping the
# leading "#" and at most one following space from every line.
usage() {
  # BASH_SOURCE[0] rather than $0, consistent with how SCRIPT_DIR is derived,
  # so the right file is read even if the script is sourced.
  sed -n -e '2,/^$/{' -e 's/^#//' -e 's/^ //' -e 'p' -e '}' "${BASH_SOURCE[0]}"
  exit 0
}
58
-
59
# Abort with a clear message when a value-taking option is the last argument.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
_require_option_value() {
  if (( $2 < 2 )); then
    echo "Error: $1 requires a value" >&2
    exit 1
  fi
}

# Parse command-line options into the global run configuration.
# Globals written: SELECTED_MODELS, RUN_UAM, RUN_BASELINE, CONCURRENCY,
#                  TIMEOUT_MULT, DRY_RUN, RESUME_TS, TIMESTAMP
# Globals read:    MODEL_MAP (to validate --model)
# Exits 1 on an unknown option, a missing option value, or an unknown model.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model)
        _require_option_value "$1" "$#"
        # Reject models without a MODEL_MAP entry here; otherwise the run
        # dies later with "unbound variable" when the short tag is looked up.
        if [[ -z "$2" || -z "${MODEL_MAP[$2]+set}" ]]; then
          echo "Error: unknown model '$2'. Supported: ${!MODEL_MAP[*]}" >&2
          exit 1
        fi
        SELECTED_MODELS=("$2")
        shift 2
        ;;
      --baseline-only)
        RUN_UAM=false
        shift
        ;;
      --uam-only)
        RUN_BASELINE=false
        shift
        ;;
      --concurrency)
        _require_option_value "$1" "$#"
        CONCURRENCY="$2"
        shift 2
        ;;
      --timeout-mult)
        _require_option_value "$1" "$#"
        TIMEOUT_MULT="$2"
        shift 2
        ;;
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --resume)
        _require_option_value "$1" "$#"
        # Resuming reuses the earlier timestamp so job names match the old run.
        RESUME_TS="$2"
        TIMESTAMP="$2"
        shift 2
        ;;
      --help)
        usage
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}
74
-
75
# Verify that the run can start: the harbor CLI must be resolvable and at
# least one of FACTORY_API_KEY, DROID_API_KEY or ANTHROPIC_API_KEY must be
# non-empty. Prints a diagnostic to stderr and exits 1 otherwise.
check_prerequisites() {
  if ! command -v harbor >/dev/null 2>&1; then
    echo "Error: 'harbor' CLI not found. Install from https://github.com/laude-institute/harbor" >&2
    exit 1
  fi

  # Concatenating the three values is empty only when every key is unset/empty.
  if [[ -z "${FACTORY_API_KEY:-}${DROID_API_KEY:-}${ANTHROPIC_API_KEY:-}" ]]; then
    echo "Error: No API key found. Set FACTORY_API_KEY, DROID_API_KEY, or ANTHROPIC_API_KEY" >&2
    echo "Get your Factory key at: https://app.factory.ai/settings/api-keys" >&2
    exit 1
  fi
}
87
-
88
# Print a timestamped, colour-tagged log line to stdout.
# Arguments: $1 - level (INFO, OK, WARN, ERROR, RUN); remaining args - message
# The message is printed verbatim: printf '%s' is used instead of `echo -e`
# so backslashes in paths or tool output are not interpreted as escapes.
log() {
  local level="$1"; shift
  local ts color
  ts=$(date +"%H:%M:%S")
  case "$level" in
    INFO)  color=36 ;;
    OK)    color=32 ;;
    WARN)  color=33 ;;
    ERROR) color=31 ;;
    RUN)   color=35 ;;
    *)
      # Unknown level: still emit the message (uncoloured) rather than drop it.
      printf '[%s] %s %s\n' "$ts" "$level" "$*"
      return 0
      ;;
  esac
  printf '[%s] \033[%sm%s\033[0m %s\n' "$ts" "$color" "$level" "$*"
}
100
-
101
# Run one Harbor benchmark job and record its outcome.
# Arguments: $1 - config type ("baseline" or "uam"); $2 - model in Harbor format
# Globals read:    MODEL_MAP, TIMESTAMP, RESULTS_DIR, RESUME_TS, DATASET,
#                  CONCURRENCY, TIMEOUT_MULT, DRY_RUN
# Globals written: RUN_STATUS[<config>_<tag>] (skipped|dry-run|success|failed),
#                  RUN_JOBS[<config>_<tag>] (job name)
# Returns: 0 once the run was attempted (even if harbor failed, so remaining
#          runs still execute); 1 if the model has no MODEL_MAP entry.
run_harbor() {
  local config_type="$1"   # "baseline" or "uam"
  local model="$2"
  local model_short="${MODEL_MAP[$model]:-}"
  if [[ -z "$model_short" ]]; then
    log ERROR "Unknown model '$model' (no entry in MODEL_MAP)"
    return 1
  fi
  local job_name="${config_type}_${model_short}_${TIMESTAMP}"
  local log_file="$RESULTS_DIR/${job_name}.log"
  local run_key="${config_type}_${model_short}"

  # Skip if already completed (resume mode)
  if [[ -n "$RESUME_TS" ]] && [[ -f "$RESULTS_DIR/${job_name}/result.json" ]]; then
    log INFO "Skipping $job_name (already completed)"
    RUN_STATUS[$run_key]="skipped"
    RUN_JOBS[$run_key]="$job_name"
    return 0
  fi

  log RUN "$config_type | $model | job=$job_name"

  local cmd=(
    harbor run
    -d "$DATASET"
    -m "$model"
    -n "$CONCURRENCY"
    --timeout-multiplier "$TIMEOUT_MULT"
    --job-name "$job_name"
    --jobs-dir "$RESULTS_DIR"
  )

  if [[ "$config_type" == "baseline" ]]; then
    # Baseline: vanilla claude-code agent with no UAM context
    cmd+=(-a claude-code --ak "system_prompt=")
  else
    # UAM: custom agent with classified preamble and pre-execution hooks
    cmd+=(--agent-import-path "uam_harbor.uam_agent:UAMAgent")
  fi

  if [[ "$DRY_RUN" == true ]]; then
    echo " [DRY RUN] ${cmd[*]}"
    RUN_STATUS[$run_key]="dry-run"
    RUN_JOBS[$run_key]="$job_name"
    return 0
  fi

  mkdir -p "$RESULTS_DIR"

  local start_time
  start_time=$(date +%s)

  # Check every pipeline stage explicitly so that a harbor failure is seen
  # even if `pipefail` is not in effect (tee alone would report success).
  local outcome="failed"
  if "${cmd[@]}" 2>&1 | tee "$log_file"; [[ "${PIPESTATUS[*]}" == "0 0" ]]; then
    outcome="success"
  fi
  RUN_STATUS[$run_key]="$outcome"
  RUN_JOBS[$run_key]="$job_name"

  local end_time
  end_time=$(date +%s)
  local duration=$(( end_time - start_time ))
  local hours=$(( duration / 3600 ))
  local minutes=$(( (duration % 3600) / 60 ))

  # Only claim completion for successful runs; failures get a WARN line.
  if [[ "$outcome" == "success" ]]; then
    log OK "$job_name completed in ${hours}h ${minutes}m"
  else
    log WARN "$job_name exited with non-zero status after ${hours}h ${minutes}m"
  fi
}
166
-
167
# Print the end-of-run table: one row per selected model and config with the
# recorded status ("not-run" when absent) and job name ("N/A" when absent),
# followed by the results directory and the run timestamp.
# Globals read: SELECTED_MODELS, MODEL_MAP, RUN_STATUS, RUN_JOBS, RESULTS_DIR,
#               TIMESTAMP
print_summary() {
  local rule="================================================================"
  local row_fmt=" %-12s %-30s %-10s %s\n"
  local tag lookup state job_name

  printf '%s\n' "" "$rule" " BENCHMARK SUMMARY" "$rule" ""
  # shellcheck disable=SC2059 — row_fmt is a fixed format string defined above
  printf "$row_fmt" "Config" "Model" "Status" "Job Name"
  # shellcheck disable=SC2059
  printf "$row_fmt" "------" "-----" "------" "--------"

  for model in "${SELECTED_MODELS[@]}"; do
    tag="${MODEL_MAP[$model]}"
    for config in baseline uam; do
      lookup="${config}_${tag}"
      state="${RUN_STATUS[$lookup]:-not-run}"
      job_name="${RUN_JOBS[$lookup]:-N/A}"
      # shellcheck disable=SC2059
      printf "$row_fmt" "$config" "$model" "$state" "$job_name"
    done
  done

  printf '%s\n' "" " Results directory: $RESULTS_DIR" " Timestamp: $TIMESTAMP" ""
}
191
-
192
# Produce the comparison report for this run.
# Runs the TypeScript generator (generate-comparison-report.ts next to this
# script) via `npx tsx`, passing one --baseline/--uam <job dir> pair per
# recorded job. Falls back to generate_basic_report when the generator is
# missing or exits non-zero.
# Globals read: SCRIPT_DIR, RESULTS_DIR, TIMESTAMP, SELECTED_MODELS, MODEL_MAP,
#               RUN_BASELINE, RUN_UAM, RUN_JOBS
generate_report() {
  log INFO "Generating comparison report..."

  local report_script="$SCRIPT_DIR/generate-comparison-report.ts"
  if [[ ! -f "$report_script" ]]; then
    log WARN "Report generator not found at $report_script"
    log INFO "Generating basic summary instead..."
    generate_basic_report
    return
  fi

  # Run the TypeScript report generator
  local report_output="$RESULTS_DIR/FULL_COMPARISON_${TIMESTAMP}.md"

  # Collect arguments in an array so job directories containing spaces stay
  # intact (a string expanded unquoted would be word-split).
  local -a job_args=()
  local model model_short bj uj
  for model in "${SELECTED_MODELS[@]}"; do
    model_short="${MODEL_MAP[$model]}"
    if [[ "$RUN_BASELINE" == true ]]; then
      bj="${RUN_JOBS[baseline_${model_short}]:-}"
      if [[ -n "$bj" ]]; then
        job_args+=(--baseline "$RESULTS_DIR/$bj")
      fi
    fi
    if [[ "$RUN_UAM" == true ]]; then
      uj="${RUN_JOBS[uam_${model_short}]:-}"
      if [[ -n "$uj" ]]; then
        job_args+=(--uam "$RESULTS_DIR/$uj")
      fi
    fi
  done

  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # bash versions older than 4.4.
  if npx tsx "$report_script" \
      --output "$report_output" \
      --timestamp "$TIMESTAMP" \
      ${job_args[@]+"${job_args[@]}"} 2>&1; then
    log OK "Report saved to $report_output"
  else
    log WARN "TypeScript report generator failed, falling back to basic report"
    generate_basic_report
  fi
}
234
-
235
# Print "rate|passed|failed|errors" for the first eval in a Harbor result.json.
# Arguments: $1 - path to result.json (passed via argv, never spliced into
#            the Python source, so any character in the path is safe)
_basic_report_stats() {
  python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    data = json.load(fh)
for _name, ev in data["stats"]["evals"].items():
    rw = ev.get("reward_stats", {}).get("reward", {})
    p = len(rw.get("1.0", []))
    f = len(rw.get("0.0", []))
    total = p + f
    rate = p / total * 100 if total > 0 else 0
    err = ev.get("n_errors", 0)
    print(f"{rate:.1f}%|{p}|{f}|{err}")
    break  # only the first line was ever consumed by the caller
' "$1"
}

# Print the markdown delta table comparing a baseline and a UAM result.json.
# Arguments: $1 - baseline result.json, $2 - UAM result.json
_basic_report_delta() {
  python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    bd = json.load(fh)
with open(sys.argv[2]) as fh:
    ud = json.load(fh)

def get_tasks(data):
    # Task names are the part of each trial id before the first "__".
    evals = data["stats"]["evals"]
    for k, v in evals.items():
        rw = v.get("reward_stats", {}).get("reward", {})
        passed = set(t.split("__")[0] for t in rw.get("1.0", []))
        failed = set(t.split("__")[0] for t in rw.get("0.0", []))
        return passed, failed
    return set(), set()

bp, bf = get_tasks(bd)
up, uf = get_tasks(ud)

uam_wins = sorted(up - bp)
baseline_wins = sorted(bp - up)
both_pass = sorted(bp & up)
both_fail = sorted(bf & uf)

b_rate = len(bp)/(len(bp)+len(bf))*100 if (len(bp)+len(bf))>0 else 0
u_rate = len(up)/(len(up)+len(uf))*100 if (len(up)+len(uf))>0 else 0
delta = u_rate - b_rate

print(f"| Metric | Value |")
print(f"|--------|-------|")
print(f"| Baseline pass rate | {b_rate:.1f}% ({len(bp)}/{len(bp)+len(bf)}) |")
print(f"| UAM pass rate | {u_rate:.1f}% ({len(up)}/{len(up)+len(uf)}) |")
print(f"| **Net delta** | **{delta:+.1f}%** ({len(uam_wins)-len(baseline_wins):+d} tasks) |")
print(f"| UAM wins | {len(uam_wins)} tasks |")
print(f"| Baseline wins | {len(baseline_wins)} tasks |")
print(f"| Both pass | {len(both_pass)} tasks |")
print(f"| Both fail | {len(both_fail)} tasks |")
print()

if uam_wins:
    print("**UAM wins:** " + ", ".join(uam_wins))
    print()
if baseline_wins:
    print("**Baseline wins:** " + ", ".join(baseline_wins))
    print()
' "$1" "$2"
}

# Write a markdown comparison report to
# $RESULTS_DIR/FULL_COMPARISON_<TIMESTAMP>.md using only python3 and the
# result.json file of each recorded job: a summary table (N/A rows when a
# result is missing or unparsable) and, per model with both results present,
# a baseline-vs-UAM delta section.
# Globals read: RESULTS_DIR, TIMESTAMP, DATASET, CONCURRENCY, TIMEOUT_MULT,
#               SELECTED_MODELS, MODEL_MAP, RUN_JOBS
generate_basic_report() {
  local report_file="$RESULTS_DIR/FULL_COMPARISON_${TIMESTAMP}.md"
  local model config model_short job result_file stats
  local rate passed failed errors
  local bj uj b_result u_result delta

  cat > "$report_file" << HEADER
# Terminal-Bench 2.0 Full Comparison: UAM v3.1.0 vs Baseline

**Generated:** $(date -Iseconds)
**Dataset:** $DATASET (89 tasks)
**UAM Version:** 3.1.0
**Concurrency:** $CONCURRENCY | **Timeout Multiplier:** $TIMEOUT_MULT

## Results Summary

| Model | Config | Pass Rate | Passed | Failed | Errors |
|-------|--------|-----------|--------|--------|--------|
HEADER

  for model in "${SELECTED_MODELS[@]}"; do
    model_short="${MODEL_MAP[$model]}"
    for config in baseline uam; do
      job="${RUN_JOBS[${config}_${model_short}]:-}"
      result_file="$RESULTS_DIR/$job/result.json"

      if [[ -n "$job" && -f "$result_file" ]]; then
        # Parse failures and empty output both degrade to an N/A row.
        stats=$(_basic_report_stats "$result_file" 2>/dev/null) || stats=""
        [[ -n "$stats" ]] || stats="N/A|N/A|N/A|N/A"

        IFS='|' read -r rate passed failed errors <<< "$stats"
        echo "| $model | $config | $rate | $passed | $failed | $errors |"
      else
        echo "| $model | $config | N/A | N/A | N/A | N/A |"
      fi
    done
  done >> "$report_file"

  # Add per-model delta section
  cat >> "$report_file" << 'DELTAS'

## Per-Model UAM Delta

DELTAS

  for model in "${SELECTED_MODELS[@]}"; do
    model_short="${MODEL_MAP[$model]}"
    bj="${RUN_JOBS[baseline_${model_short}]:-}"
    uj="${RUN_JOBS[uam_${model_short}]:-}"
    b_result="$RESULTS_DIR/$bj/result.json"
    u_result="$RESULTS_DIR/$uj/result.json"

    # Require real job names: with an empty one the path would collapse to
    # "$RESULTS_DIR//result.json" and could match an unrelated file.
    if [[ -n "$bj" && -n "$uj" && -f "$b_result" && -f "$u_result" ]]; then
      {
        echo "### $model"
        echo ""
        # Capture first so a mid-run Python failure cannot leave a partial
        # table in the report ahead of the fallback line.
        if delta=$(_basic_report_delta "$b_result" "$u_result" 2>/dev/null); then
          printf '%s\n\n' "$delta"
        else
          echo "Unable to parse results for $model"
        fi
        echo ""
      } >> "$report_file"
    fi
  done

  {
    echo ""
    echo "---"
    echo "*Report generated by \`scripts/run-full-benchmark.sh\` at $(date -Iseconds)*"
  } >> "$report_file"

  log OK "Basic report saved to $report_file"
}
358
-
359
- # === Main ===
360
-
361
# Entry point: parse options, print the run banner, verify prerequisites, run
# every selected model under each enabled config, then report and summarise.
# Arguments: the script's command-line arguments
# Globals read: SELECTED_MODELS, RUN_BASELINE, RUN_UAM, CONCURRENCY,
#               TIMEOUT_MULT, RESULTS_DIR, TIMESTAMP, DRY_RUN
main() {
  parse_args "$@"

  # Enabled configs, in execution order.
  local -a configs=()
  if [[ "$RUN_BASELINE" == true ]]; then configs+=("baseline"); fi
  if [[ "$RUN_UAM" == true ]]; then configs+=("uam"); fi

  # Banner text for the configs line ("baseline uam", "baseline ", "uam" or "").
  local config_label=""
  if [[ "$RUN_BASELINE" == true ]]; then config_label="baseline "; fi
  if [[ "$RUN_UAM" == true ]]; then config_label+="uam"; fi

  echo "================================================================"
  echo " Terminal-Bench 2.0 Full Benchmark"
  echo " UAM v3.1.0 vs Baseline | $(date)"
  echo "================================================================"
  echo ""
  echo " Models: ${SELECTED_MODELS[*]}"
  echo " Configs: $config_label"
  echo " Concurrency: $CONCURRENCY"
  echo " Timeout: ${TIMEOUT_MULT}x"
  echo " Results: $RESULTS_DIR"
  echo " Timestamp: $TIMESTAMP"
  echo ""

  check_prerequisites

  # Run each model x config combination
  local run_count=0
  local total_runs=$(( ${#SELECTED_MODELS[@]} * ${#configs[@]} ))
  local model config

  log INFO "Starting $total_runs benchmark runs..."

  for model in "${SELECTED_MODELS[@]}"; do
    # ${arr[@]+...} keeps an empty config list safe under `set -u` on bash < 4.4.
    for config in ${configs[@]+"${configs[@]}"}; do
      run_count=$(( run_count + 1 ))
      log INFO "Run $run_count/$total_runs"
      run_harbor "$config" "$model"
    done
  done

  # Generate report. A dry run only prints commands: there are no results to
  # report on, and the results directory may not even exist, so skip it.
  if [[ "$DRY_RUN" == true ]]; then
    log INFO "Dry run: skipping report generation"
  else
    generate_report
  fi

  # Print summary
  print_summary

  log OK "All benchmark runs complete."
}
412
-
413
- main "$@"
@@ -1,252 +0,0 @@
1
- #!/bin/bash
2
- #
3
- # Run Terminal-Bench with Hybrid Adaptive UAM Context (Option 4)
4
- #
5
- # Key improvements over previous UAM runs:
6
- # 1. Task classification skips UAM for reasoning/scheduling tasks
7
- # 2. Time pressure assessment prevents timeout regressions
8
- # 3. Historical benefit tracking optimizes context loading
9
- # 4. Progressive context escalation on retry
10
- # 5. Environment bootstrapping (Factory Droid technique)
11
- # 6. Risk-aware prompting (Apex2 technique)
12
- #
13
-
14
# Abort on the first command that fails.
set -e

# Absolute directory holding this script, and the project root one level up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"

# Source environment. Best effort: a missing or noisy ~/.profile is ignored.
. ~/.profile 2>/dev/null || true
21
-
22
- # Task classification function (mirrors TypeScript logic)
23
# Map a Terminal-Bench task name to its UAM context category.
# Arguments: $1 - task name
# Outputs:   one of skip, security, file-ops, legacy, ml, coding, general
#            ("general" for any task not listed below)
classify_task() {
  local name="$1"
  local category

  case "$name" in
    # Pure reasoning/scheduling - SKIP UAM (prevents timeout regression)
    constraints-scheduling | chess-best-move | regex-chess)
      category="skip"
      ;;
    # Security tasks - FULL UAM
    crack-7z-hash | password-recovery | vulnerable-secret | \
      filter-js-from-html | break-filter-js-from-html)
      category="security"
      ;;
    # File operations - FULL UAM
    extract-elf | sqlite-db-truncate | db-wal-recovery)
      category="file-ops"
      ;;
    # Legacy/modernization - FULL UAM
    cobol-modernization)
      category="legacy"
      ;;
    # ML tasks - MINIMAL UAM (risk-aware)
    gpt2-codegolf)
      category="ml"
      ;;
    # General coding - MINIMAL UAM
    code-from-image | financial-document-processor | log-summary-date-ranges)
      category="coding"
      ;;
    # Default
    *)
      category="general"
      ;;
  esac

  printf '%s\n' "$category"
}
56
-
57
- # Generate context based on classification (Hybrid Adaptive logic)
58
# Print the markdown memory context for a task category (Hybrid Adaptive logic).
# Arguments: $1 - category as produced by classify_task
# Outputs:   the context block on stdout; a single empty line for "skip";
#            nothing at all for a category not handled below
get_adaptive_context() {
  local kind="$1"

  case "$kind" in
    skip)
      # No context for pure reasoning tasks
      printf '\n'
      ;;
    security)
      cat << 'SECURITY_MD'
## UAM Memory Context (Security)

### Security
- XSS: Use bleach.clean() or DOMPurify. Remove script, onclick, onerror, javascript:
- Password cracking: hashcat -m 11600 for 7z, -m 0 MD5, -m 1000 NTLM, john for CPU
- Binary secrets: strings, objdump -d, check .rodata section
- CVE exploitation: Research exact steps before attempting, some operations irreversible

### Tools
- hashcat: GPU password cracking, -m flag for hash type
- john: CPU password cracking, flexible format support
- readelf -a / objdump -d: Binary analysis
- strings: Extract printable strings from binaries
SECURITY_MD
      ;;
    file-ops)
      cat << 'FILEOPS_MD'
## UAM Memory Context (File Operations)

### File Formats
- ELF: Program headers at e_phoff. Use struct.unpack('<HH...') for parsing
- SQLite WAL: Header 32 bytes, frames follow. PRAGMA wal_checkpoint(TRUNCATE) to recover
- 7z: Install p7zip-full, use 7z x -p'password' for password protected archives

### Coding
- Use absolute paths (/app/...) not relative
- Verify file exists before reading
- Handle edge cases in parsing
FILEOPS_MD
      ;;
    legacy)
      cat << 'LEGACY_MD'
## UAM Memory Context (Legacy Code)

### Legacy Code
- COBOL: Fixed-format columns (1-6 sequence, 7 indicator, 8-72 code)
- WORKING-STORAGE SECTION for variables
- Modernization: Preserve business logic, update data structures
- Test with original inputs to verify behavior matches
LEGACY_MD
      ;;
    ml)
      cat << 'ML_MD'
## UAM Memory Context (ML - Risk Aware)

### ML Training (Risk-Aware)
- CRITICAL: Test with small epochs/batch first before full runs
- Training can exceed 5 minutes - verify shapes and configs early
- Check GPU: nvidia-smi, CUDA_VISIBLE_DEVICES for GPU selection
- Cache datasets to avoid re-downloads

### Coding
- Use absolute paths (/app/...)
- Match exact output format required
ML_MD
      ;;
    general | coding)
      cat << 'CODING_MD'
## UAM Memory Context (Coding)

### Coding
- Use absolute paths (/app/...) not relative
- Verify file exists before reading
- Handle edge cases in parsing
- Match exact output format required
CODING_MD
      ;;
  esac
}
137
-
138
- # Main execution
139
# Tasks included in this benchmark run.
TASKS=(
  "crack-7z-hash"
  "filter-js-from-html"
  "cobol-modernization"
  "code-from-image"
  "sqlite-db-truncate"
  "extract-elf"
  "db-wal-recovery"
  "vulnerable-secret"
  "chess-best-move"
  "log-summary-date-ranges"
  "password-recovery"
  "gpt2-codegolf"
  "constraints-scheduling"
  "financial-document-processor"
  "regex-chess"
)

TIMESTAMP=$(date +%Y-%m-%d__%H-%M-%S)
JOBS_DIR="$PROJECT_ROOT/jobs/tbench_hybrid_adaptive_$TIMESTAMP"

echo "=============================================="
echo " Hybrid Adaptive UAM Terminal-Bench Runner"
echo " (Option 4 Implementation)"
echo "=============================================="
echo "Tasks: ${#TASKS[@]}"
echo "Output: $JOBS_DIR"
echo ""

# Show classification plan
echo "Task Classification (Hybrid Adaptive):"
echo "---------------------------------------"
SKIP_COUNT=0
FULL_COUNT=0
MINIMAL_COUNT=0

for task in "${TASKS[@]}"; do
  category=$(classify_task "$task")
  # Counters use assignment form: `((n++))` returns status 1 when n is 0,
  # which under `set -e` would abort the script on the very first task.
  case "$category" in
    skip)
      echo " $task → NO UAM (reasoning/games - prevents timeout)"
      SKIP_COUNT=$((SKIP_COUNT + 1))
      ;;
    security|file-ops|legacy)
      echo " $task → FULL UAM ($category context)"
      FULL_COUNT=$((FULL_COUNT + 1))
      ;;
    ml|coding|general)
      echo " $task → MINIMAL UAM ($category context)"
      MINIMAL_COUNT=$((MINIMAL_COUNT + 1))
      ;;
  esac
done

echo ""
echo "Summary: $SKIP_COUNT skip, $FULL_COUNT full, $MINIMAL_COUNT minimal"
echo ""

# Build combined context (excluding pure reasoning tasks)
# This is the Hybrid Adaptive context that combines relevant sections
COMBINED_CONTEXT="## UAM Hybrid Adaptive Memory Context

### Security (for security tasks)
- XSS: bleach.clean(), remove script/onclick/javascript:
- Password: hashcat -m 11600 (7z), -m 0 (MD5), john for CPU
- Binary: strings, objdump -d, check .rodata

### File Formats (for file-ops tasks)
- ELF: e_phoff for headers, struct.unpack('<HH...')
- SQLite WAL: PRAGMA wal_checkpoint(TRUNCATE)
- 7z: p7zip, 7z x -p'password'

### Legacy (for modernization tasks)
- COBOL: columns 1-6 sequence, 7 indicator, 8-72 code
- WORKING-STORAGE for variables
- Test with original inputs

### Coding (minimal, for applicable tasks)
- Use absolute paths /app/
- Verify files exist before reading
- Match exact output format"

echo "Starting benchmark..."
echo ""

# Build task arguments as an array so each "-t <task>" pair survives intact
# without relying on unquoted word splitting.
TASK_ARGS=()
for task in "${TASKS[@]}"; do
  TASK_ARGS+=(-t "$task")
done

# Run with Harbor
harbor run -d terminal-bench@2.0 \
  -a claude-code \
  -m anthropic/claude-opus-4-5 \
  --ak "append_system_prompt=$COMBINED_CONTEXT" \
  "${TASK_ARGS[@]}" \
  -k 1 \
  --jobs-dir "$JOBS_DIR" \
  -n 8 \
  --timeout-multiplier 2.0

echo ""
echo "=============================================="
echo " Benchmark Complete"
echo "=============================================="
echo "Results: $JOBS_DIR/result.json"
echo ""
echo "Expected improvements over baseline:"
echo " - constraints-scheduling: Should PASS (no UAM overhead)"
echo " - extract-elf: Should PASS (file format context)"
echo " - password-recovery: Should PASS (security context)"
echo ""
echo "Compare with: jobs/tbench_uam_15/*/result.json"