@miller-tech/uap 1.39.0 → 1.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. package/README.md +109 -642
  2. package/dist/.tsbuildinfo +1 -1
  3. package/dist/bin/cli.js +2 -2
  4. package/dist/bin/cli.js.map +1 -1
  5. package/dist/cli/deliver.d.ts +3 -2
  6. package/dist/cli/deliver.d.ts.map +1 -1
  7. package/dist/cli/deliver.js +10 -5
  8. package/dist/cli/deliver.js.map +1 -1
  9. package/docs/INDEX.md +48 -286
  10. package/docs/architecture/OVERVIEW.md +328 -0
  11. package/docs/architecture/PROTOCOL.md +204 -0
  12. package/docs/benchmarks/README.md +17 -192
  13. package/docs/getting-started/CONFIGURATION.md +237 -0
  14. package/docs/getting-started/INSTALLATION.md +125 -0
  15. package/docs/getting-started/QUICKSTART.md +115 -0
  16. package/docs/guides/COORDINATION.md +162 -0
  17. package/docs/guides/DELIVER.md +115 -0
  18. package/docs/guides/DEPLOY_BATCHING.md +212 -0
  19. package/docs/guides/DROIDS_AND_SKILLS.md +202 -0
  20. package/docs/guides/LOCAL_MODELS.md +148 -0
  21. package/docs/guides/MCP_ROUTER.md +195 -0
  22. package/docs/guides/MEMORY.md +235 -0
  23. package/docs/guides/MULTI_MODEL.md +223 -0
  24. package/docs/guides/POLICIES.md +190 -0
  25. package/docs/guides/WORKTREE_WORKFLOW.md +185 -0
  26. package/docs/integrations/MCP_ROUTER.md +147 -0
  27. package/docs/integrations/RTK.md +102 -0
  28. package/docs/reference/API.md +485 -0
  29. package/docs/reference/CLI.md +719 -0
  30. package/docs/reference/CONFIGURATION.md +90 -193
  31. package/docs/reference/DATABASE_SCHEMA.md +110 -344
  32. package/docs/reference/FEATURES.md +176 -472
  33. package/docs/reference/PATTERNS.md +102 -0
  34. package/docs/reference/PLATFORMS.md +83 -0
  35. package/package.json +1 -1
  36. package/docs/AGENTS.md +0 -423
  37. package/docs/DOCUMENTATION_AUDIT_REPORT.md +0 -131
  38. package/docs/GETTING_STARTED.md +0 -288
  39. package/docs/PROJECT_ANALYSIS_REPORT.md +0 -510
  40. package/docs/architecture/COMPLETE_ARCHITECTURE.md +0 -748
  41. package/docs/architecture/EXPERT_STACK.md +0 -137
  42. package/docs/architecture/MULTI_MODEL.md +0 -224
  43. package/docs/architecture/PLATFORM_GATING.md +0 -68
  44. package/docs/architecture/SYSTEM_ANALYSIS.md +0 -334
  45. package/docs/architecture/UAP_COMPLIANCE.md +0 -217
  46. package/docs/architecture/UAP_PROTOCOL.md +0 -339
  47. package/docs/architecture/UAP_STRICT_DROIDS.md +0 -172
  48. package/docs/archive/BALLS_MODE_SELF_ANALYSIS.md +0 -260
  49. package/docs/archive/BENCHMARK_GAPS_AND_PLAN.md +0 -146
  50. package/docs/archive/FAILING_TASKS_SOLUTION_PLAN.md +0 -668
  51. package/docs/archive/JINJA2-SYSTEM-MESSAGE-FIX.md +0 -209
  52. package/docs/archive/MODEL_ROUTING_IMPLEMENTATION_SUMMARY.md +0 -281
  53. package/docs/archive/MODEL_ROUTING_OPTIMIZATION_PLAN.md +0 -320
  54. package/docs/archive/NPM-PUBLISH-V0.9.1.md +0 -240
  55. package/docs/archive/OPTIMIZATION_OPTIONS.md +0 -334
  56. package/docs/archive/PARALLELISM_GAPS_AND_OPTIONS.md +0 -422
  57. package/docs/archive/POLICY_GATE_IMPLEMENTATION.md +0 -245
  58. package/docs/archive/SETUP_IMPROVEMENTS.md +0 -213
  59. package/docs/archive/UAP_GENERIC_OPTIMIZATION_PLAN.md +0 -270
  60. package/docs/archive/UAP_OPTIMIZATION_PLAN.md +0 -701
  61. package/docs/archive/UAP_V103_PATTERN_DESIGN.md +0 -315
  62. package/docs/archive/UAP_V104_COMPLIANCE_DESIGN.md +0 -223
  63. package/docs/archive/changelog/2026-03-10_uap-100-compliance.md +0 -77
  64. package/docs/archive/changelog/2026-03-10_uap-full-system-verification.md +0 -109
  65. package/docs/archive/opencode-integration-guide.md +0 -740
  66. package/docs/archive/opencode-integration-quickref.md +0 -180
  67. package/docs/benchmarks/OVERNIGHT_RUNNER.md +0 -341
  68. package/docs/benchmarks/SPECULATIVE_DECODING_JOURNEY_2026-03.md +0 -221
  69. package/docs/benchmarks/VALIDATION_PLAN.md +0 -568
  70. package/docs/blog/SPECULATIVE_DECODING_PRODUCTION_PLAYBOOK.md +0 -139
  71. package/docs/blog/local-coding-agents.md +0 -266
  72. package/docs/blog/x-thread.md +0 -254
  73. package/docs/deployment/DEPLOYMENT.md +0 -895
  74. package/docs/deployment/DEPLOYMENT_STRATEGIES.md +0 -518
  75. package/docs/deployment/DEPLOY_BATCHER_ANALYSIS.md +0 -224
  76. package/docs/deployment/DEPLOY_BATCHING.md +0 -273
  77. package/docs/deployment/DEPLOY_BUCKETING_ANALYSIS.md +0 -420
  78. package/docs/deployment/QWEN35_LLAMA_CPP.md +0 -426
  79. package/docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md +0 -279
  80. package/docs/getting-started/INTEGRATION.md +0 -628
  81. package/docs/getting-started/OVERVIEW.md +0 -324
  82. package/docs/getting-started/SETUP.md +0 -377
  83. package/docs/integrations/MCP_ROUTER_SETUP.md +0 -445
  84. package/docs/integrations/RTK_INTEGRATION.md +0 -468
  85. package/docs/operations/TROUBLESHOOTING.md +0 -660
  86. package/docs/pr/PR_SPECULATIVE_DOCS_TEMPLATE.md +0 -146
  87. package/docs/pr/UPSTREAM_PRS.md +0 -424
  88. package/docs/reference/API_REFERENCE.md +0 -903
  89. package/docs/reference/EXPERT_DROIDS.md +0 -219
  90. package/docs/reference/HARNESS-MATRIX.md +0 -318
  91. package/docs/reference/PATTERN_LIBRARY.md +0 -636
  92. package/docs/reference/UAP_CLI_REFERENCE.md +0 -620
  93. package/docs/research/BEHAVIORAL_PATTERNS.md +0 -228
  94. package/docs/research/DOMAIN_STRATEGIES.md +0 -316
  95. package/docs/research/MEMORY_SYSTEMS_COMPARISON.md +0 -812
  96. package/docs/research/PATTERN_ANALYSIS_2026-01-18.md +0 -436
  97. package/docs/research/PERFORMANCE_ANALYSIS_2026-01-18.md +0 -209
  98. package/docs/research/PERFORMANCE_TEST_PLAN.md +0 -383
  99. package/docs/research/TERMINAL_BENCH_LEARNINGS.md +0 -217
@@ -1,260 +0,0 @@
1
- # UAP Project Self-Analysis Using Balls-Mode
2
-
3
- **Date:** 2026-01-19
4
- **Analysis Method:** Balls-Mode Decomposed Reasoning with Confidence Scoring
5
- **Purpose:** Identify optimization opportunities for Terminal-Bench performance
6
-
7
- ---
8
-
9
- ## /balls Can UAP improve Terminal-Bench pass rate from 54% to 70%?
10
-
11
- ### Step 1: CLASSIFY
12
-
13
- **Complex** - Multi-faceted optimization question requiring analysis of:
14
-
15
- - Current failure modes
16
- - Pattern effectiveness
17
- - Implementation gaps
18
- - Resource constraints
19
-
20
- ---
21
-
22
- ### Step 2: DECOMPOSE
23
-
24
- | # | Ball | Why it matters |
25
- | --- | ------------------------------------------ | ---------------------------------- |
26
- | 1 | Are patterns being applied correctly? | Patterns exist but may not trigger |
27
- | 2 | Which tasks are near-miss (>50% subtests)? | Highest ROI targets |
28
- | 3 | Are pre-hooks working? | Domain knowledge injection |
29
- | 4 | Is the model capable enough? | Some tasks need stronger model |
30
- | 5 | Are impossible tasks detected early? | Avoid wasting time |
31
- | 6 | Is output verification happening? | 37% of failures are missing files |
32
- | 7 | Is round-trip testing done? | Compression/encoding failures |
33
- | 8 | Are domain libraries used? | Chess/stats need specialized tools |
34
-
35
- ---
36
-
37
- ### Step 3: SOLVE & VERIFY
38
-
39
- #### Ball 1: Are patterns being applied correctly?
40
-
41
- **Evidence:**
42
-
43
- - Pattern Router prints analysis block ✓
44
- - But: winning-avg-corewars showed 47% improvement when hooks worked
45
- - Some patterns in CLAUDE.md but not enforced
46
-
47
- **Answer:** Patterns exist but compliance is inconsistent
48
-
49
- #### Ball 2: Which tasks are near-miss?
50
-
51
- **Evidence from benchmark data:**
52
-
53
- - adaptive-rejection-sampler: 8/9 (89%) - 1 test away
54
- - headless-terminal: 6/7 (86%) - 1 test away
55
- - winning-avg-corewars: 4/5 subtests (80%) - 1% threshold miss
56
- - write-compressor: 2/3 (67%) - round-trip issue
57
- - pytorch-model-cli: shebang/chmod issue
58
-
59
- **Answer:** 5 tasks within striking distance (could add +5 to pass count)
60
-
61
- #### Ball 3: Are pre-hooks working?
62
-
63
- **Evidence:**
64
-
65
- - CoreWars: +47% improvement when hook provided strategy
66
- - XSS: Hook didn't help (still failing)
67
- - Compression: Hook didn't fix round-trip issue
68
-
69
- **Answer:** Pre-hooks work for domain knowledge, not for implementation bugs
70
-
71
- #### Ball 4: Is the model capable enough?
72
-
73
- **Evidence:**
74
-
75
- - Opus 4.5: 54% pass rate
76
- - Sonnet 4: 11% pass rate
77
- - Droid (top leaderboard): 65% with GPT-5.2
78
-
79
- **Answer:** Model matters significantly; Opus 4.5 is competitive but not best
80
-
81
- #### Ball 5: Are impossible tasks detected early?
82
-
83
- **Evidence:**
84
-
85
- - gpt2-codegolf: Still attempted despite being impossible
86
- - Pattern 5/19 exists but agent doesn't refuse quickly
87
-
88
- **Answer:** Detection exists but enforcement is weak
89
-
90
- #### Ball 6: Is output verification happening?
91
-
92
- **Evidence:**
93
-
94
- - P12 (OEV) added in v10.2
95
- - But: pytorch-model-cli failed due to missing shebang
96
- - filter-js-from-html: file not created
97
-
98
- **Answer:** Verification exists but not enforced pre-completion
99
-
100
- #### Ball 7: Is round-trip testing done?
101
-
102
- **Evidence:**
103
-
104
- - write-compressor: Decompression fails
105
- - P34 (Reversibility Verification) added
106
- - Agent doesn't test round-trip before optimizing
107
-
108
- **Answer:** Pattern exists but not followed
109
-
110
- #### Ball 8: Are domain libraries used?
111
-
112
- **Evidence:**
113
-
114
- - chess-best-move: Visual FEN extraction failed
115
- - adaptive-rejection-sampler: Custom R code vs CRAN 'ars' package
116
- - P32 recommends libraries over custom
117
-
118
- **Answer:** Agents reinvent instead of using established libraries
119
-
120
- ---
121
-
122
- ### Step 4: SCORE
123
-
124
- | Ball | Answer | Confidence | Notes |
125
- | -------------------- | ----------------------------------------- | ---------- | -------------------------- |
126
- | Pattern compliance | Inconsistent - some patterns not enforced | 0.7 | Clear from logs |
127
- | Near-miss tasks | 5 tasks within reach | 0.85 | Data-driven |
128
- | Pre-hooks effective | Yes for domain knowledge | 0.75 | CoreWars proves it |
129
- | Model capability | Opus 4.5 competitive | 0.8 | Leaderboard data |
130
- | Impossible detection | Weak enforcement | 0.6 | gpt2-codegolf evidence |
131
- | Output verification | Exists but not enforced | 0.65 | pytorch-model-cli failure |
132
- | Round-trip testing | Not done systematically | 0.5 | write-compressor shows gap |
133
- | Library usage | Reinventing instead of reusing | 0.7 | Chess/R failures |
134
-
135
- ---
136
-
137
- ### Step 5: SYNTHESIZE
138
-
139
- ## Synthesis
140
-
141
- **Answer**: Yes, 70% is achievable by fixing 3 key gaps:
142
-
143
- 1. **Enforce pattern compliance** - Make Gate checks mandatory, not advisory
144
- 2. **Target near-miss tasks** - 5 tasks at >50% need specific fixes
145
- 3. **Use established libraries** - Chess (python-chess), Stats (CRAN ars), Compression (zlib)
146
-
147
- **Overall Confidence**: 0.65
148
-
149
- **Weakest Links**:
150
-
151
- - Round-trip testing (0.5) - Compression tasks will keep failing without this
152
- - Impossible detection (0.6) - Time wasted on gpt2-codegolf
153
-
154
- **To increase confidence**:
155
-
156
- 1. Run targeted tests on the 5 near-miss tasks with specific fixes
157
- 2. Add mandatory round-trip verification for compression tasks
158
- 3. Implement library-first pattern in pre-hooks
159
-
160
- ---
161
-
162
- ## Specific Optimization Actions
163
-
164
- ### High-Priority (Addresses weakest balls)
165
-
166
- #### 1. Mandatory Round-Trip Verification Hook
167
-
168
- ```bash
169
- # Pre-hook for compression tasks
170
- cat > /tmp/verify_roundtrip.py << 'EOF'
171
- import sys
172
- def verify(compress_fn, decompress_fn, test_data):
173
-     compressed = compress_fn(test_data)
174
-     decompressed = decompress_fn(compressed)
175
-     assert decompressed == test_data, "Round-trip failed!"
176
-     return True
177
- EOF
178
- echo "CRITICAL: Test round-trip BEFORE optimizing size"
179
- ```
180
-
181
- #### 2. Library-First Pattern for Domain Tasks
182
-
183
- ```markdown
184
- ### Pattern 37: Library-First for Domain Tasks
185
-
186
- When task involves well-known domain (chess, statistics, compression):
187
-
188
- 1. SEARCH for established library FIRST: PyPI (pypi.org; `pip search` is no longer supported), apt-cache, CRAN
189
- 2. Install and use library instead of implementing from scratch
190
- 3. Libraries handle edge cases you'll miss
191
-
192
- Examples:
193
-
194
- - Chess: python-chess + stockfish
195
- - Statistics: R 'ars' package for ARS
196
- - Compression: zlib, lz4 (not custom Huffman)
197
- ```
198
-
199
- #### 3. CLI Execution Verification
200
-
201
- ```bash
202
- # For any script-creation task
203
- cat > /tmp/verify_cli.sh << 'EOF'
204
- # Check for shebang
205
- head -1 "$1" | grep -q "^#!" || echo "MISSING SHEBANG"
206
- # Check executable
207
- test -x "$1" || echo "NOT EXECUTABLE - run chmod +x"
208
- # Test execution
209
- ./"$1" --help 2>/dev/null || echo "EXECUTION FAILED"
210
- EOF
211
- ```
212
-
213
- ### Medium-Priority (Near-miss fixes)
214
-
215
- | Task | Fix | Confidence Gain |
216
- | -------------------------- | -------------------------------------------- | --------------- |
217
- | adaptive-rejection-sampler | Use CRAN 'ars' package | +0.3 |
218
- | winning-avg-corewars | Tune paper.red threshold (need 75%, got 74%) | +0.2 |
219
- | write-compressor | Add round-trip test before optimization | +0.3 |
220
- | pytorch-model-cli | Enforce shebang + chmod | +0.25 |
221
- | headless-terminal | Debug specific failing escape sequence | +0.2 |
222
-
223
- ### Low-Priority (Already handling)
224
-
225
- - Pattern Router - Working
226
- - Output existence verification - Mostly working
227
- - Domain pre-hooks - Working for CoreWars
228
-
229
- ---
230
-
231
- ## Expected Impact
232
-
233
- | Metric | Current | After Fixes | Delta |
234
- | ------------------------- | ------- | ----------- | ----------- |
235
- | Pass Rate                 | 54%     | ~70%        | +16 pp      |
236
- | Near-miss conversion | 0/5 | 4/5 | +4 tasks |
237
- | Time wasted on impossible | High | Low | -20% tokens |
238
-
239
- ---
240
-
241
- ## Balls-Mode Skill Integration
242
-
243
- The balls-mode skill is now available at `.factory/skills/balls-mode/SKILL.md`.
244
-
245
- **When to invoke during Terminal-Bench:**
246
-
247
- 1. After first failure - decompose what went wrong
248
- 2. Before complex architectural decisions
249
- 3. When confidence in approach is <0.5
250
-
251
- **Integration with existing patterns:**
252
-
253
- - Use BEFORE P16 (Task-First Execution) for complex tasks
254
- - Complement P17 (Constraint Extraction) with confidence scoring
255
- - Use AFTER P12 (Output Verification) fails, to debug why
256
-
257
- ---
258
-
259
- **Analysis Complete**: 2026-01-19
260
- **Next Step**: Run targeted benchmark on near-miss tasks with specific fixes
@@ -1,146 +0,0 @@
1
- # UAP Benchmark: Actual Gaps & Execution Plan
2
-
3
- **Generated:** 2026-03-17
4
- **Benchmark:** Harbor Terminal-Bench 2.0 (89 tasks)
5
- **Primary Target:** Qwen3.5 35B A3B (IQ4_XS)
6
-
7
- ---
8
-
9
- ## What Already Exists (DO NOT REBUILD)
10
-
11
- | Component | File | Status |
12
- | -------------------------------- | ---------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |
13
- | Baseline benchmark (no UAP) | `scripts/benchmarks/benchmark-qwen35-baseline-no-uap.tsx` | 403 lines, 94 tasks |
14
- | UAP benchmark (full integration) | `scripts/benchmarks/benchmark-qwen35-uap-3.0-opencode.tsx` | 812 lines, 89 tasks |
15
- | Harbor quick runner (UAP) | `scripts/benchmarks/run-tbench-qwen35-quick.sh` | 459 lines, hybrid-adaptive |
16
- | Harbor baseline+UAP runner | `scripts/benchmarks/run-harbor-qwen35-benchmark.sh` | Runs both configs sequentially |
17
- | Harbor YAML configs | `benchmarks/harbor-configs/qwen35_*.yaml` | Baseline + UAP pair |
18
- | Comparison report generator | `scripts/benchmarks/generate-comparison-report.ts` | 461 lines, p-value tests |
19
- | Full benchmark harness | `scripts/benchmarks/run-full-benchmark.sh` | 413 lines, multi-model A/B |
20
- | Multi-turn agent loop | `src/benchmarks/multi-turn-loop.ts` | 213 lines, `executeWithRetry()` |
21
- | Multi-turn + verification | `src/benchmarks/multi-turn-agent.ts` | Wired to dynamic retrieval |
22
- | Improved benchmark runner | `src/benchmarks/improved-benchmark.ts` | 794 lines, wires multi-turn + dynamic retrieval + task classification + hierarchical prompting |
23
- | Dynamic memory retrieval | `src/memory/dynamic-retrieval.ts` | 1168 lines, 6 memory sources, adaptive depth |
24
- | Task classifier | `src/memory/task-classifier.ts` | 426 lines, 8 categories, ambiguity detection |
25
- | Qdrant embeddings | `src/memory/embeddings.ts` | Fixed, 5 backends with fallback |
26
- | Tool call retry (Qwen) | `tools/agents/scripts/qwen_tool_call_wrapper.py` | 686 lines, 6 retry strategies |
27
- | Harbor UAP agent | `tools/uap_harbor/uap_agent.py` | 379 lines, classified preamble |
28
- | Qwen3.5 model presets | `src/models/types.ts:136-151` | `qwen35-a3b` and `qwen35` defined |
29
- | Model router | `src/models/router.ts` | Qwen3.5 as default executor |
30
-
31
- ---
32
-
33
- ## Actual Gaps (3 items)
34
-
35
- ### Gap 1: `improved-benchmark.ts` MODELS array missing Qwen3.5
36
-
37
- `src/benchmarks/improved-benchmark.ts:95-99` has the fully wired runner (multi-turn + dynamic retrieval + task classification + hierarchical prompting + verification) but its MODELS array only contains:
38
-
39
- ```typescript
40
- const MODELS: ModelConfig[] = [
41
- { id: 'opus-4.5', name: 'Claude Opus 4.5', apiModel: 'claude-opus-4-5-20251101' },
42
- { id: 'glm-4.7', name: 'GLM 4.7', apiModel: 'glm-4.7' },
43
- { id: 'gpt-5.2-codex', name: 'GPT 5.2 Codex', apiModel: 'gpt-5.2-codex' },
44
- ];
45
- // Qwen3.5 MISSING
46
- ```
47
-
48
- **Fix:** Add Qwen3.5 to the MODELS array. The preset already exists in `src/models/types.ts:136-151`.
49
-
50
- ### Gap 2: `model-integration.ts` MODELS array missing Qwen3.5 + still single-shot
51
-
52
- `src/benchmarks/model-integration.ts:336-361` is the older benchmark runner. It:
53
-
54
- - Has no Qwen3.5 in its MODELS array
55
- - Uses single-shot execution (no multi-turn, no dynamic retrieval)
56
-
57
- **Fix:** Add Qwen3.5 to its MODELS array. The multi-turn wiring gap is already solved by `improved-benchmark.ts` -- this file can remain as the "legacy single-shot" runner for comparison purposes.
58
-
59
- ### Gap 3: No benchmark results exist
60
-
61
- `benchmark-results/` directory does not exist. None of the scripts have been executed.
62
-
63
- **Fix:** Run the existing scripts.
64
-
65
- ---
66
-
67
- ## Execution Plan
68
-
69
- ### Step 1: Add Qwen3.5 to improved-benchmark.ts MODELS array
70
-
71
- **File:** `src/benchmarks/improved-benchmark.ts:95-99`
72
-
73
- ```typescript
74
- const MODELS: ModelConfig[] = [
75
- { id: 'opus-4.5', name: 'Claude Opus 4.5', apiModel: 'claude-opus-4-5-20251101' },
76
- { id: 'glm-4.7', name: 'GLM 4.7', apiModel: 'glm-4.7' },
77
- { id: 'gpt-5.2-codex', name: 'GPT 5.2 Codex', apiModel: 'gpt-5.2-codex' },
78
- { id: 'qwen35-a3b', name: 'Qwen 3.5 35B A3B', apiModel: 'qwen35-a3b-iq4xs' },
79
- ];
80
- ```
81
-
82
- ### Step 2: Add Qwen3.5 to model-integration.ts MODELS array
83
-
84
- **File:** `src/benchmarks/model-integration.ts:336-361`
85
-
86
- ```typescript
87
- {
88
- id: 'qwen35-a3b',
89
- name: 'Qwen 3.5 35B A3B',
90
- provider: 'local',
91
- apiModel: 'qwen35-a3b-iq4xs',
92
- },
93
- ```
94
-
95
- ### Step 3: Run existing benchmarks
96
-
97
- ```bash
98
- # Option A: Quick Qwen3.5 baseline + UAP via Harbor (recommended first)
99
- ./scripts/benchmarks/run-harbor-qwen35-benchmark.sh
100
-
101
- # Option B: Direct API baseline (no Harbor containers)
102
- npx tsx scripts/benchmarks/benchmark-qwen35-baseline-no-uap.tsx
103
-
104
- # Option C: Direct API UAP-enhanced
105
- npx tsx scripts/benchmarks/benchmark-qwen35-uap-3.0-opencode.tsx
106
-
107
- # Option D: Improved benchmark with multi-turn + dynamic retrieval (all models)
108
- npx tsx src/benchmarks/improved-benchmark.ts
109
-
110
- # Option E: Full Harbor harness (baseline vs UAP; multi-model capable, limited here to Qwen3.5 via --model)
111
- ./scripts/benchmarks/run-full-benchmark.sh --model qwen35-a3b-iq4xs
112
- ```
113
-
114
- ### Step 4: Generate comparison report
115
-
116
- ```bash
117
- npx tsx scripts/benchmarks/generate-comparison-report.ts \
118
- --baseline benchmark-results/qwen35_baseline_no_uap/ \
119
- --uap benchmark-results/qwen35_uap_3.0_opencode/
120
- ```
121
-
122
- ---
123
-
124
- ## What This Plan Does NOT Do (because it already exists)
125
-
126
- - Build a multi-turn agent loop (exists: `src/benchmarks/multi-turn-loop.ts`)
127
- - Build dynamic memory retrieval (exists: `src/memory/dynamic-retrieval.ts`)
128
- - Build task classification (exists: `src/memory/task-classifier.ts`)
129
- - Fix Qdrant embeddings (already fixed: `src/memory/embeddings.ts`)
130
- - Build Harbor configs (exist: `benchmarks/harbor-configs/qwen35_*.yaml`)
131
- - Build comparison report generator (exists: `scripts/benchmarks/generate-comparison-report.ts`)
132
- - Wire multi-turn into benchmark runner (exists: `src/benchmarks/improved-benchmark.ts`)
133
- - Build tool call retry for Qwen (exists: `tools/agents/scripts/qwen_tool_call_wrapper.py`)
134
- - Create execution scripts (exist: 6+ scripts in `scripts/benchmarks/`)
135
-
136
- ---
137
-
138
- ## Estimated Effort
139
-
140
- | Step | Effort | Type |
141
- | ------------------------------------ | -------------- | -------------------------------------- |
142
- | Add Qwen3.5 to improved-benchmark.ts | 2 minutes | Code change (1 line) |
143
- | Add Qwen3.5 to model-integration.ts | 2 minutes | Code change (5 lines) |
144
- | Run benchmarks | 2-8 hours | Execution (depends on model speed) |
145
- | Review results | 30 minutes | Analysis |
146
- | **Total** | **~3-9 hours** | Mostly waiting for benchmark execution |