@miller-tech/uap 1.40.0 → 1.41.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/README.md +109 -642
  2. package/dist/.tsbuildinfo +1 -1
  3. package/dist/cli/deliver-defaults.d.ts +23 -0
  4. package/dist/cli/deliver-defaults.d.ts.map +1 -0
  5. package/dist/cli/deliver-defaults.js +121 -0
  6. package/dist/cli/deliver-defaults.js.map +1 -0
  7. package/dist/cli/init.d.ts.map +1 -1
  8. package/dist/cli/init.js +29 -0
  9. package/dist/cli/init.js.map +1 -1
  10. package/dist/cli/setup.d.ts.map +1 -1
  11. package/dist/cli/setup.js +19 -0
  12. package/dist/cli/setup.js.map +1 -1
  13. package/dist/policies/policy-tools.d.ts +7 -0
  14. package/dist/policies/policy-tools.d.ts.map +1 -1
  15. package/dist/policies/policy-tools.js +24 -2
  16. package/dist/policies/policy-tools.js.map +1 -1
  17. package/docs/INDEX.md +48 -286
  18. package/docs/architecture/OVERVIEW.md +328 -0
  19. package/docs/architecture/PROTOCOL.md +204 -0
  20. package/docs/benchmarks/README.md +17 -192
  21. package/docs/getting-started/CONFIGURATION.md +237 -0
  22. package/docs/getting-started/INSTALLATION.md +125 -0
  23. package/docs/getting-started/QUICKSTART.md +115 -0
  24. package/docs/guides/COORDINATION.md +162 -0
  25. package/docs/guides/DELIVER.md +115 -0
  26. package/docs/guides/DEPLOY_BATCHING.md +212 -0
  27. package/docs/guides/DROIDS_AND_SKILLS.md +202 -0
  28. package/docs/guides/LOCAL_MODELS.md +148 -0
  29. package/docs/guides/MCP_ROUTER.md +195 -0
  30. package/docs/guides/MEMORY.md +235 -0
  31. package/docs/guides/MULTI_MODEL.md +223 -0
  32. package/docs/guides/POLICIES.md +190 -0
  33. package/docs/guides/WORKTREE_WORKFLOW.md +185 -0
  34. package/docs/integrations/MCP_ROUTER.md +147 -0
  35. package/docs/integrations/RTK.md +102 -0
  36. package/docs/reference/API.md +485 -0
  37. package/docs/reference/CLI.md +719 -0
  38. package/docs/reference/CONFIGURATION.md +90 -193
  39. package/docs/reference/DATABASE_SCHEMA.md +110 -344
  40. package/docs/reference/FEATURES.md +176 -472
  41. package/docs/reference/PATTERNS.md +102 -0
  42. package/docs/reference/PLATFORMS.md +83 -0
  43. package/package.json +3 -1
  44. package/src/policies/enforcers/7ebbc721-7540-4e9f-879a-770e0213a09b_architecture_review.py +101 -0
  45. package/src/policies/enforcers/__pycache__/_common.cpython-312.pyc +0 -0
  46. package/src/policies/enforcers/_common.py +100 -0
  47. package/src/policies/enforcers/artifact_hygiene.py +52 -0
  48. package/src/policies/enforcers/cluster_routing.py +63 -0
  49. package/src/policies/enforcers/codebase_read_before_plan.py +52 -0
  50. package/src/policies/enforcers/coord_overlap.py +81 -0
  51. package/src/policies/enforcers/delivery_enforcement.py +97 -0
  52. package/src/policies/enforcers/doc_live_over_report.py +50 -0
  53. package/src/policies/enforcers/expert_review_required.py +135 -0
  54. package/src/policies/enforcers/iac_parity.py +53 -0
  55. package/src/policies/enforcers/mcp_router_first.py +37 -0
  56. package/src/policies/enforcers/memory_before_plan.py +61 -0
  57. package/src/policies/enforcers/parallel_reads.py +50 -0
  58. package/src/policies/enforcers/rtk_wrap.py +44 -0
  59. package/src/policies/enforcers/schema_diff_gate.py +80 -0
  60. package/src/policies/enforcers/session_memory_write.py +52 -0
  61. package/src/policies/enforcers/task_required.py +131 -0
  62. package/src/policies/enforcers/test_gate.py +58 -0
  63. package/src/policies/enforcers/validate_plan_before_build.py +75 -0
  64. package/src/policies/enforcers/worktree_required.py +57 -0
  65. package/src/policies/schemas/policies/architecture-review.md +51 -0
  66. package/src/policies/schemas/policies/artifact-hygiene.md +29 -0
  67. package/src/policies/schemas/policies/cluster-routing.md +31 -0
  68. package/src/policies/schemas/policies/codebase-read-before-plan.md +30 -0
  69. package/src/policies/schemas/policies/coord-overlap.md +24 -0
  70. package/src/policies/schemas/policies/delivery-enforcement.md +45 -0
  71. package/src/policies/schemas/policies/doc-live-over-report.md +32 -0
  72. package/src/policies/schemas/policies/expert-review-required.md +60 -0
  73. package/src/policies/schemas/policies/iac-parity.md +31 -0
  74. package/src/policies/schemas/policies/mandatory-testing-deployment.md +147 -0
  75. package/src/policies/schemas/policies/mcp-router-first.md +24 -0
  76. package/src/policies/schemas/policies/memory-before-plan.md +24 -0
  77. package/src/policies/schemas/policies/merge-deploy-monitor-verify.md +145 -0
  78. package/src/policies/schemas/policies/parallel-reads.md +24 -0
  79. package/src/policies/schemas/policies/rtk-wrap.md +26 -0
  80. package/src/policies/schemas/policies/schema-diff-gate.md +30 -0
  81. package/src/policies/schemas/policies/session-memory-write.md +24 -0
  82. package/src/policies/schemas/policies/task-required.md +49 -0
  83. package/src/policies/schemas/policies/test-gate.md +24 -0
  84. package/src/policies/schemas/policies/validate-plan-before-build.md +28 -0
  85. package/src/policies/schemas/policies/worktree-required.md +28 -0
  86. package/templates/hooks/uap-policy-gate.sh +5 -0
  87. package/docs/AGENTS.md +0 -423
  88. package/docs/DOCUMENTATION_AUDIT_REPORT.md +0 -131
  89. package/docs/GETTING_STARTED.md +0 -288
  90. package/docs/PROJECT_ANALYSIS_REPORT.md +0 -510
  91. package/docs/architecture/COMPLETE_ARCHITECTURE.md +0 -748
  92. package/docs/architecture/EXPERT_STACK.md +0 -137
  93. package/docs/architecture/MULTI_MODEL.md +0 -224
  94. package/docs/architecture/PLATFORM_GATING.md +0 -68
  95. package/docs/architecture/SYSTEM_ANALYSIS.md +0 -334
  96. package/docs/architecture/UAP_COMPLIANCE.md +0 -217
  97. package/docs/architecture/UAP_PROTOCOL.md +0 -339
  98. package/docs/architecture/UAP_STRICT_DROIDS.md +0 -172
  99. package/docs/archive/BALLS_MODE_SELF_ANALYSIS.md +0 -260
  100. package/docs/archive/BENCHMARK_GAPS_AND_PLAN.md +0 -146
  101. package/docs/archive/FAILING_TASKS_SOLUTION_PLAN.md +0 -668
  102. package/docs/archive/JINJA2-SYSTEM-MESSAGE-FIX.md +0 -209
  103. package/docs/archive/MODEL_ROUTING_IMPLEMENTATION_SUMMARY.md +0 -281
  104. package/docs/archive/MODEL_ROUTING_OPTIMIZATION_PLAN.md +0 -320
  105. package/docs/archive/NPM-PUBLISH-V0.9.1.md +0 -240
  106. package/docs/archive/OPTIMIZATION_OPTIONS.md +0 -334
  107. package/docs/archive/PARALLELISM_GAPS_AND_OPTIONS.md +0 -422
  108. package/docs/archive/POLICY_GATE_IMPLEMENTATION.md +0 -245
  109. package/docs/archive/SETUP_IMPROVEMENTS.md +0 -213
  110. package/docs/archive/UAP_GENERIC_OPTIMIZATION_PLAN.md +0 -270
  111. package/docs/archive/UAP_OPTIMIZATION_PLAN.md +0 -701
  112. package/docs/archive/UAP_V103_PATTERN_DESIGN.md +0 -315
  113. package/docs/archive/UAP_V104_COMPLIANCE_DESIGN.md +0 -223
  114. package/docs/archive/changelog/2026-03-10_uap-100-compliance.md +0 -77
  115. package/docs/archive/changelog/2026-03-10_uap-full-system-verification.md +0 -109
  116. package/docs/archive/opencode-integration-guide.md +0 -740
  117. package/docs/archive/opencode-integration-quickref.md +0 -180
  118. package/docs/benchmarks/OVERNIGHT_RUNNER.md +0 -341
  119. package/docs/benchmarks/SPECULATIVE_DECODING_JOURNEY_2026-03.md +0 -221
  120. package/docs/benchmarks/VALIDATION_PLAN.md +0 -568
  121. package/docs/blog/SPECULATIVE_DECODING_PRODUCTION_PLAYBOOK.md +0 -139
  122. package/docs/blog/local-coding-agents.md +0 -266
  123. package/docs/blog/x-thread.md +0 -254
  124. package/docs/deployment/DEPLOYMENT.md +0 -895
  125. package/docs/deployment/DEPLOYMENT_STRATEGIES.md +0 -518
  126. package/docs/deployment/DEPLOY_BATCHER_ANALYSIS.md +0 -224
  127. package/docs/deployment/DEPLOY_BATCHING.md +0 -273
  128. package/docs/deployment/DEPLOY_BUCKETING_ANALYSIS.md +0 -420
  129. package/docs/deployment/QWEN35_LLAMA_CPP.md +0 -426
  130. package/docs/deployment/UAP_LLAMA_ANTHROPIC_PROXY_BOOTSTRAP.md +0 -279
  131. package/docs/getting-started/INTEGRATION.md +0 -628
  132. package/docs/getting-started/OVERVIEW.md +0 -324
  133. package/docs/getting-started/SETUP.md +0 -377
  134. package/docs/integrations/MCP_ROUTER_SETUP.md +0 -445
  135. package/docs/integrations/RTK_INTEGRATION.md +0 -468
  136. package/docs/operations/TROUBLESHOOTING.md +0 -660
  137. package/docs/pr/PR_SPECULATIVE_DOCS_TEMPLATE.md +0 -146
  138. package/docs/pr/UPSTREAM_PRS.md +0 -424
  139. package/docs/reference/API_REFERENCE.md +0 -903
  140. package/docs/reference/EXPERT_DROIDS.md +0 -219
  141. package/docs/reference/HARNESS-MATRIX.md +0 -318
  142. package/docs/reference/PATTERN_LIBRARY.md +0 -636
  143. package/docs/reference/UAP_CLI_REFERENCE.md +0 -620
  144. package/docs/research/BEHAVIORAL_PATTERNS.md +0 -228
  145. package/docs/research/DOMAIN_STRATEGIES.md +0 -316
  146. package/docs/research/MEMORY_SYSTEMS_COMPARISON.md +0 -812
  147. package/docs/research/PATTERN_ANALYSIS_2026-01-18.md +0 -436
  148. package/docs/research/PERFORMANCE_ANALYSIS_2026-01-18.md +0 -209
  149. package/docs/research/PERFORMANCE_TEST_PLAN.md +0 -383
  150. package/docs/research/TERMINAL_BENCH_LEARNINGS.md +0 -217
@@ -1,260 +0,0 @@
1
- # UAP Project Self-Analysis Using Balls-Mode
2
-
3
- **Date:** 2026-01-19
4
- **Analysis Method:** Balls-Mode Decomposed Reasoning with Confidence Scoring
5
- **Purpose:** Identify optimization opportunities for Terminal-Bench performance
6
-
7
- ---
8
-
9
- ## /balls Can UAP improve Terminal-Bench pass rate from 54% to 70%?
10
-
11
- ### Step 1: CLASSIFY
12
-
13
- **Complex** - Multi-faceted optimization question requiring analysis of:
14
-
15
- - Current failure modes
16
- - Pattern effectiveness
17
- - Implementation gaps
18
- - Resource constraints
19
-
20
- ---
21
-
22
- ### Step 2: DECOMPOSE
23
-
24
- | # | Ball | Why it matters |
25
- | --- | ------------------------------------------ | ---------------------------------- |
26
- | 1 | Are patterns being applied correctly? | Patterns exist but may not trigger |
27
- | 2 | Which tasks are near-miss (>50% subtests)? | Highest ROI targets |
28
- | 3 | Are pre-hooks working? | Domain knowledge injection |
29
- | 4 | Is the model capable enough? | Some tasks need stronger model |
30
- | 5 | Are impossible tasks detected early? | Avoid wasting time |
31
- | 6 | Is output verification happening? | 37% of failures are missing files |
32
- | 7 | Is round-trip testing done? | Compression/encoding failures |
33
- | 8 | Are domain libraries used? | Chess/stats need specialized tools |
34
-
35
- ---
36
-
37
- ### Step 3: SOLVE & VERIFY
38
-
39
- #### Ball 1: Are patterns being applied correctly?
40
-
41
- **Evidence:**
42
-
43
- - Pattern Router prints analysis block ✓
44
- - But: winning-avg-corewars showed 47% improvement when hooks worked
45
- - Some patterns in CLAUDE.md but not enforced
46
-
47
- **Answer:** Patterns exist but compliance is inconsistent
48
-
49
- #### Ball 2: Which tasks are near-miss?
50
-
51
- **Evidence from benchmark data:**
52
-
53
- - adaptive-rejection-sampler: 8/9 (89%) - 1 test away
54
- - headless-terminal: 6/7 (86%) - 1 test away
55
- - winning-avg-corewars: 4/5 subtests (80%) - 1% threshold miss
56
- - write-compressor: 2/3 (67%) - round-trip issue
57
- - pytorch-model-cli: shebang/chmod issue
58
-
59
- **Answer:** 5 tasks within striking distance (could add +5 to pass count)
60
-
61
- #### Ball 3: Are pre-hooks working?
62
-
63
- **Evidence:**
64
-
65
- - CoreWars: +47% improvement when hook provided strategy
66
- - XSS: Hook didn't help (still failing)
67
- - Compression: Hook didn't fix round-trip issue
68
-
69
- **Answer:** Pre-hooks work for domain knowledge, not for implementation bugs
70
-
71
- #### Ball 4: Is the model capable enough?
72
-
73
- **Evidence:**
74
-
75
- - Opus 4.5: 54% pass rate
76
- - Sonnet 4: 11% pass rate
77
- - Droid (top leaderboard): 65% with GPT-5.2
78
-
79
- **Answer:** Model matters significantly; Opus 4.5 is competitive but not best
80
-
81
- #### Ball 5: Are impossible tasks detected early?
82
-
83
- **Evidence:**
84
-
85
- - gpt2-codegolf: Still attempted despite being impossible
86
- - Pattern 5/19 exists but agent doesn't refuse quickly
87
-
88
- **Answer:** Detection exists but enforcement is weak
89
-
90
- #### Ball 6: Is output verification happening?
91
-
92
- **Evidence:**
93
-
94
- - P12 (OEV) added in v10.2
95
- - But: pytorch-model-cli failed due to missing shebang
96
- - filter-js-from-html: file not created
97
-
98
- **Answer:** Verification exists but not enforced pre-completion
99
-
100
- #### Ball 7: Is round-trip testing done?
101
-
102
- **Evidence:**
103
-
104
- - write-compressor: Decompression fails
105
- - P34 (Reversibility Verification) added
106
- - Agent doesn't test round-trip before optimizing
107
-
108
- **Answer:** Pattern exists but not followed
109
-
110
- #### Ball 8: Are domain libraries used?
111
-
112
- **Evidence:**
113
-
114
- - chess-best-move: Visual FEN extraction failed
115
- - adaptive-rejection-sampler: Custom R code vs CRAN 'ars' package
116
- - P32 recommends libraries over custom
117
-
118
- **Answer:** Agents reinvent instead of using established libraries
119
-
120
- ---
121
-
122
- ### Step 4: SCORE
123
-
124
- | Ball | Answer | Confidence | Notes |
125
- | -------------------- | ----------------------------------------- | ---------- | -------------------------- |
126
- | Pattern compliance | Inconsistent - some patterns not enforced | 0.7 | Clear from logs |
127
- | Near-miss tasks | 5 tasks within reach | 0.85 | Data-driven |
128
- | Pre-hooks effective | Yes for domain knowledge | 0.75 | CoreWars proves it |
129
- | Model capability | Opus 4.5 competitive | 0.8 | Leaderboard data |
130
- | Impossible detection | Weak enforcement | 0.6 | gpt2-codegolf evidence |
131
- | Output verification | Exists but not enforced | 0.65 | pytorch-model-cli failure |
132
- | Round-trip testing | Not done systematically | 0.5 | write-compressor shows gap |
133
- | Library usage | Reinventing instead of reusing | 0.7 | Chess/R failures |
134
-
135
- ---
136
-
137
- ### Step 5: SYNTHESIZE
138
-
139
- ## Synthesis
140
-
141
- **Answer**: Yes, 70% is achievable by fixing 3 key gaps:
142
-
143
- 1. **Enforce pattern compliance** - Make Gate checks mandatory, not advisory
144
- 2. **Target near-miss tasks** - 5 tasks at >50% need specific fixes
145
- 3. **Use established libraries** - Chess (python-chess), Stats (CRAN ars), Compression (zlib)
146
-
147
- **Overall Confidence**: 0.65
148
-
149
- **Weakest Links**:
150
-
151
- - Round-trip testing (0.5) - Compression tasks will keep failing without this
152
- - Impossible detection (0.6) - Time wasted on gpt2-codegolf
153
-
154
- **To increase confidence**:
155
-
156
- 1. Run targeted tests on the 5 near-miss tasks with specific fixes
157
- 2. Add mandatory round-trip verification for compression tasks
158
- 3. Implement library-first pattern in pre-hooks
159
-
160
- ---
161
-
162
- ## Specific Optimization Actions
163
-
164
- ### High-Priority (Addresses weakest balls)
165
-
166
- #### 1. Mandatory Round-Trip Verification Hook
167
-
168
- ```bash
169
- # Pre-hook for compression tasks
170
- cat > /tmp/verify_roundtrip.py << 'EOF'
171
- import sys
172
- def verify(compress_fn, decompress_fn, test_data):
173
- compressed = compress_fn(test_data)
174
- decompressed = decompress_fn(compressed)
175
- assert decompressed == test_data, "Round-trip failed!"
176
- return True
177
- EOF
178
- echo "CRITICAL: Test round-trip BEFORE optimizing size"
179
- ```
180
-
181
- #### 2. Library-First Pattern for Domain Tasks
182
-
183
- ```markdown
184
- ### Pattern 37: Library-First for Domain Tasks
185
-
186
- When task involves well-known domain (chess, statistics, compression):
187
-
188
- 1. SEARCH for established library FIRST: PyPI (search pypi.org; `pip search` is no longer supported), apt-cache search, CRAN
189
- 2. Install and use library instead of implementing from scratch
190
- 3. Libraries handle edge cases you'll miss
191
-
192
- Examples:
193
-
194
- - Chess: python-chess + stockfish
195
- - Statistics: R 'ars' package for ARS
196
- - Compression: zlib, lz4 (not custom Huffman)
197
- ```
198
-
199
- #### 3. CLI Execution Verification
200
-
201
- ```bash
202
- # For any script-creation task
203
- cat > /tmp/verify_cli.sh << 'EOF'
204
- # Check for shebang
205
- head -1 "$1" | grep -q "^#!" || echo "MISSING SHEBANG"
206
- # Check executable
207
- test -x "$1" || echo "NOT EXECUTABLE - run chmod +x"
208
- # Test execution
209
- ./"$1" --help 2>/dev/null || echo "EXECUTION FAILED"
210
- EOF
211
- ```
212
-
213
- ### Medium-Priority (Near-miss fixes)
214
-
215
- | Task | Fix | Confidence Gain |
216
- | -------------------------- | -------------------------------------------- | --------------- |
217
- | adaptive-rejection-sampler | Use CRAN 'ars' package | +0.3 |
218
- | winning-avg-corewars | Tune paper.red threshold (need 75%, got 74%) | +0.2 |
219
- | write-compressor | Add round-trip test before optimization | +0.3 |
220
- | pytorch-model-cli | Enforce shebang + chmod | +0.25 |
221
- | headless-terminal | Debug specific failing escape sequence | +0.2 |
222
-
223
- ### Low-Priority (Already handling)
224
-
225
- - Pattern Router - Working
226
- - Output existence verification - Mostly working
227
- - Domain pre-hooks - Working for CoreWars
228
-
229
- ---
230
-
231
- ## Expected Impact
232
-
233
- | Metric | Current | After Fixes | Delta |
234
- | ------------------------- | ------- | ----------- | ----------- |
235
- | Pass Rate | 54% | ~70% | +16 pp |
236
- | Near-miss conversion | 0/5 | 4/5 | +4 tasks |
237
- | Time wasted on impossible | High | Low | -20% tokens |
238
-
239
- ---
240
-
241
- ## Balls-Mode Skill Integration
242
-
243
- The balls-mode skill is now available at `.factory/skills/balls-mode/SKILL.md`.
244
-
245
- **When to invoke during Terminal-Bench:**
246
-
247
- 1. After first failure - decompose what went wrong
248
- 2. Before complex architectural decisions
249
- 3. When confidence in approach is <0.5
250
-
251
- **Integration with existing patterns:**
252
-
253
- - Use BEFORE P16 (Task-First Execution) for complex tasks
254
- - Complement P17 (Constraint Extraction) with confidence scoring
255
- - Use AFTER P12 (Output Verification) fails to debug why
256
-
257
- ---
258
-
259
- **Analysis Complete**: 2026-01-19
260
- **Next Step**: Run targeted benchmark on near-miss tasks with specific fixes
@@ -1,146 +0,0 @@
1
- # UAP Benchmark: Actual Gaps & Execution Plan
2
-
3
- **Generated:** 2026-03-17
4
- **Benchmark:** Harbor Terminal-Bench 2.0 (89 tasks)
5
- **Primary Target:** Qwen3.5 35B A3B (IQ4_XS)
6
-
7
- ---
8
-
9
- ## What Already Exists (DO NOT REBUILD)
10
-
11
- | Component | File | Status |
12
- | -------------------------------- | ---------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |
13
- | Baseline benchmark (no UAP) | `scripts/benchmarks/benchmark-qwen35-baseline-no-uap.tsx` | 403 lines, 94 tasks |
14
- | UAP benchmark (full integration) | `scripts/benchmarks/benchmark-qwen35-uap-3.0-opencode.tsx` | 812 lines, 89 tasks |
15
- | Harbor quick runner (UAP) | `scripts/benchmarks/run-tbench-qwen35-quick.sh` | 459 lines, hybrid-adaptive |
16
- | Harbor baseline+UAP runner | `scripts/benchmarks/run-harbor-qwen35-benchmark.sh` | Runs both configs sequentially |
17
- | Harbor YAML configs | `benchmarks/harbor-configs/qwen35_*.yaml` | Baseline + UAP pair |
18
- | Comparison report generator | `scripts/benchmarks/generate-comparison-report.ts` | 461 lines, p-value tests |
19
- | Full benchmark harness | `scripts/benchmarks/run-full-benchmark.sh` | 413 lines, multi-model A/B |
20
- | Multi-turn agent loop | `src/benchmarks/multi-turn-loop.ts` | 213 lines, `executeWithRetry()` |
21
- | Multi-turn + verification | `src/benchmarks/multi-turn-agent.ts` | Wired to dynamic retrieval |
22
- | Improved benchmark runner | `src/benchmarks/improved-benchmark.ts` | 794 lines, wires multi-turn + dynamic retrieval + task classification + hierarchical prompting |
23
- | Dynamic memory retrieval | `src/memory/dynamic-retrieval.ts` | 1168 lines, 6 memory sources, adaptive depth |
24
- | Task classifier | `src/memory/task-classifier.ts` | 426 lines, 8 categories, ambiguity detection |
25
- | Qdrant embeddings | `src/memory/embeddings.ts` | Fixed, 5 backends with fallback |
26
- | Tool call retry (Qwen) | `tools/agents/scripts/qwen_tool_call_wrapper.py` | 686 lines, 6 retry strategies |
27
- | Harbor UAP agent | `tools/uap_harbor/uap_agent.py` | 379 lines, classified preamble |
28
- | Qwen3.5 model presets | `src/models/types.ts:136-151` | `qwen35-a3b` and `qwen35` defined |
29
- | Model router | `src/models/router.ts` | Qwen3.5 as default executor |
30
-
31
- ---
32
-
33
- ## Actual Gaps (3 items)
34
-
35
- ### Gap 1: `improved-benchmark.ts` MODELS array missing Qwen3.5
36
-
37
- `src/benchmarks/improved-benchmark.ts:95-99` has the fully wired runner (multi-turn + dynamic retrieval + task classification + hierarchical prompting + verification) but its MODELS array only contains:
38
-
39
- ```typescript
40
- const MODELS: ModelConfig[] = [
41
- { id: 'opus-4.5', name: 'Claude Opus 4.5', apiModel: 'claude-opus-4-5-20251101' },
42
- { id: 'glm-4.7', name: 'GLM 4.7', apiModel: 'glm-4.7' },
43
- { id: 'gpt-5.2-codex', name: 'GPT 5.2 Codex', apiModel: 'gpt-5.2-codex' },
44
- ];
45
- // Qwen3.5 MISSING
46
- ```
47
-
48
- **Fix:** Add Qwen3.5 to the MODELS array. The preset already exists in `src/models/types.ts:136-151`.
49
-
50
- ### Gap 2: `model-integration.ts` MODELS array missing Qwen3.5 + still single-shot
51
-
52
- `src/benchmarks/model-integration.ts:336-361` is the older benchmark runner. It:
53
-
54
- - Has no Qwen3.5 in its MODELS array
55
- - Uses single-shot execution (no multi-turn, no dynamic retrieval)
56
-
57
- **Fix:** Add Qwen3.5 to its MODELS array. The multi-turn wiring gap is already solved by `improved-benchmark.ts` -- this file can remain as the "legacy single-shot" runner for comparison purposes.
58
-
59
- ### Gap 3: No benchmark results exist
60
-
61
- `benchmark-results/` directory does not exist. None of the scripts have been executed.
62
-
63
- **Fix:** Run the existing scripts.
64
-
65
- ---
66
-
67
- ## Execution Plan
68
-
69
- ### Step 1: Add Qwen3.5 to improved-benchmark.ts MODELS array
70
-
71
- **File:** `src/benchmarks/improved-benchmark.ts:95-99`
72
-
73
- ```typescript
74
- const MODELS: ModelConfig[] = [
75
- { id: 'opus-4.5', name: 'Claude Opus 4.5', apiModel: 'claude-opus-4-5-20251101' },
76
- { id: 'glm-4.7', name: 'GLM 4.7', apiModel: 'glm-4.7' },
77
- { id: 'gpt-5.2-codex', name: 'GPT 5.2 Codex', apiModel: 'gpt-5.2-codex' },
78
- { id: 'qwen35-a3b', name: 'Qwen 3.5 35B A3B', apiModel: 'qwen35-a3b-iq4xs' },
79
- ];
80
- ```
81
-
82
- ### Step 2: Add Qwen3.5 to model-integration.ts MODELS array
83
-
84
- **File:** `src/benchmarks/model-integration.ts:336-361`
85
-
86
- ```typescript
87
- {
88
- id: 'qwen35-a3b',
89
- name: 'Qwen 3.5 35B A3B',
90
- provider: 'local',
91
- apiModel: 'qwen35-a3b-iq4xs',
92
- },
93
- ```
94
-
95
- ### Step 3: Run existing benchmarks
96
-
97
- ```bash
98
- # Option A: Quick Qwen3.5 baseline + UAP via Harbor (recommended first)
99
- ./scripts/benchmarks/run-harbor-qwen35-benchmark.sh
100
-
101
- # Option B: Direct API baseline (no Harbor containers)
102
- npx tsx scripts/benchmarks/benchmark-qwen35-baseline-no-uap.tsx
103
-
104
- # Option C: Direct API UAP-enhanced
105
- npx tsx scripts/benchmarks/benchmark-qwen35-uap-3.0-opencode.tsx
106
-
107
- # Option D: Improved benchmark with multi-turn + dynamic retrieval (all models)
108
- npx tsx src/benchmarks/improved-benchmark.ts
109
-
110
- # Option E: Full Harbor harness (baseline vs UAP; multi-model harness, invoked here with --model for Qwen3.5 only)
111
- ./scripts/benchmarks/run-full-benchmark.sh --model qwen35-a3b-iq4xs
112
- ```
113
-
114
- ### Step 4: Generate comparison report
115
-
116
- ```bash
117
- npx tsx scripts/benchmarks/generate-comparison-report.ts \
118
- --baseline benchmark-results/qwen35_baseline_no_uap/ \
119
- --uap benchmark-results/qwen35_uap_3.0_opencode/
120
- ```
121
-
122
- ---
123
-
124
- ## What This Plan Does NOT Do (because it already exists)
125
-
126
- - Build a multi-turn agent loop (exists: `src/benchmarks/multi-turn-loop.ts`)
127
- - Build dynamic memory retrieval (exists: `src/memory/dynamic-retrieval.ts`)
128
- - Build task classification (exists: `src/memory/task-classifier.ts`)
129
- - Fix Qdrant embeddings (already fixed: `src/memory/embeddings.ts`)
130
- - Build Harbor configs (exist: `benchmarks/harbor-configs/qwen35_*.yaml`)
131
- - Build comparison report generator (exists: `scripts/benchmarks/generate-comparison-report.ts`)
132
- - Wire multi-turn into benchmark runner (exists: `src/benchmarks/improved-benchmark.ts`)
133
- - Build tool call retry for Qwen (exists: `tools/agents/scripts/qwen_tool_call_wrapper.py`)
134
- - Create execution scripts (exist: 6+ scripts in `scripts/benchmarks/`)
135
-
136
- ---
137
-
138
- ## Estimated Effort
139
-
140
- | Step | Effort | Type |
141
- | ------------------------------------ | -------------- | -------------------------------------- |
142
- | Add Qwen3.5 to improved-benchmark.ts | 2 minutes | Code change (1 line) |
143
- | Add Qwen3.5 to model-integration.ts | 2 minutes | Code change (5 lines) |
144
- | Run benchmarks | 2-8 hours | Execution (depends on model speed) |
145
- | Review results | 30 minutes | Analysis |
146
- | **Total** | **~3-9 hours** | Mostly waiting for benchmark execution |