@miller-tech/uap 1.9.1 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. package/README.md +285 -642
  2. package/config/chat_template.jinja +76 -0
  3. package/config/lora-finetune.yaml +82 -0
  4. package/config/model-profiles/claude-haiku-3.5.json +62 -0
  5. package/config/model-profiles/claude-opus-4.6.json +63 -0
  6. package/config/model-profiles/claude-sonnet-4.6.json +63 -0
  7. package/config/model-profiles/gemini-2.5-flash.json +62 -0
  8. package/config/model-profiles/gemini-2.5-pro.json +62 -0
  9. package/config/model-profiles/generic.json +54 -0
  10. package/config/model-profiles/glm-5.json +61 -0
  11. package/config/model-profiles/gpt-4.1.json +64 -0
  12. package/config/model-profiles/gpt-4o.json +64 -0
  13. package/config/model-profiles/gpt-o3.json +61 -0
  14. package/config/model-profiles/kimi-k2.5.json +70 -0
  15. package/config/model-profiles/llama.json +86 -0
  16. package/config/model-profiles/qwen35.json +218 -0
  17. package/dist/.tsbuildinfo +1 -1
  18. package/dist/bin/cli.js +78 -9
  19. package/dist/bin/cli.js.map +1 -1
  20. package/dist/bin/llama-server-optimize.js +0 -10
  21. package/dist/bin/llama-server-optimize.js.map +1 -1
  22. package/dist/bin/policy.js +11 -195
  23. package/dist/bin/policy.js.map +1 -1
  24. package/dist/cli/dashboard.d.ts.map +1 -1
  25. package/dist/cli/dashboard.js +10 -12
  26. package/dist/cli/dashboard.js.map +1 -1
  27. package/dist/cli/generate.d.ts +0 -7
  28. package/dist/cli/generate.d.ts.map +1 -1
  29. package/dist/cli/generate.js +10 -22
  30. package/dist/cli/generate.js.map +1 -1
  31. package/dist/cli/memory.d.ts.map +1 -1
  32. package/dist/cli/memory.js +43 -109
  33. package/dist/cli/memory.js.map +1 -1
  34. package/dist/cli/model.d.ts.map +1 -1
  35. package/dist/cli/model.js +8 -25
  36. package/dist/cli/model.js.map +1 -1
  37. package/dist/cli/patterns.d.ts.map +1 -1
  38. package/dist/cli/patterns.js +11 -9
  39. package/dist/cli/patterns.js.map +1 -1
  40. package/dist/cli/policy.d.ts.map +1 -1
  41. package/dist/cli/policy.js +165 -1
  42. package/dist/cli/policy.js.map +1 -1
  43. package/dist/cli/rtk-validation.d.ts +0 -4
  44. package/dist/cli/rtk-validation.d.ts.map +1 -1
  45. package/dist/cli/rtk-validation.js +5 -23
  46. package/dist/cli/rtk-validation.js.map +1 -1
  47. package/dist/cli/schema-diff.d.ts.map +1 -1
  48. package/dist/cli/schema-diff.js +25 -16
  49. package/dist/cli/schema-diff.js.map +1 -1
  50. package/dist/cli/setup-wizard.d.ts.map +1 -1
  51. package/dist/cli/setup-wizard.js +11 -7
  52. package/dist/cli/setup-wizard.js.map +1 -1
  53. package/dist/cli/setup.d.ts.map +1 -1
  54. package/dist/cli/setup.js +3 -4
  55. package/dist/cli/setup.js.map +1 -1
  56. package/dist/cli/visualize.d.ts +0 -13
  57. package/dist/cli/visualize.d.ts.map +1 -1
  58. package/dist/cli/visualize.js +1 -60
  59. package/dist/cli/visualize.js.map +1 -1
  60. package/dist/cli/worktree.d.ts.map +1 -1
  61. package/dist/cli/worktree.js +12 -13
  62. package/dist/cli/worktree.js.map +1 -1
  63. package/dist/coordination/adaptive-patterns.d.ts.map +1 -1
  64. package/dist/coordination/adaptive-patterns.js +7 -3
  65. package/dist/coordination/adaptive-patterns.js.map +1 -1
  66. package/dist/coordination/deploy-batcher.d.ts.map +1 -1
  67. package/dist/coordination/deploy-batcher.js +8 -8
  68. package/dist/coordination/deploy-batcher.js.map +1 -1
  69. package/dist/dashboard/data-service.d.ts.map +1 -1
  70. package/dist/dashboard/data-service.js +12 -13
  71. package/dist/dashboard/data-service.js.map +1 -1
  72. package/dist/dashboard/server.d.ts.map +1 -1
  73. package/dist/dashboard/server.js +23 -8
  74. package/dist/dashboard/server.js.map +1 -1
  75. package/dist/index.d.ts +4 -4
  76. package/dist/index.d.ts.map +1 -1
  77. package/dist/index.js +3 -3
  78. package/dist/index.js.map +1 -1
  79. package/dist/mcp-router/executor/client.d.ts +0 -6
  80. package/dist/mcp-router/executor/client.d.ts.map +1 -1
  81. package/dist/mcp-router/executor/client.js +1 -27
  82. package/dist/mcp-router/executor/client.js.map +1 -1
  83. package/dist/mcp-router/index.d.ts +1 -1
  84. package/dist/mcp-router/index.d.ts.map +1 -1
  85. package/dist/mcp-router/index.js.map +1 -1
  86. package/dist/mcp-router/output-compressor.js +114 -112
  87. package/dist/mcp-router/output-compressor.js.map +1 -1
  88. package/dist/mcp-router/types.d.ts +0 -5
  89. package/dist/mcp-router/types.d.ts.map +1 -1
  90. package/dist/memory/backends/github.d.ts.map +1 -1
  91. package/dist/memory/backends/github.js +21 -13
  92. package/dist/memory/backends/github.js.map +1 -1
  93. package/dist/memory/context-pruner.d.ts +2 -9
  94. package/dist/memory/context-pruner.d.ts.map +1 -1
  95. package/dist/memory/context-pruner.js +5 -22
  96. package/dist/memory/context-pruner.js.map +1 -1
  97. package/dist/memory/correction-propagator.d.ts.map +1 -1
  98. package/dist/memory/correction-propagator.js +19 -19
  99. package/dist/memory/correction-propagator.js.map +1 -1
  100. package/dist/memory/dynamic-retrieval.d.ts.map +1 -1
  101. package/dist/memory/dynamic-retrieval.js +263 -132
  102. package/dist/memory/dynamic-retrieval.js.map +1 -1
  103. package/dist/memory/embeddings.d.ts.map +1 -1
  104. package/dist/memory/embeddings.js +2 -15
  105. package/dist/memory/embeddings.js.map +1 -1
  106. package/dist/memory/hierarchical-memory.d.ts.map +1 -1
  107. package/dist/memory/hierarchical-memory.js +6 -0
  108. package/dist/memory/hierarchical-memory.js.map +1 -1
  109. package/dist/memory/knowledge-graph.d.ts.map +1 -1
  110. package/dist/memory/knowledge-graph.js +2 -1
  111. package/dist/memory/knowledge-graph.js.map +1 -1
  112. package/dist/memory/memory-consolidator.d.ts +1 -0
  113. package/dist/memory/memory-consolidator.d.ts.map +1 -1
  114. package/dist/memory/memory-consolidator.js +27 -3
  115. package/dist/memory/memory-consolidator.js.map +1 -1
  116. package/dist/memory/predictive-memory.d.ts +9 -1
  117. package/dist/memory/predictive-memory.d.ts.map +1 -1
  118. package/dist/memory/predictive-memory.js +77 -1
  119. package/dist/memory/predictive-memory.js.map +1 -1
  120. package/dist/memory/serverless-qdrant.d.ts +1 -0
  121. package/dist/memory/serverless-qdrant.d.ts.map +1 -1
  122. package/dist/memory/serverless-qdrant.js +3 -9
  123. package/dist/memory/serverless-qdrant.js.map +1 -1
  124. package/dist/memory/short-term/schema.d.ts.map +1 -1
  125. package/dist/memory/short-term/schema.js +44 -6
  126. package/dist/memory/short-term/schema.js.map +1 -1
  127. package/dist/memory/short-term/sqlite.d.ts +4 -3
  128. package/dist/memory/short-term/sqlite.d.ts.map +1 -1
  129. package/dist/memory/short-term/sqlite.js +3 -12
  130. package/dist/memory/short-term/sqlite.js.map +1 -1
  131. package/dist/memory/speculative-cache.js +2 -2
  132. package/dist/memory/speculative-cache.js.map +1 -1
  133. package/dist/models/executor.d.ts +21 -0
  134. package/dist/models/executor.d.ts.map +1 -1
  135. package/dist/models/executor.js +116 -4
  136. package/dist/models/executor.js.map +1 -1
  137. package/dist/models/planner.d.ts +1 -0
  138. package/dist/models/planner.d.ts.map +1 -1
  139. package/dist/models/planner.js +13 -1
  140. package/dist/models/planner.js.map +1 -1
  141. package/dist/policies/enforced-tool-router.d.ts +3 -1
  142. package/dist/policies/enforced-tool-router.d.ts.map +1 -1
  143. package/dist/policies/enforced-tool-router.js.map +1 -1
  144. package/dist/tasks/service.d.ts +1 -0
  145. package/dist/tasks/service.d.ts.map +1 -1
  146. package/dist/tasks/service.js +10 -6
  147. package/dist/tasks/service.js.map +1 -1
  148. package/dist/telemetry/session-telemetry.d.ts.map +1 -1
  149. package/dist/telemetry/session-telemetry.js +14 -11
  150. package/dist/telemetry/session-telemetry.js.map +1 -1
  151. package/dist/types/config.d.ts +426 -359
  152. package/dist/types/config.d.ts.map +1 -1
  153. package/dist/types/config.js +12 -67
  154. package/dist/types/config.js.map +1 -1
  155. package/dist/utils/config-loader.d.ts +34 -0
  156. package/dist/utils/config-loader.d.ts.map +1 -0
  157. package/dist/utils/config-loader.js +93 -0
  158. package/dist/utils/config-loader.js.map +1 -0
  159. package/dist/utils/lazy-imports.d.ts +18 -0
  160. package/dist/utils/lazy-imports.d.ts.map +1 -0
  161. package/dist/utils/lazy-imports.js +39 -0
  162. package/dist/utils/lazy-imports.js.map +1 -0
  163. package/dist/utils/stopwords.d.ts +12 -0
  164. package/dist/utils/stopwords.d.ts.map +1 -0
  165. package/dist/utils/stopwords.js +196 -0
  166. package/dist/utils/stopwords.js.map +1 -0
  167. package/dist/utils/string-similarity.d.ts +10 -3
  168. package/dist/utils/string-similarity.d.ts.map +1 -1
  169. package/dist/utils/string-similarity.js +49 -25
  170. package/dist/utils/string-similarity.js.map +1 -1
  171. package/docs/INDEX.md +35 -34
  172. package/package.json +13 -14
  173. package/scripts/maintenance/publish-npm.sh +82 -0
  174. package/scripts/maintenance/publish.sh +29 -0
  175. package/scripts/maintenance/update-droids.py +93 -0
  176. package/scripts/maintenance/update-skills.py +148 -0
  177. package/scripts/maintenance/update-uap-compliance.sh +45 -0
  178. package/scripts/maintenance/validate-skills.py +83 -0
  179. package/scripts/maintenance/verify-compliance.sh +117 -0
  180. package/scripts/setup/install-cloakbrowser.ts +14 -0
  181. package/scripts/setup/install-desktop.sh +105 -0
  182. package/scripts/setup/install-rtk.sh +184 -0
  183. package/scripts/setup/install-web.sh +73 -0
  184. package/scripts/setup/setup.sh +375 -0
  185. package/scripts/validate-build.sh +62 -0
  186. package/scripts/version-bump.sh +130 -0
  187. package/tools/agents/scripts/migrate_memory_to_qdrant.py +1 -1
  188. /package/docs/{BENCHMARK_GAPS_AND_PLAN.md → archive/BENCHMARK_GAPS_AND_PLAN.md} +0 -0
  189. /package/docs/{MODEL_ROUTING_IMPLEMENTATION_SUMMARY.md → archive/MODEL_ROUTING_IMPLEMENTATION_SUMMARY.md} +0 -0
  190. /package/docs/{MODEL_ROUTING_OPTIMIZATION_PLAN.md → archive/MODEL_ROUTING_OPTIMIZATION_PLAN.md} +0 -0
  191. /package/docs/{PARALLELISM_GAPS_AND_OPTIONS.md → archive/PARALLELISM_GAPS_AND_OPTIONS.md} +0 -0
  192. /package/docs/{POLICY_GATE_IMPLEMENTATION.md → archive/POLICY_GATE_IMPLEMENTATION.md} +0 -0
  193. /package/docs/{UAP_OPTIMIZATION_PLAN.md → archive/UAP_OPTIMIZATION_PLAN.md} +0 -0
  194. /package/docs/{opencode-integration-guide.md → archive/opencode-integration-guide.md} +0 -0
  195. /package/docs/{opencode-integration-quickref.md → archive/opencode-integration-quickref.md} +0 -0
@@ -0,0 +1,64 @@
1
+ {
2
+ "_profile": "gpt-4o",
3
+ "_description": "OpenAI GPT-4o (gpt-4o-2025-03-27). Multimodal flagship with native tool calling, JSON mode, and 128K context. Fast, capable, and cost-effective for most tasks. Supports parallel function calling and structured outputs.",
4
+
5
+ "model": "gpt-4o-2025-03-27",
6
+ "provider": "openai",
7
+ "api_base_url": "https://api.openai.com/v1",
8
+ "max_tokens": 16384,
9
+ "temperature": 0.7,
10
+ "top_p": 1.0,
11
+ "timeout_ms": 120000,
12
+ "context_window": 128000,
13
+
14
+ "optimize_for_tool_calls": true,
15
+ "enable_thinking": false,
16
+
17
+ "tool_calling": {
18
+ "parallel_tool_calls": true,
19
+ "tool_choice": "auto",
20
+ "strict_mode": true,
21
+ "_comment": "GPT-4o has native parallel function calling. strict_mode ensures function call arguments match the schema exactly."
22
+ },
23
+
24
+ "structured_output": {
25
+ "json_mode": true,
26
+ "strict_json_schema": true,
27
+ "_comment": "GPT-4o supports structured outputs with guaranteed schema adherence via json_schema response format."
28
+ },
29
+
30
+ "dynamic_temperature": {
31
+ "enabled": true,
32
+ "decay": 0.5,
33
+ "floor": 0.2,
34
+ "_comment": "Lower temperature on retries for more deterministic tool call output."
35
+ },
36
+
37
+ "tool_call_batching": {
38
+ "enabled": true,
39
+ "system_prompt_suffix": "When multiple tools are needed, call ALL of them in a single response using parallel function calling.",
40
+ "_comment": "GPT-4o natively supports parallel function calls."
41
+ },
42
+
43
+ "pricing": {
44
+ "input_per_1m": 2.5,
45
+ "output_per_1m": 10.0,
46
+ "cached_input_per_1m": 1.25,
47
+ "currency": "USD",
48
+ "_comment": "As of 2025. Prompt caching available for 50% input discount."
49
+ },
50
+
51
+ "rate_limits": {
52
+ "requests_per_minute": 5000,
53
+ "tokens_per_minute": 800000,
54
+ "_comment": "Tier 3+ limits. Very high throughput."
55
+ },
56
+
57
+ "running_config": {
58
+ "description": "OpenAI hosted API. Requires OPENAI_API_KEY environment variable.",
59
+ "env_vars": {
60
+ "OPENAI_API_KEY": "<your-api-key>",
61
+ "UAP_MODEL_PROFILE": "gpt-4o"
62
+ }
63
+ }
64
+ }
@@ -0,0 +1,61 @@
1
+ {
2
+ "_profile": "gpt-o3",
3
+ "_description": "OpenAI o3 (o3-2025-04-16). Reasoning model with chain-of-thought for complex problem solving. 200K context, native tool use, and configurable reasoning effort. Best for tasks requiring deep analysis, math, science, or multi-step planning.",
4
+
5
+ "model": "o3-2025-04-16",
6
+ "provider": "openai",
7
+ "api_base_url": "https://api.openai.com/v1",
8
+ "max_tokens": 100000,
9
+ "temperature": 1.0,
10
+ "top_p": 1.0,
11
+ "timeout_ms": 600000,
12
+ "context_window": 200000,
13
+
14
+ "optimize_for_tool_calls": true,
15
+ "enable_thinking": true,
16
+
17
+ "tool_calling": {
18
+ "parallel_tool_calls": true,
19
+ "tool_choice": "auto",
20
+ "_comment": "o3 supports tool use with reasoning. The model reasons about which tools to call and in what order."
21
+ },
22
+
23
+ "reasoning": {
24
+ "reasoning_effort": "medium",
25
+ "options": ["low", "medium", "high"],
26
+ "_comment": "Controls how much reasoning the model does. 'high' is most thorough but slowest and most expensive. 'low' is fastest."
27
+ },
28
+
29
+ "dynamic_temperature": {
30
+ "enabled": false,
31
+ "_comment": "o3 uses fixed temperature=1.0. Reasoning effort controls output quality instead."
32
+ },
33
+
34
+ "tool_call_batching": {
35
+ "enabled": true,
36
+ "system_prompt_suffix": "When multiple tools are needed, call ALL of them in a single response.",
37
+ "_comment": "o3 supports parallel tool calls."
38
+ },
39
+
40
+ "pricing": {
41
+ "input_per_1m": 2.0,
42
+ "output_per_1m": 8.0,
43
+ "cached_input_per_1m": 0.5,
44
+ "currency": "USD",
45
+ "_comment": "As of 2025. Reasoning tokens are billed as output tokens."
46
+ },
47
+
48
+ "rate_limits": {
49
+ "requests_per_minute": 1000,
50
+ "tokens_per_minute": 200000,
51
+ "_comment": "Lower throughput than GPT-4o due to reasoning overhead."
52
+ },
53
+
54
+ "running_config": {
55
+ "description": "OpenAI hosted API. Requires OPENAI_API_KEY environment variable.",
56
+ "env_vars": {
57
+ "OPENAI_API_KEY": "<your-api-key>",
58
+ "UAP_MODEL_PROFILE": "gpt-o3"
59
+ }
60
+ }
61
+ }
@@ -0,0 +1,70 @@
1
+ {
2
+ "_profile": "kimi-k2.5",
3
+ "_description": "Moonshot Kimi K2.5 (kimi-k2.5). Open-weight MoE model (1T total, 32B active) with agentic tool use, 128K context, and strong coding ability. Runs locally via vLLM or via Moonshot API. Competitive with GPT-4o on coding benchmarks at a fraction of the cost.",
4
+
5
+ "model": "kimi-k2.5",
6
+ "provider": "moonshot",
7
+ "api_base_url": "https://api.moonshot.cn/v1",
8
+ "max_tokens": 16384,
9
+ "temperature": 0.6,
10
+ "top_p": 0.9,
11
+ "timeout_ms": 180000,
12
+ "context_window": 131072,
13
+
14
+ "optimize_for_tool_calls": true,
15
+ "enable_thinking": true,
16
+
17
+ "tool_calling": {
18
+ "parallel_tool_calls": true,
19
+ "tool_choice": "auto",
20
+ "_comment": "Kimi K2.5 has native tool calling with OpenAI-compatible function calling format. Strong agentic performance."
21
+ },
22
+
23
+ "thinking": {
24
+ "enabled": true,
25
+ "_comment": "K2.5 supports thinking mode for complex reasoning. Can be disabled for faster responses."
26
+ },
27
+
28
+ "dynamic_temperature": {
29
+ "enabled": true,
30
+ "decay": 0.5,
31
+ "floor": 0.2,
32
+ "_comment": "Lower temperature on retries for more deterministic tool call output."
33
+ },
34
+
35
+ "tool_call_batching": {
36
+ "enabled": true,
37
+ "system_prompt_suffix": "When multiple tools are needed, call ALL of them in a single response. Do not call one tool and wait - emit all tool calls together.",
38
+ "_comment": "K2.5 supports parallel function calls."
39
+ },
40
+
41
+ "local_deployment": {
42
+ "supported": true,
43
+ "framework": "vLLM",
44
+ "min_vram_gb": 80,
45
+ "recommended_vram_gb": 160,
46
+ "command": "vllm serve moonshotai/Kimi-K2.5 --tensor-parallel-size 4 --max-model-len 131072",
47
+ "_comment": "Open-weight model. 1T total params, 32B active (MoE). Requires multi-GPU setup for local deployment. Use API for single-GPU setups."
48
+ },
49
+
50
+ "pricing": {
51
+ "input_per_1m_cny": 2.0,
52
+ "output_per_1m_cny": 8.0,
53
+ "currency": "CNY",
54
+ "_comment": "As of 2025. Moonshot API pricing. Open weights available for self-hosting."
55
+ },
56
+
57
+ "rate_limits": {
58
+ "requests_per_minute": 500,
59
+ "tokens_per_minute": 500000,
60
+ "_comment": "Default tier limits via Moonshot API."
61
+ },
62
+
63
+ "running_config": {
64
+ "description": "Moonshot hosted API or self-hosted via vLLM. Requires MOONSHOT_API_KEY for hosted, or local vLLM setup.",
65
+ "env_vars": {
66
+ "MOONSHOT_API_KEY": "<your-api-key>",
67
+ "UAP_MODEL_PROFILE": "kimi-k2.5"
68
+ }
69
+ }
70
+ }
@@ -0,0 +1,86 @@
1
+ {
2
+ "_profile": "llama",
3
+ "_description": "Optimized for Meta Llama 3.1/3.2/3.3 and Llama 4 models via llama.cpp or vLLM. Tuned for Llama's instruction-following strengths with tool calling support. Works with any Llama-family model (8B, 70B, 405B) at any quantization level.",
4
+
5
+ "model": "default",
6
+ "max_tokens": 4096,
7
+ "temperature": 0.6,
8
+ "top_p": 0.9,
9
+ "top_k": 40,
10
+ "min_p": 0.05,
11
+ "repetition_penalty": 1.1,
12
+ "timeout_ms": 120000,
13
+ "context_window": 131072,
14
+
15
+ "optimize_for_tool_calls": true,
16
+ "enable_thinking": false,
17
+
18
+ "dynamic_temperature": {
19
+ "enabled": true,
20
+ "decay": 0.5,
21
+ "floor": 0.2,
22
+ "_comment": "On retry, temp = max(floor, base_temp * decay^attempt). Reduces randomness for structured output."
23
+ },
24
+
25
+ "tool_call_batching": {
26
+ "enabled": true,
27
+ "system_prompt_suffix": "When multiple tools are needed, call ALL of them in a single response. Do not call one tool and wait - emit all tool calls together.",
28
+ "_comment": "Reduces round-trips by encouraging multi-tool emission in one turn."
29
+ },
30
+
31
+ "server_optimization": {
32
+ "flash_attention": true,
33
+ "kv_cache_type_k": "q8_0",
34
+ "kv_cache_type_v": "q4_0",
35
+ "prompt_cache": {
36
+ "enabled": true,
37
+ "slot_save_path": "./cache/slots"
38
+ },
39
+ "speculative_decoding": {
40
+ "enabled": false,
41
+ "_comment": "For Llama 70B+, use Llama-3.2-1B as draft model for 2-3x speedup."
42
+ },
43
+ "_comment": "KV q8/q4 split saves ~60% KV VRAM. Flash attn gives 1.5-2x speed on long context."
44
+ },
45
+
46
+ "quantization_profiles": {
47
+ "current": "Q4_K_M",
48
+ "upgrade_path": [
49
+ {
50
+ "quant": "Q4_K_M",
51
+ "vram_gb": 5,
52
+ "accuracy": "95%",
53
+ "tool_calls": "93%",
54
+ "note": "Llama 8B"
55
+ },
56
+ {
57
+ "quant": "Q4_K_M",
58
+ "vram_gb": 40,
59
+ "accuracy": "95%",
60
+ "tool_calls": "95%",
61
+ "note": "Llama 70B"
62
+ },
63
+ {
64
+ "quant": "Q5_K_M",
65
+ "vram_gb": 48,
66
+ "accuracy": "97%",
67
+ "tool_calls": "97%",
68
+ "note": "Llama 70B"
69
+ }
70
+ ],
71
+ "_comment": "Llama models scale well across quantization levels."
72
+ },
73
+
74
+ "running_config": {
75
+ "description": "Llama model via llama.cpp. Adjust --model path and --gpu-layers for your hardware.",
76
+ "llm_server": {
77
+ "description": "Llama via llama-server on port 8080",
78
+ "command": "llama-server --model /path/to/llama-model.gguf --port 8080 --host 0.0.0.0 --ctx-size 131072 --gpu-layers 99 --cache-type-k q8_0 --cache-type-v q4_0 -fa on --threads 8 --batch-size 512",
79
+ "vram_estimate": "~5GB (8B Q4_K_M) / ~40GB (70B Q4_K_M)"
80
+ },
81
+ "env_vars": {
82
+ "TARGET_URL": "http://127.0.0.1:8080",
83
+ "UAP_MODEL_PROFILE": "llama"
84
+ }
85
+ }
86
+ }
@@ -0,0 +1,218 @@
1
+ {
2
+ "model": "qwen3.5-a3b-iq4xs",
3
+ "max_tokens": 81920,
4
+ "temperature": 0.3,
5
+ "top_p": 0.9,
6
+ "top_k": 20,
7
+ "min_p": 0.05,
8
+ "repetition_penalty": 1.0,
9
+ "stop_sequences": [],
10
+ "timeout_ms": 120000,
11
+ "context_window": 262144,
12
+ "optimize_for_tool_calls": true,
13
+ "mode_switch_buffer_tokens": 500,
14
+ "enable_thinking": false,
15
+ "chat_template_kwargs": {
16
+ "enable_thinking": false
17
+ },
18
+
19
+ "dynamic_temperature": {
20
+ "enabled": true,
21
+ "decay": 0.5,
22
+ "floor": 0.2,
23
+ "_comment": "On retry, temp = max(floor, base_temp * decay^attempt). Reduces randomness for structured output."
24
+ },
25
+
26
+ "tool_call_batching": {
27
+ "enabled": true,
28
+ "system_prompt_suffix": "When multiple tools are needed, call ALL of them in a single response. Do not call one tool and wait - emit all tool calls together.",
29
+ "_comment": "Reduces round-trips by encouraging multi-tool emission in one turn."
30
+ },
31
+
32
+ "structured_output": {
33
+ "grammar_file": "tools/agents/config/tool-call.gbnf",
34
+ "json_schema_file": "tools/agents/config/tool-call-schema.json",
35
+ "_comment": "GBNF grammar for per-request use only. Do NOT use --grammar-file on server startup — it forces tool_call output on all requests including plain chat. llama.cpp's autoparser handles tool call detection lazily via the chat template."
36
+ },
37
+
38
+ "server_optimization": {
39
+ "flash_attention": true,
40
+ "kv_cache_type_k": "q8_0",
41
+ "kv_cache_type_v": "q4_0",
42
+ "prompt_cache": {
43
+ "enabled": true,
44
+ "slot_save_path": "./cache/slots"
45
+ },
46
+ "speculative_decoding": {
47
+ "enabled": true,
48
+ "draft_model": "Qwen3.5-0.8B-Q8_0",
49
+ "draft_max": 16,
50
+ "draft_min": 3,
51
+ "draft_p_min": 0.75,
52
+ "_comment": "Speculative decoding is enabled for a 2-3x speedup in tokens/sec. It uses Qwen3.5-0.8B-Q8_0 as the draft model to propose tokens; keep enabled=true and provide the draft model GGUF path. Current setup: main model 17GB, draft model 0.8GB, KV cache ~2-3GB."
53
+ },
54
+ "_comment": "KV q8/q4 split saves ~60% KV VRAM. Flash attn gives 1.5-2x speed on long context."
55
+ },
56
+
57
+ "lora": {
58
+ "enabled": false,
59
+ "adapter_path": "",
60
+ "scale": 1.0,
61
+ "training_config": "config/lora-finetune.yaml",
62
+ "data_generator": "tools/agents/scripts/generate_lora_training_data.py",
63
+ "_comment": "Fine-tune a LoRA adapter for +15-20% tool call reliability. See config/lora-finetune.yaml."
64
+ },
65
+
66
+ "quantization_profiles": {
67
+ "current": "iq4_xs",
68
+ "upgrade_path": [
69
+ { "quant": "IQ4_XS", "vram_gb": 17, "accuracy": "96%", "tool_calls": "94%" },
70
+ { "quant": "Q4_K_M", "vram_gb": 20, "accuracy": "95%", "tool_calls": "95%" },
71
+ { "quant": "Q5_K_M", "vram_gb": 24, "accuracy": "97%", "tool_calls": "97%" },
72
+ { "quant": "Q6_K", "vram_gb": 28, "accuracy": "98%", "tool_calls": "98%" }
73
+ ],
74
+ "_comment": "Each step up requires ~4-7GB more VRAM but improves instruction following."
75
+ },
76
+
77
+ "running_config": {
78
+ "description": "Dual-model llama.cpp setup: Qwen3.5 35B A3B for LLM + nomic-embed-text-v2-moe for embeddings. Run as two separate llama-server instances on different ports.",
79
+
80
+ "llm_server": {
81
+ "description": "Instance 1: Main LLM (Qwen3.5 35B A3B) — port 8080",
82
+ "command": "llama-server --model /path/to/Qwen3.5-35B-A3B-UD-IQ4_XS.gguf --chat-template-file chat_template.jinja --port 8080 --host 0.0.0.0 --ctx-size 131072 --gpu-layers 99 --cache-type-k q8_0 --cache-type-v q4_0 -fa on --threads 8 --batch-size 512 --ubatch-size 256 --mlock --metrics --n-predict 4096 --temp 0.3 --top-p 0.9 --top-k 20 --min-p 0.05",
83
+ "flags": [
84
+ "--model /path/to/Qwen3.5-35B-A3B-UD-IQ4_XS.gguf",
85
+ "--chat-template-file chat_template.jinja",
86
+ "--n-predict 4096",
87
+ "--temp 0.3",
88
+ "--top-p 0.9",
89
+ "--top-k 20",
90
+ "--min-p 0.05",
91
+ "--threads 8",
92
+ "--ctx-size 131072",
93
+ "--batch-size 512",
94
+ "--ubatch-size 256",
95
+ "--gpu-layers 99",
96
+ "--cache-type-k q8_0",
97
+ "--cache-type-v q4_0",
98
+ "--mlock",
99
+ "-fa on",
100
+ "--metrics",
101
+ "--host 0.0.0.0",
102
+ "--port 8080"
103
+ ],
104
+ "vram_estimate": "~20GB (17GB model + 0.8GB draft + 2-3GB KV cache)"
105
+ },
106
+
107
+ "embedding_server": {
108
+ "description": "Instance 2: Embedding model (nomic-embed-text-v2-moe) — port 8081. SoTA multilingual MoE embedding model: 475M total / 305M active params, 768 dims, 100+ languages, MIRACL 65.8.",
109
+ "model": "nomic-embed-text-v2-moe",
110
+ "model_url": "https://huggingface.co/nomic-ai/nomic-embed-text-v2-moe-GGUF",
111
+ "recommended_quant": "Q4_K_M",
112
+ "quant_options": [
113
+ { "quant": "Q4_K_M", "size_mb": 328, "quality": "good, recommended for most use cases" },
114
+ { "quant": "Q5_K_M", "size_mb": 354, "quality": "high quality" },
115
+ { "quant": "Q6_K", "size_mb": 379, "quality": "very high quality, near perfect" },
116
+ { "quant": "Q8_0", "size_mb": 488, "quality": "extremely high, generally unneeded" },
117
+ { "quant": "F16", "size_mb": 913, "quality": "full precision, max quality" }
118
+ ],
119
+ "download_command": "huggingface-cli download nomic-ai/nomic-embed-text-v2-moe-GGUF nomic-embed-text-v2-moe.Q4_K_M.gguf --local-dir /path/to/models/",
120
+
121
+ "command_gpu": "llama-server --model /path/to/nomic-embed-text-v2-moe.Q4_K_M.gguf --port 8081 --host 0.0.0.0 --embeddings --pooling mean --ctx-size 512 --gpu-layers 99 --batch-size 2048 --ubatch-size 512 --threads 4",
122
+ "command_cpu": "llama-server --model /path/to/nomic-embed-text-v2-moe.Q4_K_M.gguf --port 8081 --host 0.0.0.0 --embeddings --pooling mean --ctx-size 512 --gpu-layers 0 --batch-size 2048 --threads 8",
123
+
124
+ "flags_gpu": [
125
+ "--model /path/to/nomic-embed-text-v2-moe.Q4_K_M.gguf",
126
+ "--port 8081",
127
+ "--host 0.0.0.0",
128
+ "--embeddings",
129
+ "--pooling mean",
130
+ "--ctx-size 512",
131
+ "--gpu-layers 99",
132
+ "--batch-size 2048",
133
+ "--ubatch-size 512",
134
+ "--threads 4"
135
+ ],
136
+ "flags_cpu": [
137
+ "--model /path/to/nomic-embed-text-v2-moe.Q4_K_M.gguf",
138
+ "--port 8081",
139
+ "--host 0.0.0.0",
140
+ "--embeddings",
141
+ "--pooling mean",
142
+ "--ctx-size 512",
143
+ "--gpu-layers 0",
144
+ "--batch-size 2048",
145
+ "--threads 8"
146
+ ],
147
+ "vram_estimate_gpu": "~380MB (328MB model + ~50MB KV cache)",
148
+ "vram_estimate_cpu": "0 (runs entirely in CPU RAM, ~400MB)",
149
+ "task_prefixes": {
150
+ "search_query": "search_query: ",
151
+ "search_document": "search_document: ",
152
+ "_comment": "nomic-embed-text-v2 requires task prefixes. Queries use search_query:, documents use search_document:. The UAP embedding provider handles this automatically."
153
+ },
154
+ "env_var": "UAP_EMBEDDING_ENDPOINT=http://localhost:8081",
155
+ "_comment": "Set UAP_EMBEDDING_ENDPOINT env var to override the default endpoint. The embedding provider auto-detects dimensions from the model response."
156
+ },
157
+
158
+ "dual_model_vram_budget": {
159
+ "24gb_gpu": {
160
+ "llm": "~20GB (Qwen3.5 IQ4_XS + draft + KV)",
161
+ "embedding_gpu": "~380MB (nomic-embed Q4_K_M on GPU)",
162
+ "total": "~20.4GB — fits with headroom",
163
+ "recommendation": "Run both on GPU"
164
+ },
165
+ "16gb_gpu": {
166
+ "llm": "~17GB (Qwen3.5 IQ4_XS, no draft, smaller KV)",
167
+ "embedding_cpu": "0 GPU (run embedding on CPU)",
168
+ "total": "~17GB GPU + ~400MB CPU",
169
+ "recommendation": "Run LLM on GPU with partial offload (lower --gpu-layers, since ~17GB exceeds 16GB of VRAM), embedding on CPU"
170
+ },
171
+ "48gb_gpu": {
172
+ "llm": "~20GB (Qwen3.5 IQ4_XS + draft + KV)",
173
+ "embedding_gpu": "~380MB",
174
+ "headroom": "~28GB free for larger context or better quants",
175
+ "recommendation": "Run both on GPU, consider Q5_K_M or Q6_K for LLM"
176
+ }
177
+ }
178
+ },
179
+
180
+ "optimization_recommendations": {
181
+ "current_performance": {
182
+ "vram_breakdown": {
183
+ "main_model": "17GB",
184
+ "draft_model": "0.8GB",
185
+ "kv_cache": "~2-3GB",
186
+ "total_estimated": "~20GB"
187
+ },
188
+ "speedup_estimate": "2-3x improvement in tokens/sec compared to standard generation",
189
+ "notes": "Speculative decoding is working as expected with the current setup. Main model (Qwen3.5-a3b-iq4xs) and draft model (Qwen3.5-0.8B-Q8_0) are optimized for tool calls."
190
+ },
191
+ "recommendations": [
192
+ {
193
+ "title": "Increase draft-max for longer contexts",
194
+ "description": "Raise --draft-max from the current 16 to 32 so the draft model can propose up to 32 tokens at once. This can improve throughput by 10-20% for longer generation tasks.",
195
+ "priority": "high",
196
+ "action": "Update speculative_decoding.draft_max to 32 in config"
197
+ },
198
+ {
199
+ "title": "Monitor KV cache usage",
200
+ "description": "KV cache is currently ~2-3GB. For longer contexts (131k tokens), consider reducing --ctx-size or --batch-size if memory pressure occurs.",
201
+ "priority": "medium",
202
+ "action": "Monitor memory usage and adjust context window if needed"
203
+ },
204
+ {
205
+ "title": "Consider GPU-layers optimization",
206
+ "description": "Current setting is --gpu-layers 99, which offloads all layers to the GPU. If VRAM is insufficient (less than ~20GB free), consider lowering --gpu-layers so that some layers stay on the CPU.",
207
+ "priority": "low",
208
+ "action": "Evaluate GPU memory and adjust --gpu-layers if needed"
209
+ },
210
+ {
211
+ "title": "Enable flash attention",
212
+ "description": "Flash attention is already enabled. For further optimization, ensure the -fa on flag (long form: --flash-attn on) is present in the llama-server command line.",
213
+ "priority": "medium",
214
+ "action": "Verify --flash-attn is enabled in running config"
215
+ }
216
+ ]
217
+ }
218
+ }