npm - @miller-tech/uap - Versions diffs - 1.13.3 → 1.13.5 - Mend

@miller-tech/uap 1.13.3 → 1.13.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (11)

package/config/model-profiles/qwen35.json +6 -4
package/dist/.tsbuildinfo +1 -1
package/dist/models/executor.d.ts +5 -1
package/dist/models/executor.d.ts.map +1 -1
package/dist/models/executor.js +10 -6
package/dist/models/executor.js.map +1 -1
package/dist/models/types.js +2 -2
package/dist/models/types.js.map +1 -1
package/docs/deployment/QWEN35_LLAMA_CPP.md +9 -9
package/package.json +1 -1
package/templates/hooks/session-start.sh +48 -42

package/config/model-profiles/qwen35.json CHANGED

@@ -1,4 +1,6 @@
 {
+  "_profile": "qwen35",
+  "_description": "Qwen3.5 35B A3B (IQ4_XS) — local llama.cpp deployment for agentic tool-calling workloads",
   "model": "qwen3.5-a3b-iq4xs",
   "max_tokens": 81920,
   "temperature": 0.3,
@@ -6,8 +8,8 @@
   "top_k": 20,
   "min_p": 0.05,
   "repetition_penalty": 1.0,
-  "stop_sequences": [],
-  "timeout_ms": 120000,
+  "stop_sequences": ["<|im_end|>"],
+  "timeout_ms": 300000,
   "context_window": 262144,
   "optimize_for_tool_calls": true,
   "mode_switch_buffer_tokens": 500,
@@ -79,11 +81,11 @@
     "llm_server": {
       "description": "Instance 1: Main LLM (Qwen3.5 35B A3B) — port 8080",
-      "command": "llama-server --model /path/to/Qwen3.5-35B-A3B-UD-IQ4_XS.gguf --chat-template-file chat_template.jinja --port 8080 --host 0.0.0.0 --ctx-size 131072 --gpu-layers 99 --cache-type-k q8_0 --cache-type-v q4_0 -fa on --threads 8 --batch-size 512 --ubatch-size 256 --mlock --metrics --n-predict 4096 --temp 0.3 --top-p 0.9 --top-k 20 --min-p 0.05",
+      "command": "llama-server --model /path/to/Qwen3.5-35B-A3B-UD-IQ4_XS.gguf --chat-template-file chat_template.jinja --port 8080 --host 0.0.0.0 --ctx-size 131072 --gpu-layers 99 --cache-type-k q8_0 --cache-type-v q4_0 -fa on --threads 8 --batch-size 512 --ubatch-size 256 --mlock --metrics --n-predict 16384 --temp 0.3 --top-p 0.9 --top-k 20 --min-p 0.05",
       "flags": [
         "--model /path/to/Qwen3.5-35B-A3B-UD-IQ4_XS.gguf",
         "--chat-template-file chat_template.jinja",
-        "--n-predict 4096",
+        "--n-predict 16384",
         "--temp 0.3",
         "--top-p 0.9",
         "--top-k 20",