ollama-agent-router 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +213 -0
- package/dist/cli.js +3 -22
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +1 -10
- package/dist/index.js +3 -22
- package/dist/index.js.map +1 -1
- package/examples/gex44-secured.yaml +18 -12
- package/examples/gex44.yaml +1 -5
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -96,6 +96,219 @@ ollama-agent-router serve --config examples/gex44.yaml
|
|
|
96
96
|
|
|
97
97
|
`examples/gex44-secured.yaml` is the same hardware profile with the standalone plane locked down: API key required, anonymous access rejected, per-key rate limits, and the admin plane enabled on localhost. Use it as a starting point when the router is exposed beyond a single user or process.
|
|
98
98
|
|
|
99
|
+
## Routing Algorithm
|
|
100
|
+
|
|
101
|
+
### Candidate selection
|
|
102
|
+
|
|
103
|
+
For every request the router builds a candidate list from three sources, merged in order:
|
|
104
|
+
|
|
105
|
+
1. `router.preferredModels` from the request — added first, regardless of `routes`.
|
|
106
|
+
2. `routes[taskType]` — the ordered list for the classified task type.
|
|
107
|
+
3. Any model whose `purpose` array contains the task type — acts as a catch-all fallback.
|
|
108
|
+
|
|
109
|
+
Models listed in `router.forbiddenModels` are dropped from the candidate list entirely.
|
|
110
|
+
|
|
111
|
+
### Blocking checks
|
|
112
|
+
|
|
113
|
+
Before scoring, each candidate is checked for hard blocks:
|
|
114
|
+
|
|
115
|
+
- **`gpu_only`** — `requireGpuOnly` is set (globally or per-request) and the model is not fully on GPU, has a CPU/GPU split in `ollama ps`, or there is not enough free VRAM to load it.
|
|
116
|
+
- **`busy`** — the model has `exclusive: true` and is already running, or `allowWhenBusy: false` and has reached `maxConcurrent`.
|
|
117
|
+
|
|
118
|
+
Blocked models are excluded from sync selection but can still be picked for async jobs.
|
|
119
|
+
|
|
120
|
+
### Scoring
|
|
121
|
+
|
|
122
|
+
Every non-blocked candidate receives a numeric score. Higher score wins. Starting value: **100**.
|
|
123
|
+
|
|
124
|
+
| Component | Delta | Notes |
|
|
125
|
+
|---|---|---|
|
|
126
|
+
| Route position | `+50` for index 0, `−8` per step | First entry in `routes[taskType]` gets the full bonus |
|
|
127
|
+
| `model.priority` | `+priority` | Set per model, 1–100 |
|
|
128
|
+
| `purpose` match | `+25` | Model's `purpose` array contains the task type |
|
|
129
|
+
| `preferredModels` | `+80` for index 0, `−10` per step | Request-level override |
|
|
130
|
+
| Already loaded in Ollama | **`+20`** | Model appears in `ollama ps` output |
|
|
131
|
+
| Heavy complexity + `costClass: high` | `+20` | Classifier returned `heavy`; rewards large models |
|
|
132
|
+
| Light complexity + `costClass: low` | `+15` | Classifier returned `light`; rewards small models |
|
|
133
|
+
| Free VRAM headroom | `+0..+25` | Scales with `(freeMb − requiredMb) / 512`, capped at 25 |
|
|
134
|
+
| Insufficient VRAM | **`−60`** | `model.sizeGb × 1024 + vramSafetyReserveMb > freeMb` |
|
|
135
|
+
| Queue depth | `−18 × queueDepth` | Per-model queue length |
|
|
136
|
+
| Running count | `−25 × running` | Per-model active executions |
|
|
137
|
+
| Exclusive + running | `−80 × running` additional | `exclusive: true` models penalised heavily while in use |
|
|
138
|
+
|
|
139
|
+
The candidate with the highest score is selected. The others appear in `fallbackModels` in the response.
|
|
140
|
+
|
|
141
|
+
### Model config fields that affect routing
|
|
142
|
+
|
|
143
|
+
```yaml
|
|
144
|
+
models:
|
|
145
|
+
- name: gpt-oss:20b
|
|
146
|
+
sizeGb: 14.0 # used for VRAM headroom calculation
|
|
147
|
+
purpose: [agentic_reasoning, large_context, planning, tool_use, complex_debugging]
|
|
148
|
+
# +25 score when task type matches; also adds model to the candidate list
|
|
149
|
+
priority: 95 # added directly to score; use to rank models of similar capability
|
|
150
|
+
maxConcurrent: 1 # hard cap on parallel executions
|
|
151
|
+
costClass: high # low | medium | high — matched against request complexity for bonus/penalty
|
|
152
|
+
exclusive: true # if running, gets −80 extra penalty per execution; only one at a time
|
|
153
|
+
allowWhenBusy: false # if false and maxConcurrent reached → blocked entirely
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
**`purpose`** — declares what the model can do. When the array contains the request's task type, the model gets a single `+25` bonus and becomes a candidate even when it is not listed in `routes[taskType]`. Use it for every task type the model handles well, including secondary ones (e.g. add `agentic_reasoning` to a coder model that works as a capable fallback).
|
|
157
|
+
|
|
158
|
+
**`costClass`** — signals the relative weight of the model:
|
|
159
|
+
- `high`: gets `+20` when the classifier decides the request is complex (`heavy`). Intended for large reasoning models.
|
|
160
|
+
- `low`: gets `+15` when the request is simple (`light`). Intended for small triage/chat models.
|
|
161
|
+
- `medium`: no complexity bonus in either direction.
|
|
162
|
+
|
|
163
|
+
**`exclusive`** — intended for large models that cannot safely share GPU memory with another concurrent execution. While one request is running, the model accumulates `−80` per running job on top of the standard `−25`, making it effectively unselectable for sync requests until free.
|
|
164
|
+
|
|
165
|
+
### `routes` config and its relation to scoring
|
|
166
|
+
|
|
167
|
+
```yaml
|
|
168
|
+
routes:
|
|
169
|
+
agentic_reasoning: [gpt-oss:20b, qwen2.5-coder:7b]
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Order matters: `gpt-oss:20b` at index 0 gets `+50`, `qwen2.5-coder:7b` at index 1 gets `+42`. Each additional position costs `−8`.
|
|
173
|
+
|
|
174
|
+
A model does not need to be in `routes` to be selected — if it declares the task type in `purpose` it will still enter the candidate list (with a route-position score of 0).
|
|
175
|
+
|
|
176
|
+
### Sync vs async decision
|
|
177
|
+
|
|
178
|
+
After scoring, the router checks whether to run synchronously or push to the async queue:
|
|
179
|
+
|
|
180
|
+
1. If `router.mode: async` — always async.
|
|
181
|
+
2. If heavy load is detected (total queue depth ≥ `router.heavyLoadQueueDepth` **or** free VRAM < `router.heavyLoadGpuFreeMbThreshold`) and `allowAsync: true` — async.
|
|
182
|
+
3. If the top-scored model is busy and `allowAsync: true` — async on that model.
|
|
183
|
+
4. Otherwise — sync on the top-scored model.
|
|
184
|
+
|
|
185
|
+
`allowAsync` defaults to `true`. Set `"router": {"mode": "sync"}` in the request to force synchronous execution regardless of load.
|
|
186
|
+
|
|
187
|
+
### Forcing a specific model
|
|
188
|
+
|
|
189
|
+
`preferredModels` adds `+80` to the first entry (and `10` less for each later entry), which usually makes it win unless it is blocked by the `gpu_only` or `busy` checks. `forbiddenModels` removes models from the candidate list entirely — useful when testing a specific model in isolation.
|
|
190
|
+
|
|
191
|
+
### Request examples
|
|
192
|
+
|
|
193
|
+
**Explicit task type — let the router pick the best model for the task:**
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
curl -s http://127.0.0.1:11435/v1/chat/completions \
|
|
197
|
+
-H 'content-type: application/json' \
|
|
198
|
+
-H 'authorization: Bearer <api-key>' \
|
|
199
|
+
-d '{
|
|
200
|
+
"model": "auto",
|
|
201
|
+
"messages": [{"role": "user", "content": "Plan a multi-service refactor"}],
|
|
202
|
+
"router": {
|
|
203
|
+
"taskType": "agentic_reasoning"
|
|
204
|
+
}
|
|
205
|
+
}'
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
**Explicit task type with async fallback on heavy load:**
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
curl -s http://127.0.0.1:11435/v1/chat/completions \
|
|
212
|
+
-H 'content-type: application/json' \
|
|
213
|
+
-H 'authorization: Bearer <api-key>' \
|
|
214
|
+
-d '{
|
|
215
|
+
"model": "auto",
|
|
216
|
+
"messages": [{"role": "user", "content": "Plan a multi-service refactor"}],
|
|
217
|
+
"router": {
|
|
218
|
+
"taskType": "agentic_reasoning",
|
|
219
|
+
"allowAsync": true
|
|
220
|
+
}
|
|
221
|
+
}'
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
Returns `202` with a job id when load is high; `200` with the result when run synchronously.
|
|
225
|
+
|
|
226
|
+
**Prefer a specific model and exclude named alternatives:**
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
curl -s http://127.0.0.1:11435/v1/chat/completions \
|
|
230
|
+
-H 'content-type: application/json' \
|
|
231
|
+
-H 'authorization: Bearer <api-key>' \
|
|
232
|
+
-d '{
|
|
233
|
+
"model": "auto",
|
|
234
|
+
"messages": [{"role": "user", "content": "Review this PR diff"}],
|
|
235
|
+
"router": {
|
|
236
|
+
"taskType": "code_review",
|
|
237
|
+
"preferredModels": ["gpt-oss:20b"],
|
|
238
|
+
"forbiddenModels": ["qwen2.5-coder:7b", "deepseek-coder:6.7b"]
|
|
239
|
+
}
|
|
240
|
+
}'
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
**Force sync, no async fallback even under load:**
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
curl -s http://127.0.0.1:11435/v1/chat/completions \
|
|
247
|
+
-H 'content-type: application/json' \
|
|
248
|
+
-H 'authorization: Bearer <api-key>' \
|
|
249
|
+
-d '{
|
|
250
|
+
"model": "auto",
|
|
251
|
+
"messages": [{"role": "user", "content": "Fix the off-by-one error"}],
|
|
252
|
+
"router": {
|
|
253
|
+
"taskType": "code_fix",
|
|
254
|
+
"mode": "sync",
|
|
255
|
+
"allowAsync": false
|
|
256
|
+
}
|
|
257
|
+
}'
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
**High priority request — jumps ahead in the queue:**
|
|
261
|
+
|
|
262
|
+
```bash
|
|
263
|
+
curl -s http://127.0.0.1:11435/v1/chat/completions \
|
|
264
|
+
-H 'content-type: application/json' \
|
|
265
|
+
-H 'authorization: Bearer <api-key>' \
|
|
266
|
+
-d '{
|
|
267
|
+
"model": "auto",
|
|
268
|
+
"messages": [{"role": "user", "content": "Summarize this log"}],
|
|
269
|
+
"router": {
|
|
270
|
+
"taskType": "summarize",
|
|
271
|
+
"priority": "high"
|
|
272
|
+
}
|
|
273
|
+
}'
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
**GPU-only — reject if model would run on CPU or with a CPU/GPU split:**
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
curl -s http://127.0.0.1:11435/v1/chat/completions \
|
|
280
|
+
-H 'content-type: application/json' \
|
|
281
|
+
-H 'authorization: Bearer <api-key>' \
|
|
282
|
+
-d '{
|
|
283
|
+
"model": "auto",
|
|
284
|
+
"messages": [{"role": "user", "content": "Generate a REST API scaffold"}],
|
|
285
|
+
"router": {
|
|
286
|
+
"taskType": "code_generate",
|
|
287
|
+
"requireGpuOnly": true
|
|
288
|
+
}
|
|
289
|
+
}'
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
Returns `503` if no GPU-only candidate is available.
|
|
293
|
+
|
|
294
|
+
**Check what the router decided** — every `200` response includes a `router` object:
|
|
295
|
+
|
|
296
|
+
```json
|
|
297
|
+
{
|
|
298
|
+
"router": {
|
|
299
|
+
"mode": "sync",
|
|
300
|
+
"taskType": "agentic_reasoning",
|
|
301
|
+
"selectedModel": "gpt-oss:20b",
|
|
302
|
+
"fallbackModels": ["gpt-oss:20b", "qwen2.5-coder:7b"],
|
|
303
|
+
"queueTimeMs": 3,
|
|
304
|
+
"executionTimeMs": 8420,
|
|
305
|
+
"decisionReason": "Selected gpt-oss:20b for agentic_reasoning with score 290.0"
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
`decisionReason` includes the winning score, which helps diagnose unexpected model selection — compare it against the scoring table above to see which component tipped the balance.
|
|
311
|
+
|
|
99
312
|
## Config Reference
|
|
100
313
|
|
|
101
314
|
Lookup order:
|
package/dist/cli.js
CHANGED
|
@@ -158,8 +158,7 @@ var modelSpecSchema = z2.object({
|
|
|
158
158
|
timeoutMs: z2.number().int().positive(),
|
|
159
159
|
costClass: z2.enum(["low", "medium", "high"]).default("medium"),
|
|
160
160
|
exclusive: z2.boolean().default(false),
|
|
161
|
-
allowWhenBusy: z2.boolean().default(false)
|
|
162
|
-
tags: z2.array(z2.string()).default([])
|
|
161
|
+
allowWhenBusy: z2.boolean().default(false)
|
|
163
162
|
});
|
|
164
163
|
var appConfigSchema = z2.object({
|
|
165
164
|
server: z2.object({
|
|
@@ -363,7 +362,6 @@ models:
|
|
|
363
362
|
costClass: low
|
|
364
363
|
exclusive: false
|
|
365
364
|
allowWhenBusy: true
|
|
366
|
-
tags: [general]
|
|
367
365
|
routes:
|
|
368
366
|
triage: [llama3.2:3b]
|
|
369
367
|
simple_chat: [llama3.2:3b]
|
|
@@ -961,8 +959,7 @@ function buildModelSpec(name, role, sizeGb, cpuOnly) {
|
|
|
961
959
|
timeoutMs: heavy ? 3e5 : code ? 18e4 : 9e4,
|
|
962
960
|
costClass: heavy ? "high" : code ? "medium" : "low",
|
|
963
961
|
exclusive: heavy,
|
|
964
|
-
allowWhenBusy: !heavy
|
|
965
|
-
tags: tagsForRole(role)
|
|
962
|
+
allowWhenBusy: !heavy
|
|
966
963
|
};
|
|
967
964
|
}
|
|
968
965
|
function purposesForRole(role) {
|
|
@@ -980,21 +977,6 @@ function purposesForRole(role) {
|
|
|
980
977
|
return ["triage", "simple_chat", "summarize"];
|
|
981
978
|
}
|
|
982
979
|
}
|
|
983
|
-
function tagsForRole(role) {
|
|
984
|
-
switch (role) {
|
|
985
|
-
case "code":
|
|
986
|
-
return ["code", "fallback"];
|
|
987
|
-
case "review":
|
|
988
|
-
return ["code", "review"];
|
|
989
|
-
case "heavy":
|
|
990
|
-
return ["reasoning", "large_context"];
|
|
991
|
-
case "tool":
|
|
992
|
-
return ["tool_use"];
|
|
993
|
-
case "fast":
|
|
994
|
-
default:
|
|
995
|
-
return ["fast", "chat"];
|
|
996
|
-
}
|
|
997
|
-
}
|
|
998
980
|
function generateRoutes(models) {
|
|
999
981
|
const fast = models.filter((model) => model.costClass === "low").map((model) => model.name);
|
|
1000
982
|
const code = models.filter((model) => model.purpose.includes("code_generate")).map((model) => model.name);
|
|
@@ -1550,7 +1532,6 @@ var RoutingEngine = class {
|
|
|
1550
1532
|
score += Math.max(0, 50 - routeIndex * 8);
|
|
1551
1533
|
score += model.priority;
|
|
1552
1534
|
if (model.purpose.includes(context.classification.taskType)) score += 25;
|
|
1553
|
-
if (model.tags.includes(context.classification.taskType)) score += 15;
|
|
1554
1535
|
if (preferredIndex >= 0) score += 80 - preferredIndex * 10;
|
|
1555
1536
|
if (loaded) score += 20;
|
|
1556
1537
|
if (context.classification.complexity === "heavy" && model.costClass === "high") score += 20;
|
|
@@ -1572,7 +1553,7 @@ var RoutingEngine = class {
|
|
|
1572
1553
|
for (const name of context.router.preferredModels) names.add(name);
|
|
1573
1554
|
for (const name of routeNames) names.add(name);
|
|
1574
1555
|
for (const model of this.config.models) {
|
|
1575
|
-
if (model.purpose.includes(context.classification.taskType)
|
|
1556
|
+
if (model.purpose.includes(context.classification.taskType)) {
|
|
1576
1557
|
names.add(model.name);
|
|
1577
1558
|
}
|
|
1578
1559
|
}
|