ollama-agent-router 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -5
- package/dist/cli.js +235 -10
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +95 -30
- package/dist/index.js +231 -6
- package/dist/index.js.map +1 -1
- package/docs/kong-runtime-contract-plan.md +415 -0
- package/examples/gex44.yaml +1 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -18,10 +18,25 @@ Request flow:
|
|
|
18
18
|
|
|
19
19
|
## Quick Start
|
|
20
20
|
|
|
21
|
+
Install with Homebrew on macOS or Linux:
|
|
22
|
+
|
|
21
23
|
```bash
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
24
|
+
brew install ExeconOne/tap/ollama-agent-router
|
|
25
|
+
ollama-agent-router configure
|
|
26
|
+
ollama-agent-router serve --config ollama-agent-router.yaml
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Or install from the APT repository on Debian/Ubuntu:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
curl -fsSL https://execonone.github.io/ollama-agent-router/apt/gpg.key \
|
|
33
|
+
| sudo gpg --dearmor -o /usr/share/keyrings/ollama-agent-router.gpg
|
|
34
|
+
|
|
35
|
+
echo "deb [signed-by=/usr/share/keyrings/ollama-agent-router.gpg] https://execonone.github.io/ollama-agent-router/apt stable main" \
|
|
36
|
+
| sudo tee /etc/apt/sources.list.d/ollama-agent-router.list
|
|
37
|
+
|
|
38
|
+
sudo apt-get update
|
|
39
|
+
sudo apt-get install ollama-agent-router
|
|
25
40
|
ollama-agent-router configure
|
|
26
41
|
ollama-agent-router serve --config ollama-agent-router.yaml
|
|
27
42
|
```
|
|
@@ -101,6 +116,7 @@ Server options:
|
|
|
101
116
|
|
|
102
117
|
```yaml
|
|
103
118
|
server:
|
|
119
|
+
nodeId: local
|
|
104
120
|
host: 127.0.0.1
|
|
105
121
|
port: 11435
|
|
106
122
|
basePath: /
|
|
@@ -112,12 +128,13 @@ server:
|
|
|
112
128
|
caPath:
|
|
113
129
|
```
|
|
114
130
|
|
|
115
|
-
Set `server.port` to choose the listening port. Set `server.basePath` to expose every router endpoint under a prefix, for example `/ollama-router`; then chat completions move to `/ollama-router/v1/chat/completions`, health to `/ollama-router/health`, and jobs to `/ollama-router/v1/jobs/{jobId}`.
|
|
131
|
+
Set `server.nodeId` to a stable machine/runtime id when the router is used behind Kong; it defaults to `local`. It is embedded in new async job ids so a gateway can route job status/result requests back to the right node-router. Allowed characters are letters, numbers, dots, and dashes (underscores are not allowed). Set `server.port` to choose the listening port. Set `server.basePath` to expose every router endpoint under a prefix, for example `/ollama-router`; then chat completions move to `/ollama-router/v1/chat/completions`, health to `/ollama-router/health`, and jobs to `/ollama-router/v1/jobs/{jobId}`.
|
|
116
132
|
|
|
117
133
|
To run HTTPS directly from the router, set `server.https.enabled: true` and provide PEM certificate and key paths:
|
|
118
134
|
|
|
119
135
|
```yaml
|
|
120
136
|
server:
|
|
137
|
+
nodeId: gex44-a
|
|
121
138
|
host: 0.0.0.0
|
|
122
139
|
port: 11435
|
|
123
140
|
basePath: /ollama-router
|
|
@@ -172,17 +189,68 @@ Status endpoints:
|
|
|
172
189
|
curl http://127.0.0.1:11435/health
|
|
173
190
|
curl http://127.0.0.1:11435/metrics
|
|
174
191
|
curl http://127.0.0.1:11435/v1/router/status
|
|
192
|
+
curl http://127.0.0.1:11435/v1/router/capabilities
|
|
193
|
+
curl http://127.0.0.1:11435/v1/router/runtime
|
|
175
194
|
curl http://127.0.0.1:11435/v1/router/models
|
|
176
195
|
curl http://127.0.0.1:11435/v1/router/gpu
|
|
177
196
|
```
|
|
178
197
|
|
|
198
|
+
## Kong Runtime Agent API
|
|
199
|
+
|
|
200
|
+
When used with `kong-ollama-router`, this process acts as a local runtime agent. Kong owns public request validation, classification, model selection, and response enrichment. The node-router supplies machine-local state and executes the model selected by Kong.
|
|
201
|
+
|
|
202
|
+
Kong-facing endpoints:
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
curl http://127.0.0.1:11435/v1/router/capabilities
|
|
206
|
+
curl http://127.0.0.1:11435/v1/router/runtime
|
|
207
|
+
curl -X POST http://127.0.0.1:11435/v1/router/execute
|
|
208
|
+
curl -X POST http://127.0.0.1:11435/v1/router/jobs
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
`GET /v1/router/capabilities` returns the stable routing config snapshot: `nodeId`, package version, router defaults, GPU policy, queue defaults, configured models, and routes. It does not call Ollama or GPU probes, so Kong can cache it for longer periods.
|
|
212
|
+
|
|
213
|
+
`GET /v1/router/runtime` returns volatile runtime state: Ollama reachability, loaded models, GPU snapshot, queue depth/running counts, and retained job counters. Kong should cache it only briefly.
|
|
214
|
+
|
|
215
|
+
`POST /v1/router/execute` runs a request on a model already selected by Kong. It does not classify or route again:
|
|
216
|
+
|
|
217
|
+
```json
|
|
218
|
+
{
|
|
219
|
+
"selectedModel": "deepseek-coder:6.7b",
|
|
220
|
+
"request": {
|
|
221
|
+
"model": "deepseek-coder:6.7b",
|
|
222
|
+
"messages": [{"role": "user", "content": "Review this TypeScript function"}],
|
|
223
|
+
"stream": false
|
|
224
|
+
},
|
|
225
|
+
"routerDecision": {
|
|
226
|
+
"taskType": "code_review",
|
|
227
|
+
"score": 250,
|
|
228
|
+
"reason": "Selected by Kong"
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
The response is wrapped so Kong can add its own public `router` metadata:
|
|
234
|
+
|
|
235
|
+
```json
|
|
236
|
+
{
|
|
237
|
+
"result": {},
|
|
238
|
+
"nodeId": "gex44-a",
|
|
239
|
+
"selectedModel": "deepseek-coder:6.7b",
|
|
240
|
+
"queueTimeMs": 4,
|
|
241
|
+
"executionTimeMs": 1200
|
|
242
|
+
}
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
`POST /v1/router/jobs` creates an async job on the selected model. New job ids include the node id, for example `job_gex44-a_01JABCDEF123`, so Kong can route later `GET /v1/jobs/{jobId}` and `GET /v1/jobs/{jobId}/result` calls to the owning node-router.
|
|
246
|
+
|
|
179
247
|
## Async Jobs
|
|
180
248
|
|
|
181
249
|
When a selected model is busy or the router detects heavy load and `allowAsync=true`, the API returns:
|
|
182
250
|
|
|
183
251
|
```json
|
|
184
252
|
{
|
|
185
|
-
"id": "
|
|
253
|
+
"id": "job_gex44-a_01JABCDEF123",
|
|
186
254
|
"object": "router.job",
|
|
187
255
|
"status": "queued",
|
|
188
256
|
"message": "Heavy load. Job accepted for asynchronous processing."
|
|
@@ -307,6 +375,7 @@ The project uses TypeScript, ESM, Express, zod, pino, p-queue, nanoid, and Vites
|
|
|
307
375
|
Design notes:
|
|
308
376
|
|
|
309
377
|
- CLI configuration wizard HLD: `docs/cli-configurator-hld.md`
|
|
378
|
+
- Kong runtime agent contract plan: `docs/kong-runtime-contract-plan.md`
|
|
310
379
|
|
|
311
380
|
## Release Guide
|
|
312
381
|
|
package/dist/cli.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
// src/cli-program.ts
|
|
4
4
|
import { readFile as readFile4 } from "fs/promises";
|
|
5
|
-
import { readFileSync } from "fs";
|
|
5
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
6
6
|
import { Command } from "commander";
|
|
7
7
|
|
|
8
8
|
// src/config.ts
|
|
@@ -46,6 +46,7 @@ var modelSpecSchema = z.object({
|
|
|
46
46
|
});
|
|
47
47
|
var appConfigSchema = z.object({
|
|
48
48
|
server: z.object({
|
|
49
|
+
nodeId: z.string().regex(/^[a-zA-Z0-9.-]+$/, "server.nodeId may contain only letters, numbers, dots, and dashes").default("local"),
|
|
49
50
|
host: z.string().min(1),
|
|
50
51
|
port: z.number().int().min(1).max(65535),
|
|
51
52
|
basePath: z.string().min(1).default("/"),
|
|
@@ -152,6 +153,7 @@ async function writeDefaultConfig(path) {
|
|
|
152
153
|
await writeFile(target, defaultConfigYaml, "utf8");
|
|
153
154
|
}
|
|
154
155
|
var defaultConfigYaml = `server:
|
|
156
|
+
nodeId: local
|
|
155
157
|
host: 127.0.0.1
|
|
156
158
|
port: 11435
|
|
157
159
|
basePath: /
|
|
@@ -249,11 +251,13 @@ var StaticGpuMonitor = class {
|
|
|
249
251
|
async snapshot() {
|
|
250
252
|
if (this.config.provider === "none") return void 0;
|
|
251
253
|
return {
|
|
254
|
+
provider: this.config.provider,
|
|
252
255
|
name: this.config.name ?? "Configured GPU",
|
|
253
256
|
vramTotalMb: this.config.vramTotalMb,
|
|
254
257
|
vramUsedMb: 0,
|
|
255
258
|
vramFreeMb: this.config.vramTotalMb,
|
|
256
|
-
utilizationPct: 0
|
|
259
|
+
utilizationPct: 0,
|
|
260
|
+
snapshotAgeMs: 0
|
|
257
261
|
};
|
|
258
262
|
}
|
|
259
263
|
};
|
|
@@ -280,11 +284,13 @@ function parseNvidiaSmi(output2) {
|
|
|
280
284
|
return output2.split(/\r?\n/).map((line) => line.trim()).filter(Boolean).map((line) => {
|
|
281
285
|
const [name, total, used, free, utilization] = line.split(",").map((part) => part.trim());
|
|
282
286
|
return {
|
|
287
|
+
provider: "nvidia",
|
|
283
288
|
name,
|
|
284
289
|
vramTotalMb: Number(total),
|
|
285
290
|
vramUsedMb: Number(used),
|
|
286
291
|
vramFreeMb: Number(free),
|
|
287
|
-
utilizationPct: Number(utilization)
|
|
292
|
+
utilizationPct: Number(utilization),
|
|
293
|
+
snapshotAgeMs: 0
|
|
288
294
|
};
|
|
289
295
|
}).filter((gpu) => gpu.name && Number.isFinite(gpu.vramTotalMb));
|
|
290
296
|
}
|
|
@@ -341,6 +347,20 @@ var HttpOllamaClient = class {
|
|
|
341
347
|
return [];
|
|
342
348
|
}
|
|
343
349
|
}
|
|
350
|
+
async health() {
|
|
351
|
+
const controller = new AbortController();
|
|
352
|
+
const timer = setTimeout(() => controller.abort(), 1e3);
|
|
353
|
+
try {
|
|
354
|
+
const response = await fetch(new URL(`${this.config.nativeApiBasePath}/tags`, this.config.baseUrl), {
|
|
355
|
+
signal: controller.signal
|
|
356
|
+
});
|
|
357
|
+
return response.ok;
|
|
358
|
+
} catch {
|
|
359
|
+
return false;
|
|
360
|
+
} finally {
|
|
361
|
+
clearTimeout(timer);
|
|
362
|
+
}
|
|
363
|
+
}
|
|
344
364
|
};
|
|
345
365
|
var OllamaHttpError = class extends Error {
|
|
346
366
|
constructor(statusCode, payload) {
|
|
@@ -488,6 +508,7 @@ function generateConfigFromDetection(detection, answers = {}) {
|
|
|
488
508
|
const serverHttps = typeof httpsAnswer === "boolean" ? { enabled: httpsAnswer } : { enabled: false, ...httpsAnswer ?? {} };
|
|
489
509
|
const config = {
|
|
490
510
|
server: {
|
|
511
|
+
nodeId: "local",
|
|
491
512
|
host: "127.0.0.1",
|
|
492
513
|
port: 11435,
|
|
493
514
|
basePath: "/",
|
|
@@ -949,15 +970,17 @@ async function fileExists(path) {
|
|
|
949
970
|
// src/job-store.ts
|
|
950
971
|
import { nanoid } from "nanoid";
|
|
951
972
|
var InMemoryJobStore = class {
|
|
952
|
-
constructor(config) {
|
|
973
|
+
constructor(config, nodeId = "local") {
|
|
953
974
|
this.config = config;
|
|
975
|
+
this.nodeId = nodeId;
|
|
954
976
|
}
|
|
955
977
|
config;
|
|
978
|
+
nodeId;
|
|
956
979
|
jobs = /* @__PURE__ */ new Map();
|
|
957
980
|
create(input2) {
|
|
958
981
|
const now = /* @__PURE__ */ new Date();
|
|
959
982
|
const record = {
|
|
960
|
-
id: `job_${nanoid(16)}`,
|
|
983
|
+
id: `job_${this.nodeId}_${nanoid(16)}`,
|
|
961
984
|
status: "queued",
|
|
962
985
|
task_type: input2.taskType,
|
|
963
986
|
selected_model: input2.selectedModel,
|
|
@@ -981,6 +1004,25 @@ var InMemoryJobStore = class {
|
|
|
981
1004
|
list(limit = 50) {
|
|
982
1005
|
return [...this.jobs.values()].sort((a, b) => b.created_at.localeCompare(a.created_at)).slice(0, limit).map((job) => ({ ...job }));
|
|
983
1006
|
}
|
|
1007
|
+
summary() {
|
|
1008
|
+
const counts = {
|
|
1009
|
+
queued: 0,
|
|
1010
|
+
running: 0,
|
|
1011
|
+
succeededRetained: 0,
|
|
1012
|
+
failedRetained: 0,
|
|
1013
|
+
cancelledRetained: 0,
|
|
1014
|
+
expiredRetained: 0
|
|
1015
|
+
};
|
|
1016
|
+
for (const job of this.jobs.values()) {
|
|
1017
|
+
if (job.status === "queued") counts.queued += 1;
|
|
1018
|
+
if (job.status === "running") counts.running += 1;
|
|
1019
|
+
if (job.status === "succeeded") counts.succeededRetained += 1;
|
|
1020
|
+
if (job.status === "failed") counts.failedRetained += 1;
|
|
1021
|
+
if (job.status === "cancelled") counts.cancelledRetained += 1;
|
|
1022
|
+
if (job.status === "expired") counts.expiredRetained += 1;
|
|
1023
|
+
}
|
|
1024
|
+
return counts;
|
|
1025
|
+
}
|
|
984
1026
|
markRunning(id) {
|
|
985
1027
|
const job = this.jobs.get(id);
|
|
986
1028
|
if (!job || job.status !== "queued" && job.status !== "running") return this.get(id);
|
|
@@ -1152,6 +1194,7 @@ var QueueManager = class {
|
|
|
1152
1194
|
// src/server.ts
|
|
1153
1195
|
import http from "http";
|
|
1154
1196
|
import https from "https";
|
|
1197
|
+
import { readFileSync } from "fs";
|
|
1155
1198
|
import { readFile as readFile3 } from "fs/promises";
|
|
1156
1199
|
import express from "express";
|
|
1157
1200
|
import { pinoHttp } from "pino-http";
|
|
@@ -1403,6 +1446,7 @@ var logger = pino({
|
|
|
1403
1446
|
});
|
|
1404
1447
|
|
|
1405
1448
|
// src/server.ts
|
|
1449
|
+
var packageJson = JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"));
|
|
1406
1450
|
var chatRequestSchema = z3.object({
|
|
1407
1451
|
model: z3.string().optional(),
|
|
1408
1452
|
messages: z3.array(z3.object({ role: z3.string(), content: z3.unknown() })).min(1),
|
|
@@ -1419,6 +1463,32 @@ var chatRequestSchema = z3.object({
|
|
|
1419
1463
|
requireGpuOnly: z3.boolean().optional()
|
|
1420
1464
|
}).optional()
|
|
1421
1465
|
}).passthrough();
|
|
1466
|
+
// Optional classification hints accepted in the body of a router job request
// (see createRouterJobSchema). Every field is optional; the whole object may
// be omitted as well.
var classificationSchema = z3.object({
  // Must be one of the values listed in `taskTypes`.
  taskType: z3.enum(taskTypes).optional(),
  complexity: z3.enum(["light", "medium", "heavy"]).optional(),
  requiresLargeContext: z3.boolean().optional(),
  requiresToolUse: z3.boolean().optional(),
  // Constrained to the closed interval [0, 1].
  confidence: z3.number().min(0).max(1).optional()
}).optional();
|
|
1473
|
+
// Optional routing decision supplied by the caller. The listed fields are
// validated when present; `.passthrough()` keeps any additional keys on the
// parsed object instead of stripping them.
var routerDecisionSchema = z3.object({
  taskType: z3.enum(taskTypes).optional(),
  score: z3.number().optional(),
  reason: z3.string().optional(),
  // Used as a fallback priority by the execute/jobs handlers when the
  // top-level `priority` field is absent.
  priority: z3.enum(["low", "normal", "high"]).optional()
}).passthrough().optional();
|
|
1479
|
+
// Request body for POST /v1/router/execute: run `request` on an already
// selected model.
var executeRequestSchema = z3.object({
  // Non-empty model name; the handler looks it up among the configured models.
  selectedModel: z3.string().min(1),
  // The chat request to execute, validated with the same schema as
  // /v1/chat/completions.
  request: chatRequestSchema,
  priority: z3.enum(["low", "normal", "high"]).optional(),
  routerDecision: routerDecisionSchema
});
|
|
1485
|
+
// Request body for POST /v1/router/jobs: enqueue `request` as an async job on
// an already selected model. Same shape as executeRequestSchema plus an
// optional `classification`.
var createRouterJobSchema = z3.object({
  // Non-empty model name; the handler looks it up among the configured models.
  selectedModel: z3.string().min(1),
  request: chatRequestSchema,
  // Missing classification fields are filled in by normalizeClassification.
  classification: classificationSchema,
  priority: z3.enum(["low", "normal", "high"]).optional(),
  routerDecision: routerDecisionSchema
});
|
|
1422
1492
|
function createApp(config, deps) {
|
|
1423
1493
|
const app = express();
|
|
1424
1494
|
const api = express.Router();
|
|
@@ -1430,12 +1500,29 @@ function createApp(config, deps) {
|
|
|
1430
1500
|
api.get("/health", (_req, res) => {
|
|
1431
1501
|
res.json({ status: "ok", service: "ollama-agent-router" });
|
|
1432
1502
|
});
|
|
1433
|
-
api.get("/metrics", (_req, res) => {
|
|
1503
|
+
api.get("/metrics", async (_req, res) => {
|
|
1434
1504
|
const snapshot = deps.queue.snapshot();
|
|
1505
|
+
const jobSummary = deps.jobs.summary();
|
|
1506
|
+
const jobsByStatusAndModel = countJobsByStatusAndModel(deps.jobs.list(Number.MAX_SAFE_INTEGER));
|
|
1507
|
+
const [gpu, ollamaReachable] = await Promise.all([safeGpu(deps.gpu), safeOllamaReachable(deps.ollama)]);
|
|
1435
1508
|
res.type("text/plain").send(
|
|
1436
1509
|
[
|
|
1437
1510
|
`oar_queue_global_queued ${snapshot.globalQueued}`,
|
|
1438
1511
|
`oar_queue_global_running ${snapshot.globalRunning}`,
|
|
1512
|
+
`oar_ollama_reachable ${ollamaReachable ? 1 : 0}`,
|
|
1513
|
+
...gpu ? [
|
|
1514
|
+
`oar_gpu_vram_free_mb ${gpu.vramFreeMb}`,
|
|
1515
|
+
`oar_gpu_utilization_pct ${gpu.utilizationPct}`
|
|
1516
|
+
] : [],
|
|
1517
|
+
`oar_jobs_total{status="queued"} ${jobSummary.queued}`,
|
|
1518
|
+
`oar_jobs_total{status="running"} ${jobSummary.running}`,
|
|
1519
|
+
`oar_jobs_total{status="succeeded"} ${jobSummary.succeededRetained}`,
|
|
1520
|
+
`oar_jobs_total{status="failed"} ${jobSummary.failedRetained}`,
|
|
1521
|
+
`oar_jobs_total{status="cancelled"} ${jobSummary.cancelledRetained}`,
|
|
1522
|
+
`oar_jobs_total{status="expired"} ${jobSummary.expiredRetained}`,
|
|
1523
|
+
...jobsByStatusAndModel.map(
|
|
1524
|
+
(item) => `oar_jobs_total{status="${escapeMetricLabel(item.status)}",model="${escapeMetricLabel(item.model)}"} ${item.count}`
|
|
1525
|
+
),
|
|
1439
1526
|
...snapshot.byModel.flatMap((item) => [
|
|
1440
1527
|
`oar_model_queue_depth{model="${escapeMetricLabel(item.model)}"} ${item.queued}`,
|
|
1441
1528
|
`oar_model_running{model="${escapeMetricLabel(item.model)}"} ${item.running}`
|
|
@@ -1443,9 +1530,20 @@ function createApp(config, deps) {
|
|
|
1443
1530
|
].join("\n")
|
|
1444
1531
|
);
|
|
1445
1532
|
});
|
|
1533
|
+
api.get("/v1/router/capabilities", (_req, res) => {
|
|
1534
|
+
res.json(buildCapabilities(config));
|
|
1535
|
+
});
|
|
1536
|
+
api.get("/v1/router/runtime", async (_req, res, next) => {
|
|
1537
|
+
try {
|
|
1538
|
+
res.json(await buildRuntimeSnapshot(config, deps));
|
|
1539
|
+
} catch (error) {
|
|
1540
|
+
next(error);
|
|
1541
|
+
}
|
|
1542
|
+
});
|
|
1446
1543
|
api.get("/v1/router/status", async (_req, res, next) => {
|
|
1447
1544
|
try {
|
|
1448
1545
|
res.json({
|
|
1546
|
+
nodeId: config.server.nodeId,
|
|
1449
1547
|
service: "ollama-agent-router",
|
|
1450
1548
|
queue: deps.queue.snapshot(),
|
|
1451
1549
|
gpu: await safeGpu(deps.gpu),
|
|
@@ -1499,6 +1597,63 @@ function createApp(config, deps) {
|
|
|
1499
1597
|
if (!job) return res.status(404).json({ error: { message: "Job not found" } });
|
|
1500
1598
|
return res.json(job);
|
|
1501
1599
|
});
|
|
1600
|
+
api.post("/v1/router/execute", async (req, res, next) => {
|
|
1601
|
+
try {
|
|
1602
|
+
const payload = executeRequestSchema.parse(req.body);
|
|
1603
|
+
if (payload.request.stream) {
|
|
1604
|
+
return res.status(400).json({ error: { message: "Streaming is not supported by ollama-agent-router v1" } });
|
|
1605
|
+
}
|
|
1606
|
+
const model = findConfiguredModel(config, payload.selectedModel);
|
|
1607
|
+
if (!model) {
|
|
1608
|
+
return res.status(404).json({ error: { message: `Unknown configured model: ${payload.selectedModel}` } });
|
|
1609
|
+
}
|
|
1610
|
+
const priorityName = payload.priority ?? payload.routerDecision?.priority ?? config.queue.defaultPriority;
|
|
1611
|
+
const output2 = await deps.queue.runSync({
|
|
1612
|
+
model,
|
|
1613
|
+
request: payload.request,
|
|
1614
|
+
priority: priorityWeights[priorityName],
|
|
1615
|
+
timeoutMs: payload.request.router?.maxExecutionTimeMs ?? config.queue.timeoutMs
|
|
1616
|
+
});
|
|
1617
|
+
return res.json({
|
|
1618
|
+
result: output2.result,
|
|
1619
|
+
nodeId: config.server.nodeId,
|
|
1620
|
+
selectedModel: model.name,
|
|
1621
|
+
queueTimeMs: output2.queueTimeMs,
|
|
1622
|
+
executionTimeMs: output2.executionTimeMs
|
|
1623
|
+
});
|
|
1624
|
+
} catch (error) {
|
|
1625
|
+
next(error);
|
|
1626
|
+
}
|
|
1627
|
+
});
|
|
1628
|
+
api.post("/v1/router/jobs", (req, res, next) => {
|
|
1629
|
+
try {
|
|
1630
|
+
const payload = createRouterJobSchema.parse(req.body);
|
|
1631
|
+
if (payload.request.stream) {
|
|
1632
|
+
return res.status(400).json({ error: { message: "Streaming is not supported by ollama-agent-router v1" } });
|
|
1633
|
+
}
|
|
1634
|
+
const model = findConfiguredModel(config, payload.selectedModel);
|
|
1635
|
+
if (!model) {
|
|
1636
|
+
return res.status(404).json({ error: { message: `Unknown configured model: ${payload.selectedModel}` } });
|
|
1637
|
+
}
|
|
1638
|
+
const classification = normalizeClassification(config, payload.classification);
|
|
1639
|
+
const priorityName = payload.priority ?? payload.routerDecision?.priority ?? config.queue.defaultPriority;
|
|
1640
|
+
const job = deps.queue.enqueueAsync({
|
|
1641
|
+
model,
|
|
1642
|
+
request: payload.request,
|
|
1643
|
+
classification,
|
|
1644
|
+
priority: priorityWeights[priorityName]
|
|
1645
|
+
});
|
|
1646
|
+
return res.status(202).json({
|
|
1647
|
+
id: job.id,
|
|
1648
|
+
status: "queued",
|
|
1649
|
+
position: job.position,
|
|
1650
|
+
nodeId: config.server.nodeId,
|
|
1651
|
+
selectedModel: model.name
|
|
1652
|
+
});
|
|
1653
|
+
} catch (error) {
|
|
1654
|
+
next(error);
|
|
1655
|
+
}
|
|
1656
|
+
});
|
|
1502
1657
|
api.post("/v1/chat/completions", async (req, res, next) => {
|
|
1503
1658
|
try {
|
|
1504
1659
|
const request = chatRequestSchema.parse(req.body);
|
|
@@ -1567,11 +1722,74 @@ function createApp(config, deps) {
|
|
|
1567
1722
|
app.use(normalizeBasePath(config.server.basePath), api);
|
|
1568
1723
|
app.use((error, _req, res, _next) => {
|
|
1569
1724
|
const message = error instanceof Error ? error.message : String(error);
|
|
1570
|
-
const status = error instanceof z3.ZodError ? 400 : 500;
|
|
1725
|
+
const status = error instanceof z3.ZodError ? 400 : error instanceof OllamaHttpError ? 502 : 500;
|
|
1571
1726
|
res.status(status).json({ error: { message } });
|
|
1572
1727
|
});
|
|
1573
1728
|
return app;
|
|
1574
1729
|
}
|
|
1730
|
+
/**
 * Assembles the payload served by GET /v1/router/capabilities.
 *
 * Everything is read from the in-memory config and the package metadata; no
 * Ollama or GPU probe is invoked here.
 *
 * @param {object} config Loaded application config.
 * @returns {object} Node id, fixed "ok" status, package version, router
 *   settings, a subset of the GPU and queue settings, models and routes.
 */
function buildCapabilities(config) {
  const { server, router, gpu, queue, models, routes } = config;
  // Only these two GPU settings are exposed.
  const gpuPolicy = {
    requireGpuOnlyByDefault: gpu.requireGpuOnlyByDefault,
    vramSafetyReserveMb: gpu.vramSafetyReserveMb
  };
  // Only these two queue settings are exposed.
  const queueDefaults = {
    defaultPriority: queue.defaultPriority,
    timeoutMs: queue.timeoutMs
  };
  return {
    nodeId: server.nodeId,
    status: "ok",
    version: packageJson.version,
    router,
    gpu: gpuPolicy,
    queue: queueDefaults,
    models,
    routes
  };
}
|
|
1748
|
+
/**
 * Collects the payload served by GET /v1/router/runtime.
 *
 * The Ollama reachability check, the loaded-model listing and the GPU snapshot
 * are started together and awaited as a group.
 *
 * Status rules:
 *  - "unavailable" when Ollama is not reachable;
 *  - "degraded" when Ollama is reachable, GPU monitoring is enabled with a
 *    provider other than "none", and no GPU snapshot was obtained;
 *  - "ok" otherwise.
 *
 * @param {object} config Loaded application config.
 * @param {object} deps Runtime collaborators (`ollama`, `gpu`, `queue`, `jobs`).
 * @returns {Promise<object>} Runtime snapshot including an ISO timestamp.
 */
async function buildRuntimeSnapshot(config, deps) {
  const probes = [
    safeOllamaReachable(deps.ollama),
    safeLoadedModels(deps.ollama),
    safeGpu(deps.gpu)
  ];
  const [ollamaReachable, loadedModels, gpu] = await Promise.all(probes);

  let status = "unavailable";
  if (ollamaReachable) {
    const gpuExpected = config.gpu.monitor.enabled && config.gpu.provider !== "none";
    status = gpuExpected && !gpu ? "degraded" : "ok";
  }

  // Defaults come first so fields present on the snapshot itself take precedence.
  const gpuView = gpu ? { provider: config.gpu.provider, snapshotAgeMs: 0, ...gpu } : void 0;

  return {
    nodeId: config.server.nodeId,
    status,
    timestamp: new Date().toISOString(),
    ollama: {
      baseUrl: config.ollama.baseUrl,
      reachable: ollamaReachable
    },
    gpu: gpuView,
    loadedModels,
    queues: deps.queue.snapshot(),
    jobs: deps.jobs.summary()
  };
}
|
|
1769
|
+
/**
 * Looks up a model entry in `config.models` by exact name.
 *
 * @param {object} config Loaded application config.
 * @param {string} selectedModel Model name to look for.
 * @returns {object|undefined} The first entry whose `name` equals
 *   `selectedModel`, or `undefined` when none matches.
 */
function findConfiguredModel(config, selectedModel) {
  for (const candidate of config.models) {
    if (candidate.name === selectedModel) {
      return candidate;
    }
  }
  return undefined;
}
|
|
1772
|
+
/**
 * Fills in every classification field the caller left out.
 *
 * @param {object} config Loaded application config; `router.defaultTaskType`
 *   is only read when no task type was provided.
 * @param {object} [classification] Partial classification, may be omitted.
 * @returns {{taskType: *, complexity: *, requiresLargeContext: *,
 *   requiresToolUse: *, confidence: *}} Classification with all five fields set.
 */
function normalizeClassification(config, classification) {
  const provided = classification ?? {};
  // Fallbacks that do not depend on the config.
  const staticFallbacks = {
    complexity: "medium",
    requiresLargeContext: false,
    requiresToolUse: false,
    confidence: 1
  };
  const normalized = {
    taskType: provided.taskType ?? config.router.defaultTaskType
  };
  for (const [field, fallback] of Object.entries(staticFallbacks)) {
    normalized[field] = provided[field] ?? fallback;
  }
  return normalized;
}
|
|
1781
|
+
/**
 * Groups jobs by (status, model) and counts each group.
 *
 * A job without `selected_model` is counted under the model name "unknown".
 * Groups are returned in the order in which they were first encountered.
 *
 * @param {Iterable<object>} jobs Job records with `status` and `selected_model`.
 * @returns {Array<{status: *, model: *, count: number}>} One entry per group.
 */
function countJobsByStatusAndModel(jobs) {
  const buckets = new Map();
  for (const { status, selected_model: selectedModel } of jobs) {
    const model = selectedModel ?? "unknown";
    // NUL separates the two parts of the composite key.
    const bucketKey = `${status}\0${model}`;
    const bucket = buckets.get(bucketKey);
    if (bucket === undefined) {
      buckets.set(bucketKey, { status, model, count: 1 });
    } else {
      bucket.count += 1;
    }
  }
  return Array.from(buckets.values());
}
|
|
1575
1793
|
async function startServer(config, deps) {
|
|
1576
1794
|
const app = createApp(config, deps);
|
|
1577
1795
|
const server = await createHttpServer(config, app);
|
|
@@ -1633,18 +1851,25 @@ async function safeGpu(gpu) {
|
|
|
1633
1851
|
return void 0;
|
|
1634
1852
|
}
|
|
1635
1853
|
}
|
|
1854
|
+
/**
 * Asks the Ollama client whether it is healthy without ever throwing.
 *
 * @param {object} ollama Client exposing `health()`.
 * @returns {Promise<*>} Whatever `ollama.health()` resolves to, or `false`
 *   when the call throws or rejects.
 */
async function safeOllamaReachable(ollama) {
  let reachable = false;
  try {
    reachable = await ollama.health();
  } catch {
    // Any failure counts as "not reachable"; `reachable` stays false.
  }
  return reachable;
}
|
|
1636
1861
|
/**
 * Escapes a value for use inside a double-quoted Prometheus label.
 *
 * The text exposition format requires backslash, double quote and line feed
 * to be written as `\\`, `\"` and `\n`. A raw line feed would otherwise split
 * the sample across two lines and corrupt the scrape output.
 *
 * @param {*} label Label value; non-strings are converted with `String()`.
 * @returns {string} The escaped label value (without surrounding quotes).
 */
function escapeMetricLabel(label) {
  // Backslashes go first so the escapes added afterwards are not doubled.
  return String(label)
    .replaceAll("\\", "\\\\")
    .replaceAll('"', '\\"')
    .replaceAll("\n", "\\n");
}
|
|
1639
1864
|
|
|
1640
1865
|
// src/cli-program.ts
|
|
1641
|
-
var
|
|
1866
|
+
var packageJson2 = JSON.parse(readFileSync2(new URL("../package.json", import.meta.url), "utf8"));
|
|
1642
1867
|
function createProgram() {
|
|
1643
1868
|
const program = new Command();
|
|
1644
|
-
program.name("ollama-agent-router").alias("oar").description("Intelligent HTTP/CLI router for Ollama").version(
|
|
1869
|
+
program.name("ollama-agent-router").alias("oar").description("Intelligent HTTP/CLI router for Ollama").version(packageJson2.version, "-v, --version", "display version").option("-c, --config <path>", "config file path").option("-u, --url <url>", "router URL for client commands", "http://127.0.0.1:11435").option("--base-path <path>", "router API base path for client commands", "/");
|
|
1645
1870
|
program.command("serve").description("start the router server").option("-c, --config <path>", "config file path").action(async (options) => {
|
|
1646
1871
|
const { config, path } = await loadConfig(options.config ?? program.opts().config);
|
|
1647
|
-
const jobs = new InMemoryJobStore(config.jobs);
|
|
1872
|
+
const jobs = new InMemoryJobStore(config.jobs, config.server.nodeId);
|
|
1648
1873
|
const ollama = new HttpOllamaClient(config.ollama);
|
|
1649
1874
|
const gpu = new NvidiaGpuMonitor(config.gpu);
|
|
1650
1875
|
const queue = new QueueManager(config, ollama, jobs);
|