ollama-agent-router 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js ADDED
@@ -0,0 +1,1725 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/cli.ts
4
+ import { readFile as readFile4 } from "fs/promises";
5
+ import { Command } from "commander";
6
+
7
+ // src/config.ts
8
+ import { existsSync } from "fs";
9
+ import { access, readFile, writeFile, mkdir } from "fs/promises";
10
+ import { dirname, resolve } from "path";
11
+ import { homedir } from "os";
12
+ import YAML from "yaml";
13
+ import { z } from "zod";
14
+
15
+ // src/types.ts
16
// Task type identifiers. This array feeds `z.enum(...)` to build
// `taskTypeSchema`, which validates `router.defaultTaskType`.
var taskTypes = [
  "triage",
  "simple_chat",
  "summarize",
  "code_generate",
  "code_review",
  "code_fix",
  "agentic_reasoning",
  "large_context",
  "tool_use",
  "unknown"
];
28
+
29
+ // src/config.ts
30
// Enum schema accepting exactly the strings listed in `taskTypes`.
var taskTypeSchema = z.enum(taskTypes);
// Optional non-empty string. A YAML `null` (e.g. a key with no value) is
// mapped to `undefined` first so it is treated as "not provided".
var optionalStringSchema = z.preprocess((value) => value === null ? void 0 : value, z.string().min(1).optional());
// Shape of one entry of the `models` list in the config file.
var modelSpecSchema = z.object({
  name: z.string().min(1),
  sizeGb: z.number().positive(),
  purpose: z.array(z.string()).default([]),
  priority: z.number().default(50),
  maxConcurrent: z.number().int().positive(),
  defaultContext: z.number().int().positive(),
  maxContext: z.number().int().positive(),
  timeoutMs: z.number().int().positive(),
  costClass: z.enum(["low", "medium", "high"]).default("medium"),
  exclusive: z.boolean().default(false),
  allowWhenBusy: z.boolean().default(false),
  tags: z.array(z.string()).default([])
});
46
// Schema for the whole configuration document. Fields without `.default(...)`
// or `.optional()` must be present in the file.
var appConfigSchema = z.object({
  // Listener settings for the router itself.
  server: z.object({
    host: z.string().min(1),
    port: z.number().int().min(1).max(65535),
    basePath: z.string().min(1).default("/"),
    requestBodyLimit: z.string().min(1),
    // The cert/key requirement when `enabled` is true is enforced separately
    // in parseConfig, not by this schema.
    https: z.object({
      enabled: z.boolean().default(false),
      certPath: optionalStringSchema,
      keyPath: optionalStringSchema,
      caPath: optionalStringSchema
    }).default({ enabled: false })
  }),
  // Upstream Ollama endpoint.
  ollama: z.object({
    baseUrl: z.string().url(),
    openAiCompatiblePath: z.string().min(1).default("/v1/chat/completions"),
    nativeApiBasePath: z.string().min(1).default("/api"),
    keepAlive: z.string().default("5m"),
    requestTimeoutMs: z.number().int().positive()
  }),
  // GPU description and optional nvidia-smi monitoring.
  gpu: z.object({
    provider: z.enum(["none", "nvidia"]).default("none"),
    name: z.string().optional(),
    vramTotalMb: z.number().nonnegative(),
    vramSafetyReserveMb: z.number().nonnegative(),
    maxGpuUtilizationPct: z.number().min(1).max(100),
    requireGpuOnlyByDefault: z.boolean().default(false),
    monitor: z.object({
      enabled: z.boolean().default(false),
      intervalMs: z.number().int().positive(),
      nvidiaSmiPath: z.string().min(1).default("nvidia-smi")
    })
  }),
  // Routing behaviour and request classification.
  router: z.object({
    defaultMode: z.enum(["auto", "sync", "async"]).default("auto"),
    syncMaxQueueTimeMs: z.number().int().nonnegative(),
    heavyLoadQueueDepth: z.number().int().nonnegative(),
    heavyLoadGpuFreeMbThreshold: z.number().int().nonnegative(),
    defaultTaskType: taskTypeSchema.default("unknown"),
    classification: z.object({
      mode: z.enum(["heuristic", "model"]).default("heuristic"),
      optionalClassifierModel: z.string().optional(),
      classifierTimeoutMs: z.number().int().positive()
    })
  }),
  // Job storage; "memory" is the only accepted store.
  jobs: z.object({
    store: z.literal("memory").default("memory"),
    resultTtlSeconds: z.number().int().positive(),
    maxAttempts: z.number().int().positive(),
    cleanupIntervalMs: z.number().int().positive()
  }),
  // At least one model is required.
  models: z.array(modelSpecSchema).min(1),
  // Map from a string key to a list of model names. Keys are not restricted
  // to `taskTypes` here; name validity is checked in parseConfig.
  routes: z.record(z.string(), z.array(z.string())),
  // Queue limits.
  queue: z.object({
    globalMaxConcurrent: z.number().int().positive(),
    globalMaxQueued: z.number().int().nonnegative(),
    perUserMaxQueued: z.number().int().nonnegative(),
    defaultPriority: z.enum(["low", "normal", "high"]).default("normal"),
    timeoutMs: z.number().int().positive()
  })
});
107
/**
 * Lists the config file locations to probe, in priority order.
 *
 * @param {string} [explicitPath] - Caller-supplied path; when truthy it is
 *   placed ahead of the built-in locations.
 * @returns {string[]} Candidate paths (not yet resolved or checked).
 */
var configLookupOrder = (explicitPath) => {
  const candidates = [
    "./ollama-agent-router.yaml",
    `${homedir()}/.config/ollama-agent-router/config.yaml`,
    "/etc/ollama-agent-router/config.yaml"
  ];
  if (explicitPath) {
    candidates.unshift(explicitPath);
  }
  return candidates;
};
115
/**
 * Returns the first accessible config file from the lookup order.
 *
 * @param {string} [explicitPath] - Optional path tried before the defaults.
 * @returns {Promise<string>} Absolute path of the first candidate that
 *   `access` accepts.
 * @throws {Error} When no candidate is accessible; the message lists the
 *   candidates as given (unresolved).
 */
async function findConfigPath(explicitPath) {
  for (const candidate of configLookupOrder(explicitPath)) {
    const absolute = resolve(candidate);
    try {
      await access(absolute);
    } catch {
      // Missing or inaccessible: fall through to the next location.
      continue;
    }
    return absolute;
  }
  const tried = configLookupOrder(explicitPath).join(", ");
  throw new Error(`No config file found. Tried: ${tried}`);
}
126
/**
 * Locates, reads and validates the configuration file.
 *
 * @param {string} [explicitPath] - Optional path tried before the defaults.
 * @returns {Promise<{path: string, config: object}>} The file that was used
 *   and the value returned by `parseConfig` for its contents.
 */
async function loadConfig(explicitPath) {
  const configPath = await findConfigPath(explicitPath);
  const yamlText = await readFile(configPath, "utf8");
  const config = parseConfig(yamlText);
  return { path: configPath, config };
}
131
/**
 * Parses YAML text into a validated configuration object.
 *
 * Beyond the schema, two cross-field rules are enforced: HTTPS needs both a
 * certificate and a key path, and every model name used in `routes` must be
 * declared in `models`.
 *
 * @param {string} raw - YAML document text.
 * @returns {object} The schema-parsed configuration.
 * @throws {Error} On schema failure or when either cross-field rule fails.
 */
function parseConfig(raw) {
  const config = appConfigSchema.parse(YAML.parse(raw));

  const { https } = config.server;
  if (https.enabled && !(https.certPath && https.keyPath)) {
    throw new Error("server.https.certPath and server.https.keyPath are required when HTTPS is enabled");
  }

  const declared = new Set(config.models.map((model) => model.name));
  const unknownRefs = [];
  for (const [taskType, names] of Object.entries(config.routes)) {
    for (const name of names ?? []) {
      if (!declared.has(name)) {
        unknownRefs.push(`${taskType}:${name}`);
      }
    }
  }
  if (unknownRefs.length > 0) {
    throw new Error(`Routes reference unknown models: ${unknownRefs.join(", ")}`);
  }
  return config;
}
145
/**
 * Writes the built-in default configuration to `path`, creating parent
 * directories as needed, and never replaces an existing file.
 *
 * @param {string} path - Destination path (resolved against the cwd).
 * @returns {Promise<void>}
 * @throws {Error} "Refusing to overwrite existing config: ..." when the
 *   target already exists.
 */
async function writeDefaultConfig(path) {
  const target = resolve(path);
  const refusal = `Refusing to overwrite existing config: ${target}`;
  // Early check so an existing file is rejected before any directory is created.
  if (existsSync(target)) {
    throw new Error(refusal);
  }
  await mkdir(dirname(target), { recursive: true });
  try {
    // "wx" makes the create exclusive, so a file that appears between the
    // check above and this write is not clobbered.
    await writeFile(target, defaultConfigYaml, { encoding: "utf8", flag: "wx" });
  } catch (error) {
    if (error?.code === "EEXIST") {
      throw new Error(refusal, { cause: error });
    }
    throw error;
  }
}
153
+ var defaultConfigYaml = `server:
154
+ host: 127.0.0.1
155
+ port: 11435
156
+ basePath: /
157
+ requestBodyLimit: 4mb
158
+ https:
159
+ enabled: false
160
+ certPath:
161
+ keyPath:
162
+ caPath:
163
+ ollama:
164
+ baseUrl: http://127.0.0.1:11434
165
+ openAiCompatiblePath: /v1/chat/completions
166
+ nativeApiBasePath: /api
167
+ keepAlive: 5m
168
+ requestTimeoutMs: 120000
169
+ gpu:
170
+ provider: none
171
+ name: Local GPU
172
+ vramTotalMb: 0
173
+ vramSafetyReserveMb: 1024
174
+ maxGpuUtilizationPct: 95
175
+ requireGpuOnlyByDefault: false
176
+ monitor:
177
+ enabled: false
178
+ intervalMs: 5000
179
+ nvidiaSmiPath: nvidia-smi
180
+ router:
181
+ defaultMode: auto
182
+ syncMaxQueueTimeMs: 250
183
+ heavyLoadQueueDepth: 4
184
+ heavyLoadGpuFreeMbThreshold: 2048
185
+ defaultTaskType: unknown
186
+ classification:
187
+ mode: heuristic
188
+ optionalClassifierModel:
189
+ classifierTimeoutMs: 1500
190
+ jobs:
191
+ store: memory
192
+ resultTtlSeconds: 86400
193
+ maxAttempts: 2
194
+ cleanupIntervalMs: 60000
195
+ models:
196
+ - name: llama3.2:3b
197
+ sizeGb: 2.0
198
+ purpose: [simple_chat, summarize, triage]
199
+ priority: 50
200
+ maxConcurrent: 1
201
+ defaultContext: 4096
202
+ maxContext: 8192
203
+ timeoutMs: 120000
204
+ costClass: low
205
+ exclusive: false
206
+ allowWhenBusy: true
207
+ tags: [general]
208
+ routes:
209
+ triage: [llama3.2:3b]
210
+ simple_chat: [llama3.2:3b]
211
+ summarize: [llama3.2:3b]
212
+ code_generate: [llama3.2:3b]
213
+ code_review: [llama3.2:3b]
214
+ code_fix: [llama3.2:3b]
215
+ agentic_reasoning: [llama3.2:3b]
216
+ large_context: [llama3.2:3b]
217
+ tool_use: [llama3.2:3b]
218
+ unknown: [llama3.2:3b]
219
+ queue:
220
+ globalMaxConcurrent: 2
221
+ globalMaxQueued: 100
222
+ perUserMaxQueued: 20
223
+ defaultPriority: normal
224
+ timeoutMs: 120000
225
+ `;
226
+
227
+ // src/configurator.ts
228
+ import { access as access2, mkdir as mkdir2, readFile as readFile2, writeFile as writeFile2 } from "fs/promises";
229
+ import { constants } from "fs";
230
+ import { dirname as dirname2, resolve as resolve2 } from "path";
231
+ import { createInterface } from "readline/promises";
232
+ import { stdin as input, stdout as output } from "process";
233
+ import { execFile as execFile3 } from "child_process";
234
+ import { promisify as promisify3 } from "util";
235
+ import os from "os";
236
+ import YAML2 from "yaml";
237
+ import { z as z2 } from "zod";
238
+
239
+ // src/gpu.ts
240
+ import { execFile } from "child_process";
241
+ import { promisify } from "util";
242
+ var execFileAsync = promisify(execFile);
243
/**
 * GPU monitor that reports values taken straight from configuration
 * instead of querying any hardware.
 */
var StaticGpuMonitor = class {
  config;
  /**
   * @param {object} config - GPU configuration (`provider`, `name`, `vramTotalMb`).
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @returns {Promise<object|undefined>} `undefined` when the provider is
   *   "none"; otherwise a snapshot that treats all configured VRAM as free
   *   and utilization as zero.
   */
  async snapshot() {
    const { provider, name, vramTotalMb } = this.config;
    if (provider === "none") {
      return undefined;
    }
    return {
      name: name ?? "Configured GPU",
      vramTotalMb,
      vramUsedMb: 0,
      vramFreeMb: vramTotalMb,
      utilizationPct: 0
    };
  }
};
259
/**
 * GPU monitor backed by `nvidia-smi`. Falls back to the static,
 * configuration-derived snapshot when live monitoring does not apply.
 */
var NvidiaGpuMonitor = class {
  config;
  commandRunner;
  /**
   * @param {object} config - GPU configuration including `monitor`.
   * @param {Function} [commandRunner] - `(command, args) => Promise<{stdout}>`.
   */
  constructor(config, commandRunner = defaultCommandRunner) {
    this.config = config;
    this.commandRunner = commandRunner;
  }
  /**
   * @returns {Promise<object|undefined>} The first GPU parsed from the
   *   nvidia-smi output (undefined when nothing parses), or the static
   *   snapshot when monitoring is disabled or the provider is not "nvidia".
   */
  async snapshot() {
    const { monitor, provider } = this.config;
    const useLiveQuery = monitor.enabled && provider === "nvidia";
    if (!useLiveQuery) {
      return new StaticGpuMonitor(this.config).snapshot();
    }
    const queryArgs = [
      "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
      "--format=csv,noheader,nounits"
    ];
    const result = await this.commandRunner(monitor.nvidiaSmiPath, queryArgs);
    const gpus = parseNvidiaSmi(result.stdout);
    return gpus[0];
  }
};
278
/**
 * Parses `nvidia-smi --format=csv,noheader,nounits` output with the columns
 * name, memory.total, memory.used, memory.free, utilization.gpu.
 *
 * Rows without a name or without a finite total-memory value are dropped;
 * the other numeric fields are converted with `Number` and not validated.
 *
 * @param {string} output2 - Raw command output.
 * @returns {Array<object>} One entry per accepted row.
 */
function parseNvidiaSmi(output2) {
  const gpus = [];
  for (const rawLine of output2.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!line) {
      continue;
    }
    const [name, total, used, free, utilization] = line.split(",").map((field) => field.trim());
    const vramTotalMb = Number(total);
    if (!name || !Number.isFinite(vramTotalMb)) {
      continue;
    }
    gpus.push({
      name,
      vramTotalMb,
      vramUsedMb: Number(used),
      vramFreeMb: Number(free),
      utilizationPct: Number(utilization)
    });
  }
  return gpus;
}
290
/**
 * Runs a command without a shell and returns its standard output.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Argument vector.
 * @returns {Promise<{stdout: string}>} Only stdout is kept.
 */
async function defaultCommandRunner(command, args) {
  // 5000 ms cap on the child process.
  const result = await execFileAsync(command, args, { timeout: 5000 });
  return { stdout: result.stdout };
}
294
+
295
+ // src/ollama.ts
296
+ import { execFile as execFile2 } from "child_process";
297
+ import { promisify as promisify2 } from "util";
298
+ var execFileAsync2 = promisify2(execFile2);
299
/**
 * Minimal Ollama client: chat completions and model tags over HTTP, plus
 * loaded-model listing through the `ollama ps` command.
 */
var HttpOllamaClient = class {
  config;
  commandRunner;
  /**
   * @param {object} config - Ollama settings (`baseUrl`, `openAiCompatiblePath`,
   *   `nativeApiBasePath`, `requestTimeoutMs`).
   * @param {Function} [commandRunner] - `(command, args) => Promise<{stdout}>`.
   */
  constructor(config, commandRunner = defaultCommandRunner2) {
    this.config = config;
    this.commandRunner = commandRunner;
  }
  /**
   * Sends a chat request to the OpenAI-compatible endpoint.
   *
   * @param {object} request - Request body; its `router` key is stripped.
   * @param {string} model - Model name written into the outgoing body.
   * @param {number} [timeoutMs] - Abort deadline; defaults to `requestTimeoutMs`.
   * @returns {Promise<object>} Parsed response payload.
   * @throws {OllamaHttpError} On a non-2xx response.
   */
  async chat(request, model, timeoutMs = this.config.requestTimeoutMs) {
    const url = new URL(this.config.openAiCompatiblePath, this.config.baseUrl);
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    const body = { ...request, model };
    delete body.router;
    // Default to a non-streaming response unless the caller chose explicitly.
    if (!("stream" in body)) body.stream = false;
    try {
      const response = await fetch(url, {
        method: "POST",
        headers: { "content-type": "application/json" },
        body: JSON.stringify(body),
        signal: controller.signal
      });
      const payload = await safeJson(response);
      if (!response.ok) {
        throw new OllamaHttpError(response.status, payload);
      }
      return payload;
    } finally {
      clearTimeout(timer);
    }
  }
  /**
   * Fetches `<nativeApiBasePath>/tags`.
   *
   * The request is aborted after `requestTimeoutMs` so an unresponsive
   * server cannot block the caller indefinitely (mirrors `chat`).
   *
   * @returns {Promise<object>} Parsed response payload.
   * @throws {OllamaHttpError} On a non-2xx response.
   */
  async tags() {
    const url = new URL(`${this.config.nativeApiBasePath}/tags`, this.config.baseUrl);
    const controller = new AbortController();
    const timeoutMs = this.config.requestTimeoutMs;
    // Only arm the deadline for a usable value; an undefined delay would
    // otherwise fire almost immediately.
    const timer = Number.isFinite(timeoutMs) && timeoutMs > 0
      ? setTimeout(() => controller.abort(), timeoutMs)
      : undefined;
    try {
      const response = await fetch(url, { signal: controller.signal });
      const payload = await safeJson(response);
      if (!response.ok) throw new OllamaHttpError(response.status, payload);
      return payload;
    } finally {
      clearTimeout(timer);
    }
  }
  /**
   * Lists loaded models via `ollama ps`.
   *
   * @returns {Promise<Array<object>>} Parsed rows, or an empty list when the
   *   command fails (best effort).
   */
  async ps() {
    try {
      const { stdout } = await this.commandRunner("ollama", ["ps"]);
      return parseOllamaPs(stdout);
    } catch {
      return [];
    }
  }
};
344
/**
 * Error raised for a non-successful HTTP response from Ollama. Carries the
 * status code and the parsed response payload.
 */
var OllamaHttpError = class extends Error {
  statusCode;
  payload;
  /**
   * @param {number} statusCode - HTTP status of the failed response.
   * @param {*} payload - Response body as returned by the JSON helper.
   */
  constructor(statusCode, payload) {
    const message = `Ollama HTTP request failed with status ${statusCode}`;
    super(message);
    this.statusCode = statusCode;
    this.payload = payload;
  }
};
353
/**
 * Parses the table printed by `ollama ps`.
 *
 * The first non-empty line is treated as the header and skipped. Each
 * remaining row is split on runs of two or more spaces; rows that do not
 * yield five columns are retried with a single-space-tolerant pattern, and
 * anything else is reduced to its first column as the name.
 *
 * @param {string} output2 - Raw command output.
 * @returns {Array<object>} Rows with `name` and, when recognised, `id`,
 *   `size`, `processor` and `until`.
 */
function parseOllamaPs(output2) {
  const [, ...rows] = output2.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
  return rows.map((row) => {
    const columns = row.split(/\s{2,}/).filter(Boolean);
    if (columns.length >= 5) {
      const [name, id, size, processor, ...untilParts] = columns;
      return { name, id, size, processor, until: untilParts.join(" ") };
    }
    const match = row.match(/^(\S+)\s+(\S+)\s+(.+?)\s+((?:\d+%\s+)?(?:GPU|CPU)(?:\/GPU)?)\s+(.+)$/i);
    if (match) {
      const [, name, id, size, processor, until] = match;
      return {
        name,
        id,
        size: size.trim(),
        processor: processor.trim(),
        until: until.trim()
      };
    }
    return { name: columns[0] ?? row };
  });
}
380
/**
 * Reads a response body and decodes it as JSON without ever throwing on
 * malformed content.
 *
 * @param {{text: Function}} response - Object exposing an async `text()`.
 * @returns {Promise<*>} `{}` for an empty body, the parsed JSON value when
 *   valid, or `{ raw: <text> }` when the body is not JSON.
 */
async function safeJson(response) {
  const bodyText = await response.text();
  let decoded = {};
  if (bodyText) {
    try {
      decoded = JSON.parse(bodyText);
    } catch {
      decoded = { raw: bodyText };
    }
  }
  return decoded;
}
389
/**
 * Runs a command without a shell and returns its standard output.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Argument vector.
 * @returns {Promise<{stdout: string}>} Only stdout is kept.
 */
async function defaultCommandRunner2(command, args) {
  // 5000 ms cap on the child process.
  const result = await execFileAsync2(command, args, { timeout: 5000 });
  return { stdout: result.stdout };
}
393
+
394
+ // src/configurator.ts
395
+ var execFileAsync3 = promisify3(execFile3);
396
// Shape of the optional "answers" file used to pre-seed configuration
// generation. Every section is optional, and most are free-form records
// whose contents are validated later against the full config schema.
var answersSchema = z2.object({
  server: z2.record(z2.unknown()).optional(),
  ollama: z2.record(z2.unknown()).optional(),
  gpu: z2.record(z2.unknown()).optional(),
  router: z2.record(z2.unknown()).optional(),
  jobs: z2.record(z2.unknown()).optional(),
  queue: z2.record(z2.unknown()).optional(),
  // `mode: "manual"` makes buildModels ignore detected models.
  models: z2.object({
    mode: z2.enum(["detected", "manual"]).optional(),
    items: z2.array(z2.record(z2.unknown())).optional()
  }).optional(),
  routes: z2.record(z2.array(z2.string())).optional()
});
409
// Task types that ensureCoreRoutes guarantees a route for in every
// generated config. The entries match the `taskTypes` list one for one.
var coreTaskTypes = [
  "triage",
  "simple_chat",
  "summarize",
  "code_generate",
  "code_review",
  "code_fix",
  "agentic_reasoning",
  "large_context",
  "tool_use",
  "unknown"
];
421
/**
 * Entry point of the `configure` flow: detect the environment, build a
 * config (interactively or not), validate it, and write it out.
 *
 * @param {object} options - Recognised keys: `detectOnly`, `answersPath`,
 *   `nonInteractive`, `dryRun`, `overwrite`, `outputPath`; the object is also
 *   forwarded to detection, prompting and `emit`.
 * @returns {Promise<void>}
 * @throws {Error} "Refusing to overwrite existing config: ..." when the
 *   output file exists and `overwrite` is not set; validation errors from
 *   `parseConfig`.
 */
async function runConfigure(options) {
  const detection = await detectEnvironment(options);
  if (options.detectOnly) {
    emit(options, `${formatDetectionSummary(detection)}\n`);
    return;
  }
  const answers = options.answersPath ? await loadAnswers(options.answersPath) : {};
  const config = options.nonInteractive
    ? generateConfigFromDetection(detection, answers)
    : await promptForConfig(detection, answers, options);
  const yaml = serializeConfig(config);
  // Validate exactly the text that will be written.
  parseConfig(yaml);
  emit(options, `${formatConfigSummary(config, detection, options.outputPath)}\n`);
  if (options.dryRun) {
    emit(options, `${yaml}\n`);
    return;
  }
  const refusal = `Refusing to overwrite existing config: ${options.outputPath}`;
  if (!options.overwrite && await fileExists(options.outputPath)) {
    throw new Error(refusal);
  }
  await mkdir2(dirname2(resolve2(options.outputPath)), { recursive: true });
  try {
    // Without `overwrite`, create exclusively ("wx") so a file that appears
    // after the existence check above is never replaced.
    await writeFile2(options.outputPath, yaml, {
      encoding: "utf8",
      flag: options.overwrite ? "w" : "wx"
    });
  } catch (error) {
    if (!options.overwrite && error?.code === "EEXIST") {
      throw new Error(refusal, { cause: error });
    }
    throw error;
  }
  emit(options, `Wrote ${options.outputPath}\n`);
}
447
/**
 * Probes the local machine for Ollama, its models and GPU details.
 *
 * Probes that do not depend on each other run concurrently: first the two
 * executable lookups, then everything that needs their results.
 *
 * @param {object} [options] - Optional overrides used for injection:
 *   `platform`, `arch`, `env`, `commandRunner`, `pathLookup`, `fetchImpl`,
 *   plus whatever `detectMachine` reads.
 * @returns {Promise<object>} Detection results keyed by `ollamaBinary`,
 *   `ollamaBaseUrl`, `ollamaReachable`, `ollamaModels`, `loadedModels`,
 *   `nvidiaSmiPath`, `gpu` and `machine`.
 */
async function detectEnvironment(options = {}) {
  const platform = options.platform ?? process.platform;
  const arch = options.arch ?? process.arch;
  const env = options.env ?? process.env;
  const commandRunner = options.commandRunner ?? defaultCommandRunner3;
  const pathLookup = options.pathLookup ?? findExecutable;

  // Synchronous pieces first.
  const machine = detectMachine(platform, arch, options);
  const ollamaBaseUrl = detectOllamaBaseUrl(env);

  // Stage 1: locate the executables (independent of each other).
  const [ollamaBinary, nvidiaSmiPath] = await Promise.all([
    detectOllamaBinary(platform, env, pathLookup),
    platform === "darwin"
      ? notFound("nvidia-smi is normally unavailable on macOS")
      : detectNvidiaSmi(platform, env, pathLookup)
  ]);

  // Stage 2: everything that only needs the stage-1 results.
  const [ollamaReachable, ollamaModels, loadedModels, gpu] = await Promise.all([
    detectOllamaReachable(ollamaBaseUrl.value, options.fetchImpl ?? fetch),
    detectOllamaModels(ollamaBinary.value, commandRunner),
    detectLoadedModels(ollamaBinary.value, commandRunner),
    detectGpu(platform, arch, nvidiaSmiPath.value, commandRunner)
  ]);

  return {
    ollamaBinary,
    ollamaBaseUrl,
    ollamaReachable,
    ollamaModels,
    loadedModels,
    nvidiaSmiPath,
    gpu,
    machine
  };
}
472
/**
 * Builds a complete, validated configuration from detection results and
 * optional user answers. Answers override generated defaults section by
 * section.
 *
 * Nested objects are merged key by key where a partial answer is expected:
 * `server.https`, `gpu.monitor` (via mergeGpu) and `router.classification`.
 * A partial classification answer such as `{ mode: "model" }` therefore
 * keeps the generated `classifierTimeoutMs` instead of dropping it.
 *
 * @param {object} detection - Result of environment detection.
 * @param {object} [answers] - Optional overrides (see the answers schema).
 * @returns {object} Config as returned by `parseConfig` after a
 *   serialize/parse round trip.
 * @throws {Error} When no model is detected or provided, or when the
 *   generated config fails validation.
 */
function generateConfigFromDetection(detection, answers = {}) {
  const machine = detection.machine.value ?? detectMachine(process.platform, process.arch, {}).value;
  const gpu = mergeGpu(detection.gpu.value ?? defaultGpuForPlatform(machine.platform, machine.arch), answers.gpu);
  const cpuOnly = gpu.provider === "none";
  const detectedModels = detection.ollamaModels.value ?? [];
  const models = buildModels(detectedModels, answers.models, cpuOnly);
  if (models.length === 0) {
    throw new Error("No models detected or provided. Add at least one model to generate a config.");
  }
  const routes = ensureCoreRoutes({ ...generateRoutes(models), ...(answers.routes ?? {}) }, models);
  const queue = {
    ...defaultQueue(machine, models, cpuOnly),
    ...(answers.queue ?? {})
  };

  // `https` may be given as a bare boolean or as a (partial) object.
  const httpsAnswer = answers.server?.https;
  const serverHttps = typeof httpsAnswer === "boolean"
    ? { enabled: httpsAnswer }
    : { enabled: false, ...(httpsAnswer ?? {}) };

  const { classification: classificationAnswer, ...routerAnswers } = answers.router ?? {};
  const classificationDefaults = {
    mode: "heuristic",
    optionalClassifierModel: models.find((model) => model.costClass === "low")?.name,
    classifierTimeoutMs: 1500
  };
  const isPlainRecord = typeof classificationAnswer === "object"
    && classificationAnswer !== null
    && !Array.isArray(classificationAnswer);
  let classification = classificationDefaults;
  if (isPlainRecord) {
    classification = { ...classificationDefaults, ...classificationAnswer };
  } else if (classificationAnswer !== undefined) {
    // Not mergeable: pass it through so schema validation reports it.
    classification = classificationAnswer;
  }

  const config = {
    server: {
      host: "127.0.0.1",
      port: 11435,
      basePath: "/",
      requestBodyLimit: "8mb",
      https: serverHttps,
      ...omit(answers.server ?? {}, ["https"])
    },
    ollama: {
      baseUrl: detection.ollamaBaseUrl.value ?? "http://127.0.0.1:11434",
      openAiCompatiblePath: "/v1/chat/completions",
      nativeApiBasePath: "/api",
      keepAlive: "10m",
      requestTimeoutMs: 18e4,
      ...(answers.ollama ?? {})
    },
    gpu,
    router: {
      defaultMode: "auto",
      syncMaxQueueTimeMs: cpuOnly ? 100 : 250,
      heavyLoadQueueDepth: cpuOnly ? 1 : models.some((model) => model.exclusive) ? 3 : 4,
      heavyLoadGpuFreeMbThreshold: gpu.provider === "nvidia" ? Math.max(2048, gpu.vramSafetyReserveMb * 2) : 1024,
      defaultTaskType: "unknown",
      classification,
      ...routerAnswers
    },
    jobs: {
      store: "memory",
      resultTtlSeconds: 86400,
      maxAttempts: 2,
      cleanupIntervalMs: 6e4,
      ...(answers.jobs ?? {})
    },
    models,
    routes,
    queue
  };
  // Round-trip through YAML so the returned object is exactly what a later
  // load of the written file would produce.
  return parseConfig(serializeConfig(config));
}
532
/**
 * Parses the table printed by `ollama list`.
 *
 * The first non-empty line is treated as the header. Rows are split on runs
 * of two or more spaces when that yields at least three columns; otherwise
 * they are split on any whitespace and the size is reassembled from the
 * third and fourth tokens.
 *
 * @param {string} outputText - Raw command output.
 * @returns {Array<object>} Rows with `name`, `id`, `size`, `sizeGb` and
 *   `modified` (undefined when empty).
 */
function parseOllamaList(outputText) {
  const [, ...rows] = outputText.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
  return rows.map((row) => {
    const wideColumns = row.split(/\s{2,}/).filter(Boolean);
    let columns;
    let size;
    let modified;
    if (wideColumns.length >= 3) {
      columns = wideColumns;
      size = columns[2];
      modified = columns[3];
    } else {
      columns = row.split(/\s+/).filter(Boolean);
      // e.g. "4.7" and "GB" arrive as separate tokens here.
      size = [columns[2], columns[3]].filter(Boolean).join(" ");
      modified = columns.slice(4).join(" ");
    }
    const [name, id] = columns;
    return {
      name: name ?? row,
      id,
      size,
      sizeGb: parseSizeGb(size),
      modified: modified || undefined
    };
  });
}
550
/**
 * Guesses a role for a model from its name and size. Checks run in a fixed
 * order and the first match wins: review, code, heavy, tool, then fast.
 *
 * @param {{name: string, sizeGb?: number}} model - Model to classify.
 * @returns {"review"|"code"|"heavy"|"tool"|"fast"} Inferred role.
 */
function inferModelRole(model) {
  const lowered = model.name.toLowerCase();
  const mentions = (...needles) => needles.some((needle) => lowered.includes(needle));

  if (mentions("review")) return "review";
  if (mentions("coder", "code", "deepseek", "qwen")) return "code";
  const isLarge = (model.sizeGb ?? 0) >= 12;
  if (isLarge || mentions("gpt-oss", "reason")) return "heavy";
  if (mentions("tool")) return "tool";
  return "fast";
}
558
/**
 * Serializes a config object to YAML text.
 *
 * @param {object} config - Configuration to serialize.
 * @returns {string} YAML document produced with `lineWidth: 0`.
 */
function serializeConfig(config) {
  const stringifyOptions = { lineWidth: 0 };
  return YAML2.stringify(config, stringifyOptions);
}
561
/**
 * Renders detection results as a multi-line, human-readable report.
 * Missing machine values fall back to live `process`/`os` readings.
 *
 * @param {object} detection - Result of environment detection.
 * @returns {string} Report lines joined with "\n" (no trailing newline).
 */
function formatDetectionSummary(detection) {
  const { ollamaBinary, ollamaBaseUrl, ollamaReachable, ollamaModels } = detection;
  const machine = detection.machine.value;
  const gpu = detection.gpu.value;

  const reachable = ollamaReachable.value === true ? "yes" : "no";
  const monitor = gpu?.monitor.enabled ? `enabled through ${gpu.monitor.nvidiaSmiPath}` : "disabled";

  const ollamaSection = [
    "Ollama:",
    ` binary: ${ollamaBinary.value ?? "not found"} (${ollamaBinary.source}, ${ollamaBinary.confidence})`,
    ` base URL: ${ollamaBaseUrl.value ?? "not detected"} (${ollamaBaseUrl.source})`,
    ` reachable: ${reachable}`,
    ` models: ${ollamaModels.value?.length ?? 0} found`
  ];
  const gpuSection = [
    "GPU:",
    ` provider: ${gpu?.provider ?? "none"}`,
    ` name: ${gpu?.name ?? "not detected"}`,
    ` VRAM: ${gpu?.vramTotalMb ?? 0} MB`,
    ` monitor: ${monitor}`
  ];
  const machineSection = [
    "Machine:",
    ` OS: ${machine?.platform ?? process.platform} ${machine?.arch ?? process.arch}`,
    ` CPU cores: ${machine?.cpuCores ?? os.cpus().length}`,
    ` RAM: ${machine?.totalMemoryMb ?? Math.round(os.totalmem() / 1024 / 1024)} MB`
  ];

  return ["Detected environment", "", ...ollamaSection, "", ...gpuSection, "", ...machineSection].join("\n");
}
585
/**
 * Renders a short report describing a generated configuration and the
 * detection data it was based on.
 *
 * @param {object} config - Validated configuration.
 * @param {object} detection - Result of environment detection.
 * @param {string} outputPath - Path shown on the "Output" line.
 * @returns {string} Report lines joined with "\n"; starts and ends with an
 *   empty line.
 */
function formatConfigSummary(config, detection, outputPath) {
  const { server, ollama, gpu, models, queue, jobs } = config;
  const scheme = server.https.enabled ? "https" : "http";
  const gpuName = gpu.name ? `, ${gpu.name}` : "";
  const heavyModel = models.find((model) => model.exclusive)?.name ?? "none";
  const detectedMachine = detection.machine.value;

  const lines = [];
  lines.push("", "Configuration summary", "");
  lines.push(`Output: ${outputPath}`);
  lines.push(`Server: ${scheme}://${server.host}:${server.port}${server.basePath}`);
  lines.push(`Ollama: ${ollama.baseUrl}`);
  lines.push(`GPU: ${gpu.provider}${gpuName}, ${gpu.vramTotalMb} MB VRAM`);
  lines.push(`Models: ${models.length} configured`);
  lines.push(`Heavy model: ${heavyModel}`);
  lines.push(`Queue: global concurrency ${queue.globalMaxConcurrent}, max queued ${queue.globalMaxQueued}`);
  lines.push(`Jobs: ${jobs.store} store, result TTL ${jobs.resultTtlSeconds}s`);
  lines.push("", "Detected:");
  lines.push(` Ollama models: ${detection.ollamaModels.value?.length ?? 0}`);
  lines.push(` Machine: ${detectedMachine?.platform ?? process.platform} ${detectedMachine?.arch ?? process.arch}`);
  lines.push("");
  return lines.join("\n");
}
607
/**
 * Interactive path of the configure flow: show what was detected, let the
 * user accept or correct it, then ask for confirmation before returning.
 *
 * @param {object} detection - Result of environment detection.
 * @param {object} answers - Pre-seeded answers.
 * @param {object} options - `assumeYes` skips both confirmation prompts.
 * @returns {Promise<object>} The generated configuration.
 * @throws {Error} "Configuration cancelled" when the user declines.
 */
async function promptForConfig(detection, answers, options) {
  const rl = createInterface({ input, output });
  try {
    output.write(`${formatDetectionSummary(detection)}\n\n`);

    const acceptDetected = options.assumeYes || await confirm(rl, "Use these detected values?", true);
    const effectiveAnswers = acceptDetected
      ? answers
      : await promptCorrections(rl, detection, answers);

    const config = generateConfigFromDetection(detection, effectiveAnswers);

    if (!options.assumeYes) {
      const approved = await confirm(rl, "Write this config?", true);
      if (!approved) {
        throw new Error("Configuration cancelled");
      }
    }
    return config;
  } finally {
    // Always release the terminal, including on cancellation.
    rl.close();
  }
}
627
/**
 * Asks the user to correct the main detected values and folds the replies
 * into the answers object.
 *
 * The "first model" questions are asked only when no model was detected
 * AND the answers file supplies no `models` section, so the user is never
 * asked for input that would be discarded.
 *
 * @param {object} rl - Readline interface passed through to `ask`.
 * @param {object} detection - Result of environment detection.
 * @param {object} answers - Pre-seeded answers; not mutated.
 * @returns {Promise<object>} New answers with `server`, `ollama`, `gpu` and
 *   `models` updated.
 */
async function promptCorrections(rl, detection, answers) {
  const baseUrl = await ask(rl, "Ollama base URL", detection.ollamaBaseUrl.value ?? "http://127.0.0.1:11434");
  const host = await ask(rl, "Server host", "127.0.0.1");
  const port = Number(await ask(rl, "Server port", "11435"));
  const basePath = await ask(rl, "Server base path", "/");

  // Prefer the detected (possibly injected) machine over the host process so
  // the default matches what config generation uses.
  const machine = detection.machine?.value;
  const gpu = detection.gpu.value
    ?? defaultGpuForPlatform(machine?.platform ?? process.platform, machine?.arch ?? process.arch);
  const vramTotalMb = Number(await ask(rl, "GPU VRAM total MB", String(gpu.vramTotalMb)));

  let models = answers.models;
  const needsManualModel = models === undefined && !detection.ollamaModels.value?.length;
  if (needsManualModel) {
    const name = await ask(rl, "First Ollama model name", "llama3.2:3b");
    const sizeGb = Number(await ask(rl, "First model size GB", "2"));
    models = { mode: "manual", items: [{ name, role: "fast", sizeGb }] };
  }

  return {
    ...answers,
    server: { ...(answers.server ?? {}), host, port, basePath },
    ollama: { ...(answers.ollama ?? {}), baseUrl },
    gpu: { ...(answers.gpu ?? {}), vramTotalMb },
    models
  };
}
652
/**
 * Reads and validates an answers file.
 *
 * @param {string} path - Location of the YAML answers file.
 * @returns {Promise<object>} Answers as parsed by `answersSchema`.
 */
async function loadAnswers(path) {
  const yamlText = await readFile2(path, "utf8");
  const document = YAML2.parse(yamlText);
  return answersSchema.parse(document);
}
657
/**
 * Locates the `ollama` executable: first through the injected path lookup,
 * then by probing a platform-specific list of well-known locations.
 *
 * @param {string} platform - Platform identifier ("darwin" selects the macOS list).
 * @param {object} env - Environment passed to the lookup.
 * @param {Function} pathLookup - `(name, platform, env) => Promise<string|undefined>`.
 * @returns {Promise<object>} Detection result; "high" confidence for a
 *   lookup hit, "medium" for a well-known location, otherwise not-found.
 */
async function detectOllamaBinary(platform, env, pathLookup) {
  const onPath = await pathLookup("ollama", platform, env);
  if (onPath) {
    return { value: onPath, source: "command", confidence: "high" };
  }

  const macLocations = ["/opt/homebrew/bin/ollama", "/usr/local/bin/ollama", "/Applications/Ollama.app/Contents/Resources/ollama"];
  const otherLocations = ["/usr/bin/ollama", "/usr/local/bin/ollama", "/snap/bin/ollama"];
  const wellKnown = platform === "darwin" ? macLocations : otherLocations;

  for (const location of wellKnown) {
    const usable = await executableExists(location);
    if (usable) {
      return { value: location, source: "command", confidence: "medium" };
    }
  }
  return notFound("ollama binary not found");
}
666
/**
 * Determines the Ollama base URL from the environment.
 *
 * @param {object} env - Environment variables; `OLLAMA_HOST` is consulted.
 * @returns {object} Detection result: the normalized `OLLAMA_HOST` when
 *   set, otherwise the default local address.
 */
function detectOllamaBaseUrl(env) {
  const configuredHost = env.OLLAMA_HOST;
  if (!configuredHost) {
    return { value: "http://127.0.0.1:11434", source: "default", confidence: "medium" };
  }
  return { value: normalizeOllamaHost(configuredHost), source: "env", confidence: "high" };
}
670
/**
 * Checks whether the Ollama HTTP API answers at `baseUrl` by requesting
 * `/api/tags` with a 700 ms deadline.
 *
 * @param {string|undefined} baseUrl - Base URL to probe.
 * @param {Function} fetchImpl - fetch-compatible function.
 * @returns {Promise<object>} Detection result whose `value` is the
 *   response's `ok` flag, or `false` when the request fails or times out.
 */
async function detectOllamaReachable(baseUrl, fetchImpl) {
  if (!baseUrl) {
    return notFound("ollama base URL is unknown");
  }
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), 700);
  try {
    const { ok } = await fetchImpl(new URL("/api/tags", baseUrl), { signal: abort.signal });
    return { value: ok, source: "command", confidence: ok ? "high" : "low" };
  } catch {
    return { value: false, source: "not_found", confidence: "low", message: "Ollama API did not respond" };
  } finally {
    clearTimeout(deadline);
  }
}
683
/**
 * Lists installed models by running `<ollamaBinary> list`.
 *
 * @param {string|undefined} ollamaBinary - Path of the executable.
 * @param {Function} commandRunner - `(command, args) => Promise<{stdout}>`.
 * @returns {Promise<object>} Detection result holding the parsed rows, an
 *   empty list when the command fails, or not-found without a binary.
 */
async function detectOllamaModels(ollamaBinary, commandRunner) {
  if (!ollamaBinary) {
    return notFound("ollama binary not found");
  }
  try {
    const result = await commandRunner(ollamaBinary, ["list"]);
    const models = parseOllamaList(result.stdout);
    return { value: models, source: "command", confidence: "high" };
  } catch {
    return { value: [], source: "not_found", confidence: "low", message: "ollama list failed" };
  }
}
692
/**
 * Lists currently loaded models by running `<ollamaBinary> ps`.
 *
 * @param {string|undefined} ollamaBinary - Path of the executable.
 * @param {Function} commandRunner - `(command, args) => Promise<{stdout}>`.
 * @returns {Promise<object>} Detection result holding the parsed rows, an
 *   empty list when the command fails, or not-found without a binary.
 */
async function detectLoadedModels(ollamaBinary, commandRunner) {
  if (!ollamaBinary) {
    return notFound("ollama binary not found");
  }
  try {
    const result = await commandRunner(ollamaBinary, ["ps"]);
    const loaded = parseOllamaPs(result.stdout);
    return { value: loaded, source: "command", confidence: "high" };
  } catch {
    return { value: [], source: "not_found", confidence: "low", message: "ollama ps failed" };
  }
}
701
/**
 * Locates the `nvidia-smi` executable through the injected path lookup.
 *
 * @param {string} platform - Platform identifier passed to the lookup.
 * @param {object} env - Environment passed to the lookup.
 * @param {Function} pathLookup - `(name, platform, env) => Promise<string|undefined>`.
 * @returns {Promise<object>} Detection result with the path, or not-found.
 */
async function detectNvidiaSmi(platform, env, pathLookup) {
  const located = await pathLookup("nvidia-smi", platform, env);
  if (!located) {
    return notFound("nvidia-smi not found");
  }
  return { value: located, source: "command", confidence: "high" };
}
705
/**
 * Builds the GPU section of a config from what the machine reports.
 *
 * macOS and machines without `nvidia-smi` get the platform default. When
 * `nvidia-smi` is available its first reported GPU is used; a failed or
 * empty query also falls back to the platform default.
 *
 * @param {string} platform - Platform identifier.
 * @param {string} arch - CPU architecture.
 * @param {string|undefined} nvidiaSmiPath - Path of `nvidia-smi`, if found.
 * @param {Function} commandRunner - `(command, args) => Promise<{stdout}>`.
 * @returns {Promise<object>} Detection result wrapping a GPU config.
 */
async function detectGpu(platform, arch, nvidiaSmiPath, commandRunner) {
  const platformDefault = (source, confidence) => ({
    value: defaultGpuForPlatform(platform, arch),
    source,
    confidence
  });

  if (platform === "darwin") {
    return platformDefault("default", arch === "arm64" ? "medium" : "low");
  }
  if (!nvidiaSmiPath) {
    return platformDefault("not_found", "medium");
  }

  try {
    const queryArgs = [
      "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
      "--format=csv,noheader,nounits"
    ];
    const result = await commandRunner(nvidiaSmiPath, queryArgs);
    const primary = parseNvidiaSmi(result.stdout)[0];
    if (!primary) {
      return platformDefault("not_found", "low");
    }
    const value = {
      provider: "nvidia",
      name: primary.name,
      vramTotalMb: primary.vramTotalMb,
      vramSafetyReserveMb: safetyReserveMb(primary.vramTotalMb),
      maxGpuUtilizationPct: 92,
      requireGpuOnlyByDefault: true,
      monitor: {
        enabled: true,
        intervalMs: 5000,
        nvidiaSmiPath
      }
    };
    return { value, source: "command", confidence: "high" };
  } catch {
    return platformDefault("not_found", "low");
  }
}
740
/**
 * Describes the host machine and sorts it into a size class.
 *
 * "small": under 16384 MB RAM or fewer than 4 cores. "large": over
 * 65536 MB RAM or more than 16 cores. Everything else is "medium"; the
 * "small" test takes precedence.
 *
 * @param {string} platform - Platform identifier, copied into the result.
 * @param {string} arch - CPU architecture, copied into the result.
 * @param {object} options - Optional `totalMemoryMb` / `cpuCores` overrides;
 *   missing values are read from `os`.
 * @returns {object} Detection result wrapping the machine description.
 */
function detectMachine(platform, arch, options) {
  const totalMemoryMb = options.totalMemoryMb ?? Math.round(os.totalmem() / 1024 / 1024);
  const cpuCores = options.cpuCores ?? os.cpus().length;

  let sizeClass = "medium";
  if (totalMemoryMb < 16384 || cpuCores < 4) {
    sizeClass = "small";
  } else if (totalMemoryMb > 65536 || cpuCores > 16) {
    sizeClass = "large";
  }

  return {
    value: { platform, arch, cpuCores, totalMemoryMb, class: sizeClass },
    source: "command",
    confidence: "high"
  };
}
755
/**
 * Produces model specs from either user-supplied items or detected models.
 *
 * User items always win when present. Without items, "manual" mode yields
 * nothing and any other mode uses the detected list. Entries without a
 * name are skipped; a missing size defaults to 2 (GB).
 *
 * @param {Array<object>} detectedModels - Models found on the machine.
 * @param {object} [answers] - Optional `{ mode, items }` from the answers file.
 * @param {boolean} [cpuOnly=false] - Forwarded to `buildModelSpec`.
 * @returns {Array<object>} One spec per usable entry.
 */
function buildModels(detectedModels, answers, cpuOnly = false) {
  const withoutItems = answers?.mode === "manual" ? [] : detectedModels;
  const candidates = answers?.items ?? withoutItems;

  const specs = [];
  for (const candidate of candidates) {
    if (!candidate.name) {
      continue;
    }
    const role = candidate.role ?? inferModelRole(candidate);
    const sizeGb = Number(candidate.sizeGb ?? 2);
    specs.push(buildModelSpec(candidate.name, role, sizeGb, cpuOnly));
  }
  return specs;
}
766
/**
 * Creates a full model spec from a name, a role and a size.
 *
 * Roles fall into three tiers that decide the numeric defaults: "heavy";
 * the code tier ("code", "review", "tool"); and everything else.
 *
 * @param {string} name - Model name.
 * @param {string} role - Role used to pick the tier, purposes and tags.
 * @param {number} sizeGb - Model size in GB, copied into the spec.
 * @param {boolean} cpuOnly - When truthy, concurrency is capped at 1.
 * @returns {object} Model spec in the shape of the `models` config entries.
 */
function buildModelSpec(name, role, sizeGb, cpuOnly) {
  const isHeavy = role === "heavy";
  const isCodeTier = ["code", "review", "tool"].includes(role);

  let tier;
  if (isHeavy) {
    tier = { priority: 95, defaultContext: 16384, maxContext: 65536, timeoutMs: 300000, costClass: "high" };
  } else if (isCodeTier) {
    tier = { priority: 70, defaultContext: 8192, maxContext: 32768, timeoutMs: 180000, costClass: "medium" };
  } else {
    tier = { priority: 50, defaultContext: 4096, maxContext: 8192, timeoutMs: 90000, costClass: "low" };
  }

  // Only "fast" models run two requests at once; CPU-only hosts never do.
  const unrestrictedConcurrency = role === "fast" ? 2 : 1;

  return {
    name,
    sizeGb,
    purpose: purposesForRole(role),
    priority: tier.priority,
    maxConcurrent: cpuOnly ? 1 : unrestrictedConcurrency,
    defaultContext: tier.defaultContext,
    maxContext: tier.maxContext,
    timeoutMs: tier.timeoutMs,
    costClass: tier.costClass,
    exclusive: isHeavy,
    allowWhenBusy: !isHeavy,
    tags: tagsForRole(role)
  };
}
785
/**
 * Returns the purpose list associated with a model role.
 *
 * @param {string} role - "code", "review", "heavy", "tool" or "fast";
 *   unrecognised roles are treated like "fast".
 * @returns {string[]} A fresh array of purpose identifiers.
 */
function purposesForRole(role) {
  // Built per call so every caller receives its own arrays.
  const purposesByRole = new Map([
    ["code", ["code_generate", "code_fix", "tool_use"]],
    ["review", ["code_review", "code_generate", "code_fix"]],
    ["heavy", ["agentic_reasoning", "large_context", "planning", "tool_use"]],
    ["tool", ["tool_use", "code_generate"]]
  ]);
  return purposesByRole.get(role) ?? ["triage", "simple_chat", "summarize"];
}
800
/**
 * Returns the tag list associated with a model role.
 *
 * @param {string} role - "code", "review", "heavy", "tool" or "fast";
 *   unrecognised roles are treated like "fast".
 * @returns {string[]} A fresh array of tags.
 */
function tagsForRole(role) {
  // Built per call so every caller receives its own arrays.
  const tagsByRole = new Map([
    ["code", ["code", "fallback"]],
    ["review", ["code", "review"]],
    ["heavy", ["reasoning", "large_context"]],
    ["tool", ["tool_use"]]
  ]);
  return tagsByRole.get(role) ?? ["fast", "chat"];
}
815
/**
 * Derives a route table (task type -> ordered model names) from model specs.
 *
 * Each task type takes the first non-empty list among its preferred groups,
 * ending with a catch-all fallback of fast models, then code models, then
 * every model. The fallback is de-duplicated (first occurrence kept) so no
 * model is listed twice in a route.
 *
 * @param {Array<object>} models - Model specs (`name`, `costClass`,
 *   `purpose`, `exclusive`).
 * @returns {object} Route lists for the ten built-in task types.
 */
function generateRoutes(models) {
  const namesWhere = (predicate) => models.filter(predicate).map((model) => model.name);

  const fast = namesWhere((model) => model.costClass === "low");
  const code = namesWhere((model) => model.purpose.includes("code_generate"));
  const review = namesWhere((model) => model.purpose.includes("code_review"));
  const heavy = namesWhere((model) => model.exclusive || model.costClass === "high");
  const tool = namesWhere((model) => model.purpose.includes("tool_use"));
  // A Set keeps insertion order, so preference order survives de-duplication.
  const fallback = [...new Set([...fast, ...code, ...models.map((model) => model.name)])];

  return {
    triage: firstNonEmpty(fast, fallback),
    simple_chat: firstNonEmpty(fast, fallback),
    summarize: firstNonEmpty(fast, fallback),
    code_generate: firstNonEmpty(code, fallback),
    code_review: firstNonEmpty(review, code, fallback),
    code_fix: firstNonEmpty(code, review, fallback),
    agentic_reasoning: firstNonEmpty(heavy, code, fallback),
    large_context: firstNonEmpty(heavy, code, fallback),
    tool_use: firstNonEmpty(tool, code, heavy, fallback),
    unknown: firstNonEmpty(fast, code, fallback)
  };
}
835
/**
 * Returns a route table restricted to the core task types, filling any missing
 * or empty route with a single-entry list holding the first model's name.
 *
 * @param {Record<string, string[]>} routes - Existing route table.
 * @param {Array<{name: string}>} models - Configured models; the first is the fallback.
 * @returns {Record<string, string[]>} Route table covering every core task type.
 */
function ensureCoreRoutes(routes, models) {
  const fallback = [models[0].name];
  const pairs = [];
  for (const taskType of coreTaskTypes) {
    const existing = routes[taskType];
    pairs.push([taskType, existing?.length ? existing : fallback]);
  }
  return Object.fromEntries(pairs);
}
839
/**
 * Computes default queue limits for a machine class and a set of models.
 *
 * @param {{class: string}} machine - Machine descriptor; `class` is compared with "small" and "large".
 * @param {Array<{maxConcurrent: number}>} models - Model specs.
 * @param {boolean} [cpuOnly=false] - Forces the most conservative limits.
 * @returns {object} Queue settings.
 */
function defaultQueue(machine, models, cpuOnly = false) {
  let modelCapacity = 0;
  for (const model of models) {
    modelCapacity += model.maxConcurrent;
  }
  // CPU-only hosts are treated like small machines.
  const constrained = cpuOnly || machine.class === "small";
  let suggested = 1;
  if (!constrained) {
    suggested = machine.class === "large" ? 4 : 3;
  }
  return {
    globalMaxConcurrent: Math.max(1, Math.min(modelCapacity, suggested)),
    globalMaxQueued: constrained ? 50 : 100,
    perUserMaxQueued: constrained ? 10 : 20,
    defaultPriority: "normal",
    timeoutMs: 18e4
  };
}
850
/**
 * Overlays GPU overrides on a base GPU config; the nested `monitor` object is
 * merged key by key instead of being replaced wholesale.
 *
 * @param {object} base - Base GPU config, including `monitor`.
 * @param {object} [override] - Optional partial overrides.
 * @returns {object} New merged config; neither input is mutated.
 */
function mergeGpu(base, override) {
  const topLevel = override ?? {};
  const monitorOverrides = override?.monitor ?? {};
  const monitor = { ...base.monitor, ...monitorOverrides };
  return { ...base, ...topLevel, monitor };
}
860
/**
 * Returns the GPU config used when no GPU was detected. Only the display name
 * depends on the platform and architecture.
 *
 * @param {string} platform - Platform identifier; "darwin" selects the macOS labels.
 * @param {string} arch - CPU architecture; "arm64" on macOS selects the Apple Silicon label.
 * @returns {object} GPU config with provider "none" and monitoring disabled.
 */
function defaultGpuForPlatform(platform, arch) {
  let name = "No NVIDIA GPU detected";
  if (platform === "darwin") {
    name = arch === "arm64" ? "Apple Silicon / macOS GPU" : "macOS GPU";
  }
  const monitor = {
    enabled: false,
    intervalMs: 5e3,
    nvidiaSmiPath: "nvidia-smi"
  };
  return {
    provider: "none",
    name,
    vramTotalMb: 0,
    vramSafetyReserveMb: 1024,
    maxGpuUtilizationPct: 95,
    requireGpuOnlyByDefault: false,
    monitor
  };
}
876
/**
 * Picks the VRAM safety reserve (MB) for a given total VRAM size (MB).
 *
 * @param {number} totalMb - Total VRAM in MB.
 * @returns {number} 1024 below 8 GB, 1536 up to and including 24 GB, otherwise 2048.
 */
function safetyReserveMb(totalMb) {
  return totalMb < 8192 ? 1024 : totalMb <= 24576 ? 1536 : 2048;
}
881
/**
 * Returns the first argument whose `length` is greater than zero.
 *
 * @param {...Array} lists - Candidate lists in priority order.
 * @returns {Array} The first non-empty list (same instance), or a new empty array.
 */
function firstNonEmpty(...lists) {
  for (const candidate of lists) {
    if (candidate.length > 0) return candidate;
  }
  return [];
}
884
/**
 * Parses a human-readable size such as "4.7 GB" or "512 MB" into gigabytes.
 * A missing unit is interpreted as GB.
 *
 * @param {string|undefined} size - Size text.
 * @returns {number|undefined} Size in GB, or undefined when nothing numeric can be parsed.
 */
function parseSizeGb(size) {
  if (!size) return void 0;
  const parsed = size.match(/([\d.]+)\s*([kmgt]i?b|[kmgt]b)?/i);
  if (parsed === null) return void 0;
  const [, rawValue, rawUnit = "GB"] = parsed;
  const amount = Number(rawValue);
  if (!Number.isFinite(amount)) return void 0;
  // Only the unit's first letter matters: k, m, g or t.
  switch (rawUnit.toLowerCase()[0]) {
    case "m":
      return amount / 1024;
    case "k":
      return amount / 1024 / 1024;
    case "t":
      return amount * 1024;
    default:
      return amount;
  }
}
896
/**
 * Ensures an Ollama host value carries a URL scheme, defaulting to http.
 *
 * @param {string} host - Host with or without an "http://" / "https://" prefix.
 * @returns {string} The host unchanged when it already has a scheme, otherwise prefixed with "http://".
 */
function normalizeOllamaHost(host) {
  const hasScheme = ["http://", "https://"].some((scheme) => host.startsWith(scheme));
  return hasScheme ? host : `http://${host}`;
}
900
/**
 * Searches the directories on PATH for an executable.
 *
 * On "win32" the PATH entries are separated by ";" and each PATHEXT suffix is
 * tried after the bare command name. Every other platform uses ":" and the
 * bare name only.
 *
 * @param {string} command - Executable name to look for.
 * @param {string} _platform - Platform identifier (e.g. process.platform).
 * @param {Record<string, string|undefined>} env - Environment holding PATH (and PATHEXT on Windows).
 * @returns {Promise<string|undefined>} Resolved path of the first match, or undefined.
 */
async function findExecutable(command, _platform, env) {
  const isWindows = _platform === "win32";
  // A plain env object is case-sensitive, and Windows commonly spells it "Path".
  const pathValue = env.PATH ?? (isWindows ? env.Path : void 0) ?? "";
  const delimiter = isWindows ? ";" : ":";
  const suffixes = isWindows ? ["", ...(env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM").split(";").filter(Boolean)] : [""];
  for (const entry of pathValue.split(delimiter).filter(Boolean)) {
    for (const suffix of suffixes) {
      const candidate = resolve2(entry, `${command}${suffix}`);
      if (await executableExists(candidate)) return candidate;
    }
  }
  return void 0;
}
908
/**
 * Reports whether the path passes an execute-permission access check.
 *
 * @param {string} path - Path to probe.
 * @returns {Promise<boolean>} True when the check succeeds; false on any failure.
 */
async function executableExists(path) {
  let accessible = true;
  try {
    await access2(path, constants.X_OK);
  } catch {
    // Any failure (missing file, no permission) simply means "not usable".
    accessible = false;
  }
  return accessible;
}
916
/**
 * Runs a command with a 5 second timeout and returns its captured output.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Command-line arguments.
 * @returns {Promise<{stdout: string, stderr: string}>} Captured output streams.
 */
async function defaultCommandRunner3(command, args) {
  const completed = await execFileAsync3(command, args, { timeout: 5e3 });
  // Expose only the two output streams, not whatever else the runner returns.
  return { stdout: completed.stdout, stderr: completed.stderr };
}
920
/**
 * Builds a low-confidence "not found" detection result.
 *
 * @param {string} message - Explanation of what could not be found.
 * @returns {{source: string, confidence: string, message: string}} Detection result.
 */
function notFound(message) {
  const result = { source: "not_found", confidence: "low" };
  result.message = message;
  return result;
}
923
/**
 * Asks a yes/no question on a readline-style interface.
 *
 * @param {{question: Function}} rl - Object whose `question(prompt)` resolves to the typed answer.
 * @param {string} question - Question text, without the hint.
 * @param {boolean} defaultValue - Returned when the answer is empty.
 * @returns {Promise<boolean>} True only for "y" or "yes" (case-insensitive), or the default on empty input.
 */
async function confirm(rl, question, defaultValue) {
  const hint = defaultValue ? "[Y/n]" : "[y/N]";
  const typed = await rl.question(`${question} ${hint} `);
  const normalized = typed.trim().toLowerCase();
  if (normalized === "") return defaultValue;
  return ["y", "yes"].includes(normalized);
}
929
/**
 * Asks a free-text question, showing the default in parentheses.
 *
 * @param {{question: Function}} rl - Object whose `question(prompt)` resolves to the typed answer.
 * @param {string} question - Question text.
 * @param {string} defaultValue - Returned when the trimmed answer is empty.
 * @returns {Promise<string>} Trimmed answer or the default.
 */
async function ask(rl, question, defaultValue) {
  const typed = (await rl.question(`${question} (${defaultValue}): `)).trim();
  return typed === "" ? defaultValue : typed;
}
933
/**
 * Returns a shallow copy of an object without the listed keys.
 *
 * @param {object} value - Source object; only own enumerable string keys are copied.
 * @param {string[]} keys - Keys to leave out.
 * @returns {object} New object; the source is not modified.
 */
function omit(value, keys) {
  const keptEntries = [];
  for (const entry of Object.entries(value)) {
    if (!keys.includes(entry[0])) keptEntries.push(entry);
  }
  return Object.fromEntries(keptEntries);
}
936
/**
 * Writes text to the output stream unless silent mode is on.
 *
 * @param {{silent?: boolean}} options - Output options.
 * @param {string} text - Text to write as-is.
 */
function emit(options, text) {
  if (options.silent) return;
  output.write(text);
}
939
/**
 * Reports whether the path can be accessed.
 *
 * @param {string} path - Path to probe.
 * @returns {Promise<boolean>} True when the access check succeeds; false on any failure.
 */
async function fileExists(path) {
  let reachable = true;
  try {
    await access2(path);
  } catch {
    // Any failure is reported as "does not exist".
    reachable = false;
  }
  return reachable;
}
947
+
948
+ // src/job-store.ts
949
+ import { nanoid } from "nanoid";
950
/**
 * Job store that keeps every record in a Map for the lifetime of the process.
 *
 * Records are plain objects with snake_case fields. Every public method hands
 * out shallow copies, so callers cannot mutate stored state.
 */
var InMemoryJobStore = class {
  config;
  jobs = /* @__PURE__ */ new Map();
  /**
   * @param {{resultTtlSeconds: number}} config - Job settings; the TTL sets `expires_at`.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * Stores a new job in "queued" state.
   *
   * @param {{taskType: string, selectedModel: string, request: unknown, priority: number}} input2
   * @returns {object} Copy of the stored record.
   */
  create(input2) {
    const createdAt = /* @__PURE__ */ new Date();
    const ttlMs = this.config.resultTtlSeconds * 1e3;
    const record = {
      id: `job_${nanoid(16)}`,
      status: "queued",
      task_type: input2.taskType,
      selected_model: input2.selectedModel,
      request_json: JSON.stringify(input2.request),
      result_json: null,
      error_json: null,
      attempts: 0,
      priority: input2.priority,
      created_at: createdAt.toISOString(),
      started_at: null,
      finished_at: null,
      expires_at: new Date(createdAt.getTime() + ttlMs).toISOString()
    };
    this.jobs.set(record.id, record);
    return { ...record };
  }
  /** Returns a copy of the job, or undefined when the id is unknown. */
  get(id) {
    const stored = this.jobs.get(id);
    if (!stored) return void 0;
    return { ...stored };
  }
  /** Returns copies of up to `limit` jobs, newest first by `created_at`. */
  list(limit = 50) {
    const newestFirst = [...this.jobs.values()];
    // ISO-8601 timestamps compare chronologically as strings.
    newestFirst.sort((a, b) => b.created_at.localeCompare(a.created_at));
    return newestFirst.slice(0, limit).map((job) => ({ ...job }));
  }
  /** Marks a queued or running job as running and counts the attempt. */
  markRunning(id) {
    const stored = this.jobs.get(id);
    if (stored && ["queued", "running"].includes(stored.status)) {
      const startedAt = (/* @__PURE__ */ new Date()).toISOString();
      stored.status = "running";
      // Keep the timestamp of the first attempt across retries.
      stored.started_at = stored.started_at ?? startedAt;
      stored.attempts += 1;
    }
    return this.get(id);
  }
  /** Records a successful result; a no-op for jobs that are no longer active. */
  markSucceeded(id, result) {
    this.finish(id, "succeeded", JSON.stringify(result), null);
    return this.get(id);
  }
  /** Records a failure; a no-op for jobs that are no longer active. */
  markFailed(id, error) {
    this.finish(id, "failed", null, JSON.stringify(normalizeError(error)));
    return this.get(id);
  }
  /** Cancels a queued or running job; other states are returned unchanged. */
  cancel(id) {
    const stored = this.jobs.get(id);
    if (!stored) return void 0;
    const active = stored.status === "queued" || stored.status === "running";
    if (active) {
      stored.status = "cancelled";
      stored.finished_at = (/* @__PURE__ */ new Date()).toISOString();
    }
    return this.get(id);
  }
  /**
   * Marks jobs past their `expires_at` as "expired".
   *
   * @param {Date} [now] - Reference time.
   * @returns {number} Number of jobs whose status changed.
   */
  cleanupExpired(now = /* @__PURE__ */ new Date()) {
    const expirable = ["queued", "running", "succeeded", "failed"];
    const cutoff = now.toISOString();
    let changed = 0;
    this.jobs.forEach((job) => {
      const pastDeadline = job.expires_at && job.expires_at < cutoff;
      if (pastDeadline && expirable.includes(job.status)) {
        job.status = "expired";
        changed += 1;
      }
    });
    return changed;
  }
  /** Drops every stored job. */
  close() {
    this.jobs.clear();
  }
  /** Moves an active job to a terminal status and stores its payload. */
  finish(id, status, resultJson, errorJson) {
    const stored = this.jobs.get(id);
    if (!stored) return;
    if (stored.status !== "queued" && stored.status !== "running") return;
    Object.assign(stored, {
      status,
      result_json: resultJson,
      error_json: errorJson,
      finished_at: (/* @__PURE__ */ new Date()).toISOString()
    });
  }
};
1031
/**
 * Decodes the JSON result stored on a job record.
 *
 * @param {{result_json: string|null}} job - Job record.
 * @returns {unknown} Parsed result, or undefined when none is stored.
 */
function parseJobResult(job) {
  return job.result_json ? JSON.parse(job.result_json) : void 0;
}
1035
/**
 * Decodes the JSON error stored on a job record.
 *
 * @param {{error_json: string|null}} job - Job record.
 * @returns {unknown} Parsed error payload, or undefined when none is stored.
 */
function parseJobError(job) {
  return job.error_json ? JSON.parse(job.error_json) : void 0;
}
1039
/**
 * Converts a thrown value into a plain object that can be serialized.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {object} `{ message, name, stack }` for Error instances, otherwise `{ message, value }`.
 */
function normalizeError(error) {
  if (!(error instanceof Error)) {
    return { message: String(error), value: error };
  }
  const { message, name, stack } = error;
  return { message, name, stack };
}
1045
+
1046
+ // src/queue-manager.ts
1047
+ import PQueue from "p-queue";
1048
/**
 * Owns one priority queue per configured model and runs chat requests through
 * them, either synchronously (caller awaits the result) or as background jobs
 * tracked in the job store.
 */
var QueueManager = class {
  /**
   * @param {object} config - App config; reads `models`, `queue.globalMaxQueued` and `jobs.maxAttempts`.
   * @param {{chat: Function}} ollama - Client used to execute chat requests.
   * @param {object} jobs - Job store (create/get/markRunning/markSucceeded/markFailed).
   */
  constructor(config, ollama, jobs) {
    this.config = config;
    this.ollama = ollama;
    this.jobs = jobs;
    for (const model of config.models) {
      this.queues.set(model.name, new PQueue({ concurrency: model.maxConcurrent }));
    }
  }
  config;
  ollama;
  jobs;
  queues = /* @__PURE__ */ new Map();
  /**
   * Runs a request through the model's queue and waits for the result.
   *
   * @param {{model: object, request: object, priority: number, timeoutMs: number}} input2
   * @returns {Promise<{result: unknown, queueTimeMs: number, executionTimeMs: number}>}
   * @throws {Error} When the global queue limit is reached, no queue exists for the model,
   *   or the request fails or times out.
   */
  async runSync(input2) {
    this.ensureQueueCapacity();
    const queuedAt = Date.now();
    const result = await this.queueFor(input2.model.name).add(
      async () => {
        const startedAt = Date.now();
        const result2 = await this.ollama.chat(input2.request, input2.model.name, Math.min(input2.timeoutMs, input2.model.timeoutMs));
        return { result: result2, queueTimeMs: startedAt - queuedAt, executionTimeMs: Date.now() - startedAt };
      },
      { priority: input2.priority, timeout: input2.timeoutMs, throwOnTimeout: true }
    );
    return result;
  }
  /**
   * Records a job and schedules it on the model's queue without waiting.
   *
   * @param {{model: object, request: object, classification: {taskType: string}, priority: number}} input2
   * @returns {{id: string, position: number}} Job id and its 1-based queue position.
   * @throws {Error} When the global queue limit is reached or no queue exists for the model.
   */
  enqueueAsync(input2) {
    this.ensureQueueCapacity();
    // Resolve the queue first so a missing queue cannot leave an orphaned job record.
    const queue = this.queueFor(input2.model.name);
    const job = this.jobs.create({
      taskType: input2.classification.taskType,
      selectedModel: input2.model.name,
      request: input2.request,
      priority: input2.priority
    });
    const position = queue.size + 1;
    this.scheduleJob(queue, job.id, input2.priority, input2.model.timeoutMs);
    return { id: job.id, position };
  }
  /** Returns queued/running counts per model plus global totals. */
  snapshot() {
    const byModel = [...this.queues.entries()].map(([model, queue]) => ({
      model,
      queued: queue.size,
      running: queue.pending,
      concurrency: this.config.models.find((spec) => spec.name === model)?.maxConcurrent ?? 1
    }));
    return {
      globalQueued: byModel.reduce((sum, item) => sum + item.queued, 0),
      globalRunning: byModel.reduce((sum, item) => sum + item.running, 0),
      byModel
    };
  }
  /** Returns a Map of model name to the number of waiting tasks. */
  queueDepthByModel() {
    return new Map([...this.queues.entries()].map(([model, queue]) => [model, queue.size]));
  }
  /** Returns a Map of model name to the number of running tasks. */
  runningByModel() {
    return new Map([...this.queues.entries()].map(([model, queue]) => [model, queue.pending]));
  }
  /**
   * Adds a background job to a queue and makes sure a rejected queue promise
   * (for example a queue-level timeout) ends up recorded on the job instead of
   * surfacing as an unhandled rejection.
   */
  scheduleJob(queue, jobId, priority, timeoutMs) {
    queue.add(() => this.runJob(jobId), { priority, timeout: timeoutMs, throwOnTimeout: true }).catch((error) => {
      // markFailed is a no-op when runJob already settled the job.
      this.jobs.markFailed(jobId, error);
    });
  }
  /**
   * Executes one attempt of a stored job and re-queues it on failure while
   * attempts remain.
   *
   * @param {string} jobId - Id of the job to run.
   */
  async runJob(jobId) {
    const job = this.jobs.get(jobId);
    // Skip jobs that were cancelled, expired or already settled while waiting.
    if (!job || job.status !== "queued" && job.status !== "running") return;
    const modelName = job.selected_model;
    const model = this.config.models.find((spec) => spec.name === modelName);
    if (!model) {
      this.jobs.markFailed(jobId, new Error(`Configured model disappeared: ${modelName}`));
      return;
    }
    this.jobs.markRunning(jobId);
    try {
      const request = JSON.parse(job.request_json);
      const result = await this.ollama.chat(request, model.name, model.timeoutMs);
      this.jobs.markSucceeded(jobId, result);
    } catch (error) {
      const latest = this.jobs.get(jobId);
      // Retry only while the job is still running; cancelled, expired or
      // already-failed jobs must not be resurrected.
      if (latest && latest.status === "running" && latest.attempts < this.config.jobs.maxAttempts) {
        this.scheduleJob(this.queueFor(model.name), jobId, latest.priority, model.timeoutMs);
      } else {
        this.jobs.markFailed(jobId, error);
      }
    }
  }
  /** Throws when the number of waiting tasks has reached the global limit. */
  ensureQueueCapacity() {
    const snapshot = this.snapshot();
    if (snapshot.globalQueued >= this.config.queue.globalMaxQueued) {
      throw new Error(`Global queue limit exceeded: ${this.config.queue.globalMaxQueued}`);
    }
  }
  /** Returns the queue for a model name or throws when none is configured. */
  queueFor(model) {
    const queue = this.queues.get(model);
    if (!queue) throw new Error(`No queue configured for model: ${model}`);
    return queue;
  }
};
1150
+
1151
+ // src/server.ts
1152
+ import http from "http";
1153
+ import https from "https";
1154
+ import { readFile as readFile3 } from "fs/promises";
1155
+ import express from "express";
1156
+ import { pinoHttp } from "pino-http";
1157
+ import { z as z3 } from "zod";
1158
+
1159
+ // src/classifier.ts
1160
// Lower-case substrings used by the keyword-based task classifier.
// Text containing any of these is treated as code-related.
var codeMarkers = ["typescript", "javascript", "node.js", "python", "function", "class ", "stack trace", "exception", "compile", "refactor", "pull request", "diff --git", "```"];
// Hints that the request involves tool or function calling.
var toolMarkers = [
  "tool",
  "function call",
  "json schema",
  "api call",
  "webhook",
  "bash",
  "shell command"
];
// Hints at planning or investigation style work.
var reasoningMarkers = [
  "plan",
  "architecture",
  "design",
  "debug",
  "investigate",
  "root cause",
  "step by step"
];
// Requests for a condensed version of some input.
var summarizeMarkers = [
  "summarize",
  "summary",
  "tl;dr",
  "extract key points"
];
// Requests to inspect existing work.
var reviewMarkers = [
  "review",
  "audit",
  "risks",
  "find bugs",
  "code review"
];
// Requests to repair something broken.
var fixMarkers = [
  "fix",
  "bug",
  "failing test",
  "patch",
  "regression"
];
// Requests to produce something new.
var generateMarkers = [
  "write",
  "implement",
  "create",
  "generate",
  "build"
];
1181
/**
 * Classifies a chat request into a task type using keyword heuristics.
 *
 * An explicit task type other than "auto" is taken at face value with
 * confidence 1. Otherwise the lower-cased message text is checked against an
 * ordered rule list; the first matching rule wins and "simple_chat" is the
 * fallback.
 *
 * @param {{messages: Array<object>}} request - Chat request.
 * @param {string} [explicitTaskType] - Caller-provided task type or "auto".
 * @returns {{taskType: string, complexity: string, requiresLargeContext: boolean, requiresToolUse: boolean, confidence: number}}
 */
function classifyTask(request, explicitTaskType) {
  if (explicitTaskType && explicitTaskType !== "auto") {
    const isHeavyType = ["agentic_reasoning", "large_context"].includes(explicitTaskType);
    return {
      taskType: explicitTaskType,
      complexity: isHeavyType ? "heavy" : "medium",
      requiresLargeContext: explicitTaskType === "large_context",
      requiresToolUse: explicitTaskType === "tool_use",
      confidence: 1
    };
  }
  const text = extractMessageText(request).toLowerCase();
  // Rough estimate: about four characters per token.
  const tokenEstimate = Math.ceil(text.length / 4);
  const hasCode = containsAny(text, codeMarkers);
  const requiresToolUse = containsAny(text, toolMarkers);
  const requiresLargeContext = tokenEstimate > 12e3 || text.includes("large context") || text.includes("entire repository");
  const mentions = (markers) => containsAny(text, markers);
  // Ordered rules: [task type, confidence, predicate]. Earlier rules take precedence.
  const rules = [
    ["large_context", 0.8, () => requiresLargeContext],
    ["tool_use", 0.75, () => requiresToolUse],
    ["code_review", 0.82, () => mentions(reviewMarkers) && hasCode],
    ["code_fix", 0.8, () => mentions(fixMarkers) && hasCode],
    ["code_generate", 0.78, () => mentions(generateMarkers) && hasCode],
    ["summarize", 0.86, () => mentions(summarizeMarkers)],
    ["agentic_reasoning", 0.72, () => mentions(reasoningMarkers) && (text.length > 1200 || text.includes("multi-step"))],
    ["triage", 0.7, () => text.length < 180 && (text.includes("classify") || text.includes("route") || text.includes("triage"))]
  ];
  const matched = rules.find(([, , applies]) => applies());
  const [taskType, confidence] = matched ?? ["simple_chat", 0.55];
  const complexity = classifyComplexity(text, taskType, tokenEstimate);
  return { taskType, complexity, requiresLargeContext, requiresToolUse, confidence };
}
1226
/**
 * Flattens every message of a chat request into one newline-separated string.
 *
 * String content is used as-is. Array content contributes its string parts
 * and the `text` field of object parts. Other non-null content is
 * JSON-encoded. Null or undefined content contributes an empty string.
 *
 * @param {{messages: Array<{content: unknown}>}} request - Chat request.
 * @returns {string} Concatenated message text.
 */
function extractMessageText(request) {
  const partText = (part) => {
    if (typeof part === "string") return part;
    if (part && typeof part === "object" && "text" in part) return String(part.text ?? "");
    return "";
  };
  return request.messages.map((message) => {
    const { content } = message;
    // A message without content must not contribute the literal `""` that
    // JSON.stringify("") would yield.
    if (content == null) return "";
    if (typeof content === "string") return content;
    if (Array.isArray(content)) return content.map(partText).join("\n");
    return JSON.stringify(content);
  }).join("\n");
}
1239
/**
 * Grades a request as "heavy", "medium" or "light".
 *
 * @param {string} text - Lower-cased message text.
 * @param {string} taskType - Task type already chosen for the request.
 * @param {number} tokenEstimate - Estimated token count of the text.
 * @returns {"heavy"|"medium"|"light"} Complexity grade.
 */
function classifyComplexity(text, taskType, tokenEstimate) {
  const heavyTask = ["large_context", "agentic_reasoning"].includes(taskType);
  if (heavyTask || tokenEstimate > 12e3) {
    return "heavy";
  }
  const longOrTechnical = tokenEstimate > 3e3 || text.includes("architecture") || text.includes("debug");
  const codeOrTool = taskType.startsWith("code_") || taskType === "tool_use";
  return longOrTechnical || codeOrTool ? "medium" : "light";
}
1245
/**
 * Reports whether the text contains at least one of the markers as a substring.
 *
 * @param {string} text - Text to search.
 * @param {string[]} markers - Substrings to look for.
 * @returns {boolean} True on the first hit; false when nothing matches.
 */
function containsAny(text, markers) {
  for (const marker of markers) {
    if (text.includes(marker)) return true;
  }
  return false;
}
1248
+
1249
+ // src/router-engine.ts
1250
// Numeric weight for each named request priority; a larger value means more urgent.
var priorityWeights = { low: 10, normal: 50, high: 90 };
1255
/**
 * Fills in every router option a request did not specify, using config defaults.
 *
 * @param {object} config - App config; reads `router`, `queue` and `gpu` defaults.
 * @param {object} [metadata={}] - Router options supplied with the request.
 * @returns {object} Complete router options; null and undefined inputs both fall back.
 */
function normalizeRouterMetadata(config, metadata = {}) {
  const { router, queue, gpu } = config;
  const normalized = {};
  normalized.mode = metadata.mode ?? router.defaultMode;
  normalized.allowAsync = metadata.allowAsync ?? true;
  normalized.taskType = metadata.taskType ?? "auto";
  normalized.priority = metadata.priority ?? queue.defaultPriority;
  normalized.preferredModels = metadata.preferredModels ?? [];
  normalized.forbiddenModels = metadata.forbiddenModels ?? [];
  normalized.maxQueueTimeMs = metadata.maxQueueTimeMs ?? router.syncMaxQueueTimeMs;
  normalized.maxExecutionTimeMs = metadata.maxExecutionTimeMs ?? queue.timeoutMs;
  normalized.requireGpuOnly = metadata.requireGpuOnly ?? gpu.requireGpuOnlyByDefault;
  return normalized;
}
1268
/**
 * Decides, per request, which configured model should serve it and whether it
 * runs synchronously, is queued as an async job, or is rejected.
 */
var RoutingEngine = class {
  /**
   * @param {object} config - App config; reads `models`, `routes`, `gpu` and `router`.
   */
  constructor(config) {
    this.config = config;
    this.modelsByName = new Map(config.models.map((model) => [model.name, model]));
  }
  config;
  modelsByName;
  /**
   * Produces a routing decision for one request.
   *
   * @param {object} context - Request context: `router`, `classification`, `loadedModels`,
   *   `gpu`, `queueDepthByModel` and `runningByModel`.
   * @returns {object} Decision with `type` "sync", "async" or "reject".
   */
  decide(context) {
    const candidates = this.getCandidates(context);
    if (candidates.length === 0) {
      return { type: "reject", statusCode: 503, reason: "No configured model can satisfy this request" };
    }
    // Evaluate each candidate once and split into blocked and available.
    const assessed = candidates.map((model) => ({ model, blockReason: this.blockReason(model, context) }));
    const blocked = assessed.filter((entry) => entry.blockReason);
    const scoredAvailable = assessed.filter((entry) => !entry.blockReason).map((entry) => this.score(entry.model, context)).sort((a, b) => b.score - a.score);
    const fallbackModels = candidates.map((model) => model.name);
    if (context.router.mode === "async") {
      const model = scoredAvailable[0]?.model ?? candidates[0];
      return {
        type: "async",
        model,
        fallbackModels,
        reason: "Request explicitly requested async mode",
        score: scoredAvailable[0]?.score ?? 0,
        position: (context.queueDepthByModel.get(model.name) ?? 0) + 1
      };
    }
    const preferredBusy = this.preferredBusyModel(candidates, context);
    const totalQueueDepth = [...context.queueDepthByModel.values()].reduce((sum, depth) => sum + depth, 0);
    const gpuHeavy = Boolean(
      context.gpu && context.gpu.vramFreeMb < this.config.router.heavyLoadGpuFreeMbThreshold
    );
    const heavyLoad = totalQueueDepth >= this.config.router.heavyLoadQueueDepth || gpuHeavy;
    if (context.router.mode !== "sync" && context.router.allowAsync && (heavyLoad || preferredBusy)) {
      const model = preferredBusy ?? scoredAvailable[0]?.model ?? candidates[0];
      return {
        type: "async",
        model,
        fallbackModels,
        reason: preferredBusy ? "Preferred model is busy; accepted for async processing" : "Heavy load detected",
        score: this.score(model, context).score,
        position: (context.queueDepthByModel.get(model.name) ?? 0) + 1
      };
    }
    if (scoredAvailable.length > 0) {
      return {
        type: "sync",
        model: scoredAvailable[0].model,
        fallbackModels,
        reason: scoredAvailable[0].reason,
        score: scoredAvailable[0].score
      };
    }
    // Queue behind a model that is merely busy. Never pick one blocked for
    // another reason (e.g. gpu_only), since waiting would not unblock it.
    const busyEntry = blocked.find((entry) => entry.blockReason === "busy");
    if (busyEntry && context.router.allowAsync && context.router.mode !== "sync") {
      const model = busyEntry.model;
      return {
        type: "async",
        model,
        fallbackModels,
        reason: "Selected model is busy; accepted for async processing",
        score: 0,
        position: (context.queueDepthByModel.get(model.name) ?? 0) + 1
      };
    }
    const reason = blocked.map((entry) => `${entry.model.name}: ${entry.blockReason}`).join("; ");
    return { type: "reject", statusCode: 503, reason: reason || "No model available" };
  }
  /**
   * Scores how well a model fits the request; higher is better.
   *
   * @param {object} model - Model spec.
   * @param {object} context - Request context (see `decide`).
   * @returns {{model: object, score: number, reason: string}}
   */
  score(model, context) {
    const route = this.config.routes[context.classification.taskType] ?? this.config.routes.unknown ?? [];
    const routeIndex = route.indexOf(model.name);
    const loaded = context.loadedModels.some((loadedModel) => loadedModel.name === model.name);
    const queueDepth = context.queueDepthByModel.get(model.name) ?? 0;
    const running = context.runningByModel.get(model.name) ?? 0;
    const preferredIndex = context.router.preferredModels.indexOf(model.name);
    const freeMb = context.gpu?.vramFreeMb ?? this.config.gpu.vramTotalMb;
    const requiredMb = model.sizeGb * 1024 + this.config.gpu.vramSafetyReserveMb;
    let score = 100;
    // Route bonus shrinks with position. A model that is not on the route gets
    // none (indexOf returns -1, which would otherwise beat the first entry).
    if (routeIndex >= 0) score += Math.max(0, 50 - routeIndex * 8);
    score += model.priority;
    if (model.purpose.includes(context.classification.taskType)) score += 25;
    if (model.tags.includes(context.classification.taskType)) score += 15;
    if (preferredIndex >= 0) score += 80 - preferredIndex * 10;
    if (loaded) score += 20;
    if (context.classification.complexity === "heavy" && model.costClass === "high") score += 20;
    if (context.classification.complexity === "light" && model.costClass === "low") score += 15;
    if (freeMb > requiredMb) score += Math.min(25, (freeMb - requiredMb) / 512);
    if (freeMb < requiredMb) score -= 60;
    score -= queueDepth * 18;
    score -= running * 25;
    if (model.exclusive) score -= running * 80;
    return {
      model,
      score,
      reason: `Selected ${model.name} for ${context.classification.taskType} with score ${score.toFixed(1)}`
    };
  }
  /**
   * Collects candidate models: preferred models first, then the task's route,
   * then any model whose purpose or tags name the task type. Unknown names and
   * forbidden models are dropped.
   */
  getCandidates(context) {
    const routeNames = this.config.routes[context.classification.taskType] ?? this.config.routes.unknown ?? [];
    const names = /* @__PURE__ */ new Set();
    for (const name of context.router.preferredModels) names.add(name);
    for (const name of routeNames) names.add(name);
    for (const model of this.config.models) {
      if (model.purpose.includes(context.classification.taskType) || model.tags.includes(context.classification.taskType)) {
        names.add(model.name);
      }
    }
    return [...names].map((name) => this.modelsByName.get(name)).filter((model) => Boolean(model)).filter((model) => !context.router.forbiddenModels.includes(model.name));
  }
  /**
   * Explains why a model cannot take the request right now.
   *
   * @returns {"gpu_only"|"busy"|undefined} Block reason, or undefined when usable.
   */
  blockReason(model, context) {
    const loaded = context.loadedModels.find((loadedModel) => loadedModel.name === model.name);
    const processor = loaded?.processor?.toLowerCase() ?? "";
    if (context.router.requireGpuOnly) {
      if (!context.gpu && this.config.gpu.vramTotalMb <= 0) return "gpu_only";
      if (processor.includes("cpu") && !processor.includes("100% gpu")) return "gpu_only";
      const freeMb = context.gpu?.vramFreeMb ?? this.config.gpu.vramTotalMb;
      if (model.sizeGb * 1024 + this.config.gpu.vramSafetyReserveMb > freeMb && !loaded) return "gpu_only";
    }
    const running = context.runningByModel.get(model.name) ?? 0;
    if (model.exclusive && running > 0 || !model.allowWhenBusy && running >= model.maxConcurrent) {
      return "busy";
    }
    return void 0;
  }
  /**
   * Finds a busy model among the caller's preferred candidates, or checks only
   * the first candidate when no preference was given.
   */
  preferredBusyModel(candidates, context) {
    const preferredNames = new Set(context.router.preferredModels);
    const ordered = preferredNames.size > 0 ? candidates.filter((model) => preferredNames.has(model.name)) : candidates.slice(0, 1);
    return ordered.find((model) => this.blockReason(model, context) === "busy");
  }
};
1397
+
1398
+ // src/logger.ts
1399
+ import pino from "pino";
1400
// Shared process-wide logger; the LOG_LEVEL environment variable overrides the default "info" level.
var logger = pino({ level: process.env.LOG_LEVEL ?? "info" });
1403
+
1404
+ // src/server.ts
1405
// Validation schema for POST /v1/chat/completions bodies.
// Unknown top-level fields are kept (passthrough).
var chatRequestSchema = z3
  .object({
    model: z3.string().optional(),
    // At least one message; content is validated loosely and interpreted later.
    messages: z3.array(z3.object({ role: z3.string(), content: z3.unknown() })).min(1),
    stream: z3.boolean().optional(),
    // Optional per-request routing hints.
    router: z3
      .object({
        mode: z3.enum(["auto", "sync", "async"]).optional(),
        allowAsync: z3.boolean().optional(),
        taskType: z3.string().optional(),
        priority: z3.enum(["low", "normal", "high"]).optional(),
        preferredModels: z3.array(z3.string()).optional(),
        forbiddenModels: z3.array(z3.string()).optional(),
        maxQueueTimeMs: z3.number().int().nonnegative().optional(),
        maxExecutionTimeMs: z3.number().int().positive().optional(),
        requireGpuOnly: z3.boolean().optional()
      })
      .optional()
  })
  .passthrough();
1421
/**
 * Builds the Express application exposing the router's HTTP API.
 *
 * All routes are mounted under the configured base path. The final middleware
 * turns thrown errors into JSON responses: 400 for schema validation errors,
 * 500 for everything else.
 *
 * @param {object} config - App config; reads `server`, `gpu`, `models` and `routes`.
 * @param {{ollama: object, gpu: object, jobs: object, queue: object}} deps - Runtime collaborators.
 * @returns {object} Configured Express app.
 */
function createApp(config, deps) {
  const app = express();
  const api = express.Router();
  const routing = new RoutingEngine(config);
  if (process.env.NODE_ENV !== "test") {
    app.use(pinoHttp({ logger }));
  }
  app.use(express.json({ limit: config.server.requestBodyLimit }));
  api.get("/health", (_req, res) => {
    res.json({ status: "ok", service: "ollama-agent-router" });
  });
  api.get("/metrics", (_req, res) => {
    const snapshot = deps.queue.snapshot();
    const lines = [
      `oar_queue_global_queued ${snapshot.globalQueued}`,
      `oar_queue_global_running ${snapshot.globalRunning}`,
      ...snapshot.byModel.flatMap((item) => [
        `oar_model_queue_depth{model="${escapeMetricLabel(item.model)}"} ${item.queued}`,
        `oar_model_running{model="${escapeMetricLabel(item.model)}"} ${item.running}`
      ])
    ];
    // The text exposition format requires the last line to end with a line feed.
    res.type("text/plain").send(`${lines.join("\n")}\n`);
  });
  api.get("/v1/router/status", async (_req, res, next) => {
    try {
      const queue = deps.queue.snapshot();
      // The two probes are independent, so run them concurrently.
      const [gpu, loadedModels] = await Promise.all([safeGpu(deps.gpu), safeLoadedModels(deps.ollama)]);
      res.json({
        service: "ollama-agent-router",
        queue,
        gpu,
        loadedModels,
        config: {
          models: config.models.length,
          routes: Object.keys(config.routes),
          basePath: normalizeBasePath(config.server.basePath),
          protocol: config.server.https.enabled ? "https" : "http"
        }
      });
    } catch (error) {
      next(error);
    }
  });
  api.get("/v1/router/models", async (_req, res, next) => {
    try {
      const [ollama, loaded] = await Promise.all([deps.ollama.tags(), safeLoadedModels(deps.ollama)]);
      res.json({
        configured: config.models,
        ollama,
        loaded
      });
    } catch (error) {
      next(error);
    }
  });
  api.get("/v1/router/gpu", async (_req, res, next) => {
    try {
      res.json(await safeGpu(deps.gpu) ?? { provider: config.gpu.provider, available: false });
    } catch (error) {
      next(error);
    }
  });
  api.get("/v1/jobs", (_req, res) => {
    res.json({ jobs: deps.jobs.list() });
  });
  api.get("/v1/jobs/:jobId", (req, res) => {
    const job = deps.jobs.get(req.params.jobId);
    if (!job) return res.status(404).json({ error: { message: "Job not found" } });
    return res.json(job);
  });
  api.get("/v1/jobs/:jobId/result", (req, res) => {
    const job = deps.jobs.get(req.params.jobId);
    if (!job) return res.status(404).json({ error: { message: "Job not found" } });
    if (job.status === "failed") return res.status(500).json({ status: job.status, error: parseJobError(job) });
    // Any other non-succeeded status is reported with 202 and the status name.
    if (job.status !== "succeeded") return res.status(202).json({ status: job.status });
    return res.json(parseJobResult(job));
  });
  api.delete("/v1/jobs/:jobId", (req, res) => {
    const job = deps.jobs.cancel(req.params.jobId);
    if (!job) return res.status(404).json({ error: { message: "Job not found" } });
    return res.json(job);
  });
  api.post("/v1/chat/completions", async (req, res, next) => {
    try {
      const request = chatRequestSchema.parse(req.body);
      if (request.stream) {
        return res.status(400).json({ error: { message: "Streaming is not supported by ollama-agent-router v1" } });
      }
      const router = normalizeRouterMetadata(config, request.router);
      const classification = classifyTask(request, router.taskType);
      // Both probes swallow their own failures, so neither can reject here.
      const [loadedModels, gpu] = await Promise.all([safeLoadedModels(deps.ollama), safeGpu(deps.gpu)]);
      const decision = routing.decide({
        request,
        router,
        classification,
        loadedModels,
        gpu,
        queueDepthByModel: deps.queue.queueDepthByModel(),
        runningByModel: deps.queue.runningByModel()
      });
      if (decision.type === "reject") {
        return res.status(decision.statusCode).json({ error: { message: decision.reason } });
      }
      const priority = priorityWeights[router.priority];
      if (decision.type === "async") {
        const job = deps.queue.enqueueAsync({
          model: decision.model,
          request,
          classification,
          priority
        });
        return res.status(202).json({
          id: job.id,
          object: "router.job",
          status: "queued",
          message: "Heavy load. Job accepted for asynchronous processing.",
          router: {
            mode: "async",
            taskType: classification.taskType,
            preferredModel: decision.model.name,
            position: job.position,
            estimatedClass: classification.complexity
          }
        });
      }
      const output2 = await deps.queue.runSync({
        model: decision.model,
        request,
        priority,
        timeoutMs: router.maxExecutionTimeMs
      });
      return res.json(
        withRouterMetadata(output2.result, {
          mode: "sync",
          taskType: classification.taskType,
          selectedModel: decision.model.name,
          fallbackModels: decision.fallbackModels.filter((name) => name !== decision.model.name),
          queueTimeMs: output2.queueTimeMs,
          executionTimeMs: output2.executionTimeMs,
          decisionReason: decision.reason
        })
      );
    } catch (error) {
      next(error);
    }
  });
  app.use(normalizeBasePath(config.server.basePath), api);
  // Four-argument signature marks this as Express's error-handling middleware.
  app.use((error, _req, res, _next) => {
    const message = error instanceof Error ? error.message : String(error);
    const status = error instanceof z3.ZodError ? 400 : 500;
    res.status(status).json({ error: { message } });
  });
  return app;
}
1574
/**
 * Creates the HTTP(S) server for the app and waits until it is listening.
 *
 * @param {object} config - App config; reads `server.host`, `server.port`, `server.basePath`
 *   and `server.https`.
 * @param {object} deps - Runtime collaborators passed through to `createApp`.
 * @returns {Promise<{close: function(): Promise<void>}>} Handle whose `close()` stops the server.
 * @throws {Error} Rejects when the server cannot start listening (for example, port in use).
 */
async function startServer(config, deps) {
  const app = createApp(config, deps);
  const server = await createHttpServer(config, app);
  await new Promise((resolve3, reject) => {
    // Register both outcomes before listening. Without an 'error' listener a
    // bind failure becomes an uncaught exception and this promise never settles.
    const onError = (error) => {
      server.off("listening", onListening);
      reject(error);
    };
    const onListening = () => {
      server.off("error", onError);
      resolve3();
    };
    server.once("error", onError);
    server.once("listening", onListening);
    server.listen(config.server.port, config.server.host);
  });
  logger.info(
    {
      host: config.server.host,
      port: config.server.port,
      basePath: normalizeBasePath(config.server.basePath),
      protocol: config.server.https.enabled ? "https" : "http"
    },
    "ollama-agent-router listening"
  );
  return {
    close: () => new Promise((resolve3, reject) => {
      server.close((error) => error ? reject(error) : resolve3());
    })
  };
}
1594
/**
 * Normalizes a configured base path to "/" or "/segment[/segment...]" with no
 * trailing slash.
 *
 * @param {string} basePath - Raw base path from config or CLI.
 * @returns {string} Normalized path that always starts with "/".
 */
function normalizeBasePath(basePath) {
  const trimmed = basePath.trim();
  if (trimmed === "" || trimmed === "/") {
    return "/";
  }
  // Drop every leading and trailing slash, then add exactly one in front.
  const inner = trimmed.replace(/^\/+|\/+$/g, "");
  return `/${inner}`;
}
1599
/**
 * Creates a plain HTTP server, or an HTTPS server when TLS is enabled in config.
 *
 * @param {object} config - App config; reads `server.https` (`enabled`, `certPath`, `keyPath`, `caPath`).
 * @param {Function} app - Request handler to attach.
 * @returns {Promise<object>} Server that is not yet listening.
 * @throws {Error} When HTTPS is enabled without both certificate and key paths,
 *   or when a TLS file cannot be read.
 */
async function createHttpServer(config, app) {
  const tls = config.server.https;
  if (!tls.enabled) {
    return http.createServer(app);
  }
  if (!(tls.certPath && tls.keyPath)) {
    throw new Error("server.https.certPath and server.https.keyPath are required when HTTPS is enabled");
  }
  // Files are read one after another: certificate, key, then the optional CA bundle.
  const cert = await readFile3(tls.certPath);
  const key = await readFile3(tls.keyPath);
  const ca = tls.caPath ? await readFile3(tls.caPath) : void 0;
  return https.createServer({ cert, key, ca }, app);
}
1615
/**
 * Attaches routing metadata to a result.
 *
 * Non-null, non-array objects are shallow-copied with a `router` key added
 * (replacing any existing `router` key); every other value is wrapped as
 * `{ result, router }`.
 *
 * @param {*} result - Value to annotate.
 * @param {*} router - Metadata stored under the `router` key.
 * @returns {object} A new object carrying the `router` metadata.
 */
function withRouterMetadata(result, router) {
  const isRecord = Boolean(result) && typeof result === "object" && !Array.isArray(result);
  return isRecord ? { ...result, router } : { result, router };
}
1621
/**
 * Best-effort wrapper around `ollama.ps()`.
 *
 * @param {{ps: Function}} ollama - Client exposing `ps()`.
 * @returns {Promise<*>} Whatever `ps()` resolves to, or an empty array if the call fails.
 */
async function safeLoadedModels(ollama) {
  let loaded = [];
  try {
    loaded = await ollama.ps();
  } catch {
    // Deliberately ignored: callers get the empty list on any failure.
  }
  return loaded;
}
1628
/**
 * Best-effort wrapper around `gpu.snapshot()`.
 *
 * @param {{snapshot: Function}} gpu - Monitor exposing `snapshot()`.
 * @returns {Promise<*>} Whatever `snapshot()` resolves to, or `undefined` if the call fails.
 */
async function safeGpu(gpu) {
  let snapshot;
  try {
    snapshot = await gpu.snapshot();
  } catch {
    // Deliberately ignored: a failed probe is reported as "no snapshot".
    snapshot = void 0;
  }
  return snapshot;
}
1635
/**
 * Escapes a string for use as a label value in the Prometheus text
 * exposition format, where backslash, double quote and line feed must be
 * written as `\\`, `\"` and `\n` respectively.
 *
 * @param {string} label - Raw label value.
 * @returns {string} Escaped label value, safe to place between double quotes.
 */
function escapeMetricLabel(label) {
  // Backslashes go first so the escapes added by the later steps are not doubled.
  return label
    .replaceAll("\\", "\\\\")
    .replaceAll('"', '\\"')
    .replaceAll("\n", "\\n");
}
1638
+
1639
+ // src/cli.ts
1640
// Root command. The global options declared here are read by the client
// commands through program.opts().
const program = new Command();
program.name("ollama-agent-router");
program.alias("oar");
program.description("Intelligent HTTP/CLI router for Ollama");
program.option("-c, --config <path>", "config file path");
program.option("-u, --url <url>", "router URL for client commands", "http://127.0.0.1:11435");
program.option("--base-path <path>", "router API base path for client commands", "/");
1642
// `serve`: loads the config, wires up the runtime dependencies, starts the
// HTTP(S) server and installs signal handlers for shutdown.
program
  .command("serve")
  .description("start the router server")
  .option("-c, --config <path>", "config file path")
  .action(async (options) => {
    const { config, path } = await loadConfig(options.config ?? program.opts().config);
    const jobs = new InMemoryJobStore(config.jobs);
    const ollama = new HttpOllamaClient(config.ollama);
    const gpu = new NvidiaGpuMonitor(config.gpu);
    const queue = new QueueManager(config, ollama, jobs);
    const cleanup = setInterval(() => jobs.cleanupExpired(), config.jobs.cleanupIntervalMs);

    let server;
    try {
      server = await startServer(config, { ollama, gpu, jobs, queue });
    } catch (error) {
      // Without this the interval keeps the event loop alive, so a startup
      // failure (e.g. missing TLS files) would print an error and then hang.
      clearInterval(cleanup);
      jobs.close();
      throw error;
    }
    logger.info({ configPath: path }, "loaded config");

    let shuttingDown = false;
    const shutdown = async () => {
      // SIGINT followed by SIGTERM must not close the server twice.
      if (shuttingDown) return;
      shuttingDown = true;
      clearInterval(cleanup);
      let exitCode = 0;
      try {
        await server.close();
      } catch (error) {
        // Report the failure instead of leaving an unhandled rejection, and
        // still release the job store below.
        console.error(error instanceof Error ? error.message : String(error));
        exitCode = 1;
      } finally {
        jobs.close();
        process.exit(exitCode);
      }
    };
    process.once("SIGINT", shutdown);
    process.once("SIGTERM", shutdown);
  });
1660
// `init`: writes the default config to --output, or hands over to the
// configuration wizard when --wizard is given.
program
  .command("init")
  .description("write a starter config")
  .option("-o, --output <path>", "output path", "./ollama-agent-router.yaml")
  .option("--wizard", "run the detect-first configuration wizard")
  .action(async ({ wizard, output }) => {
    if (!wizard) {
      await writeDefaultConfig(output);
      console.log(`Wrote ${output}`);
      return;
    }
    await runConfigure({ outputPath: output });
  });
1668
// `configure`: maps the CLI flags onto the runConfigure() options.
// --yes also switches the wizard to non-interactive mode.
program
  .command("configure")
  .description("run the detect-first configuration wizard")
  .option("-o, --output <path>", "output path", "./ollama-agent-router.yaml")
  .option("--answers <path>", "answers YAML for non-interactive mode")
  .option("--non-interactive", "generate config without interactive prompts")
  .option("--detect", "print detected environment and exit")
  .option("--dry-run", "print generated YAML without writing")
  .option("--overwrite", "overwrite output if it already exists")
  .option("-y, --yes", "accept detected values and write without confirmation")
  .action(async (options) => {
    const { output, answers, nonInteractive, detect, dryRun, overwrite, yes } = options;
    const wizardOptions = {
      outputPath: output,
      answersPath: answers,
      nonInteractive: nonInteractive || yes,
      detectOnly: detect,
      dryRun,
      overwrite,
      assumeYes: yes
    };
    await runConfigure(wizardOptions);
  });
1679
// `validate-config`: validates the file named by --config (subcommand or
// global); with no path given, falls back to loadConfig() and reports the
// path it returns.
program
  .command("validate-config")
  .description("validate YAML configuration")
  .option("-c, --config <path>", "config file path")
  .action(async (options) => {
    const explicitPath = options.config ?? program.opts().config;
    if (!explicitPath) {
      const found = await loadConfig();
      console.log(`Config is valid: ${found.path}`);
      return;
    }
    const yamlText = await readFile4(explicitPath, "utf8");
    parseConfig(yamlText);
    console.log(`Config is valid: ${explicitPath}`);
  });
1689
// Client commands: each one calls a router endpoint and prints the response.
// Job ids come from the command line, so they are percent-encoded before
// being placed in the path; otherwise "/", "?" or ".." inside an id would
// change which endpoint is requested.
program.command("status").description("show router status").action(() => printJson("/v1/router/status"));
program.command("models").description("show configured and Ollama models").action(() => printJson("/v1/router/models"));
program.command("gpu").description("show GPU state").action(() => printJson("/v1/router/gpu"));
program.command("jobs").description("list jobs").action(() => printJson("/v1/jobs"));
program
  .command("job <jobId>")
  .description("show job")
  .action((jobId) => printJson(`/v1/jobs/${encodeURIComponent(jobId)}`));
program
  .command("result <jobId>")
  .description("show job result")
  .action((jobId) => printJson(`/v1/jobs/${encodeURIComponent(jobId)}/result`));
program
  .command("cancel <jobId>")
  .description("cancel a job")
  .action((jobId) => printJson(`/v1/jobs/${encodeURIComponent(jobId)}`, { method: "DELETE" }));
1696
// Entry point: parse argv and run the selected command. A failure is printed
// as a bare message and recorded in the exit code.
const reportCliFailure = (error) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(message);
  process.exitCode = 1;
};
program.parseAsync().catch(reportCliFailure);
1700
/**
 * Requests a router endpoint and prints the response body: pretty-printed
 * when the body is valid JSON, verbatim otherwise.
 *
 * @param {string} path - Endpoint path, resolved through `buildClientUrl`.
 * @param {object} [init] - Options forwarded to `fetch` (e.g. `{ method: "DELETE" }`).
 * @returns {Promise<void>}
 * @throws {Error} On a non-2xx response; the message is the response body, or
 *   `HTTP <status>` when the body is empty.
 */
async function printJson(path, init) {
  const response = await fetch(buildClientUrl(path), init);
  const body = await response.text();
  if (!response.ok) {
    throw new Error(body || `HTTP ${response.status}`);
  }
  let rendered;
  try {
    rendered = JSON.stringify(JSON.parse(body), null, 2);
  } catch {
    // Not JSON: print the body exactly as received.
    rendered = body;
  }
  console.log(rendered);
}
1712
/**
 * Builds the request URL for a client command from the global --url and
 * --base-path options plus the endpoint path.
 *
 * The path of --url, the normalised base path and `path` are each stripped of
 * leading/trailing slashes, empty pieces are dropped, and the remainder is
 * joined with single slashes.
 *
 * @param {string} path - Endpoint path such as "/v1/jobs".
 * @returns {URL} URL object whose pathname is the combined path.
 */
function buildClientUrl(path) {
  const { url: routerUrl, basePath } = program.opts();
  const target = new URL(routerUrl);
  const segments = [];
  for (const part of [target.pathname, normalizeBasePath2(basePath), path]) {
    const stripped = part.replace(/^\/+|\/+$/g, "");
    if (stripped) {
      segments.push(stripped);
    }
  }
  target.pathname = `/${segments.join("/")}`;
  return target;
}
1720
/**
 * Canonicalises the client-side base path: trims whitespace, drops every
 * leading and trailing slash, and prefixes the remainder with one slash.
 *
 * @param {string} basePath - Raw value of the --base-path option.
 * @returns {string} "/" when nothing remains, otherwise "/<inner path>".
 */
function normalizeBasePath2(basePath) {
  const text = basePath.trim();
  let start = 0;
  let end = text.length;
  while (start < end && text[start] === "/") {
    start += 1;
  }
  while (end > start && text[end - 1] === "/") {
    end -= 1;
  }
  return `/${text.slice(start, end)}`;
}
1725
+ //# sourceMappingURL=cli.js.map