@khanglvm/llm-router 2.6.0 → 2.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +0 -13
- package/README.md +0 -3
- package/package.json +1 -1
- package/src/cli/router-module.js +284 -2
- package/src/node/coding-tool-config.js +15 -1
- package/src/node/huggingface-gguf.js +0 -12
- package/src/node/llamacpp-runtime.js +78 -256
- package/src/node/local-models-service.js +2 -25
- package/src/node/local-server.js +2 -60
- package/src/node/provider-probe.js +18 -0
- package/src/node/quota-probe-mapping.js +215 -0
- package/src/node/quota-probe-runner.js +234 -0
- package/src/node/web-console-client.js +33 -27
- package/src/node/web-console-server.js +107 -64
- package/src/node/web-console-styles.generated.js +1 -1
- package/src/node/web-console-ui/api-client.js +27 -0
- package/src/node/web-console-ui/local-models-utils.js +0 -33
- package/src/runtime/balancer.js +47 -4
- package/src/runtime/config.js +9 -4
- package/src/runtime/handler/fallback.js +7 -0
- package/src/runtime/handler/provider-call.js +18 -36
- package/src/runtime/handler/runtime-policy.js +1 -4
- package/src/runtime/local-models.js +0 -36
- package/src/runtime/quota-probe.js +179 -0
- package/src/translator/request/claude-to-openai.js +28 -0
- package/src/node/llamacpp-managed-runtime.js +0 -202
- package/src/node/llamacpp-runtime-profile.js +0 -133
|
@@ -2,11 +2,6 @@ import path from "node:path";
|
|
|
2
2
|
import os from "node:os";
|
|
3
3
|
import { existsSync } from "node:fs";
|
|
4
4
|
import { spawn, spawnSync } from "node:child_process";
|
|
5
|
-
import { setTimeout as delay } from "node:timers/promises";
|
|
6
|
-
import { deriveLlamacppLaunchProfile } from "./llamacpp-runtime-profile.js";
|
|
7
|
-
import { createLlamacppManagedRuntimeRegistry } from "./llamacpp-managed-runtime.js";
|
|
8
|
-
import { listListeningPids as listListeningPidsForPort } from "./port-reclaim.js";
|
|
9
|
-
import { stopProcessByPid as stopProcessByPidForRuntime } from "./instance-state.js";
|
|
10
5
|
|
|
11
6
|
export const LLAMACPP_DEFAULT_HOST = "127.0.0.1";
|
|
12
7
|
export const LLAMACPP_DEFAULT_PORT = 39391;
|
|
@@ -21,8 +16,8 @@ const COMMON_SOURCE_BUILD_PATHS = Object.freeze([
|
|
|
21
16
|
"src/llama-cpp-turboquant/build/bin/llama-server",
|
|
22
17
|
"src/llama.cpp-turboquant/build/bin/llama-server"
|
|
23
18
|
]);
|
|
24
|
-
|
|
25
|
-
let
|
|
19
|
+
|
|
20
|
+
let managedLlamacppRuntime = null;
|
|
26
21
|
|
|
27
22
|
function isPlainObject(value) {
|
|
28
23
|
return Boolean(value) && typeof value === "object" && !Array.isArray(value);
|
|
@@ -44,34 +39,6 @@ function normalizePathEntries(entries) {
|
|
|
44
39
|
: [];
|
|
45
40
|
}
|
|
46
41
|
|
|
47
|
-
function buildRuntimeProfileHash({ command, host, port, args = [] } = {}) {
|
|
48
|
-
const normalizedArgs = Array.isArray(args) ? args.filter(Boolean) : [];
|
|
49
|
-
return `${normalizeString(command)}|${normalizeString(host)}|${String(normalizePort(port, LLAMACPP_DEFAULT_PORT))}|${normalizedArgs.join("\u001f")}`;
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
function isManagedRuntimeAlive(instance) {
|
|
53
|
-
const child = instance?.child;
|
|
54
|
-
if (!child) return false;
|
|
55
|
-
return child.exitCode === null && child.killed !== true;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
function normalizeListeningPidResult(result) {
|
|
59
|
-
if (result && typeof result === "object" && result.ok === false) {
|
|
60
|
-
return { ok: false, pids: [] };
|
|
61
|
-
}
|
|
62
|
-
if (Array.isArray(result)) {
|
|
63
|
-
return result
|
|
64
|
-
.map((value) => Number(value))
|
|
65
|
-
.filter((pid) => Number.isInteger(pid) && pid > 0);
|
|
66
|
-
}
|
|
67
|
-
if (result && typeof result === "object" && Array.isArray(result.pids)) {
|
|
68
|
-
return result.pids
|
|
69
|
-
.map((value) => Number(value))
|
|
70
|
-
.filter((pid) => Number.isInteger(pid) && pid > 0);
|
|
71
|
-
}
|
|
72
|
-
return [];
|
|
73
|
-
}
|
|
74
|
-
|
|
75
42
|
function readConfiguredLlamacppRuntime(config) {
|
|
76
43
|
const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
|
|
77
44
|
if (!isPlainObject(runtime)) {
|
|
@@ -105,8 +72,6 @@ function buildPreloadModels(config) {
|
|
|
105
72
|
if (!modelPath) continue;
|
|
106
73
|
preloadModels.push({
|
|
107
74
|
variantId: normalizeString(variant.id),
|
|
108
|
-
variant,
|
|
109
|
-
baseModel,
|
|
110
75
|
modelPath,
|
|
111
76
|
contextWindow: Number.isFinite(Number(variant.contextWindow)) ? Number(variant.contextWindow) : undefined
|
|
112
77
|
});
|
|
@@ -114,15 +79,6 @@ function buildPreloadModels(config) {
|
|
|
114
79
|
return preloadModels;
|
|
115
80
|
}
|
|
116
81
|
|
|
117
|
-
function detectLlamacppSystemProfile(system = {}) {
|
|
118
|
-
const totalMemoryBytes = Number(system?.totalMemoryBytes);
|
|
119
|
-
return {
|
|
120
|
-
platform: normalizeString(system?.platform) || process.platform,
|
|
121
|
-
unifiedMemory: system?.unifiedMemory === true || process.platform === "darwin",
|
|
122
|
-
totalMemoryBytes: Number.isFinite(totalMemoryBytes) && totalMemoryBytes > 0 ? totalMemoryBytes : os.totalmem()
|
|
123
|
-
};
|
|
124
|
-
}
|
|
125
|
-
|
|
126
82
|
export function detectLlamacppCandidates({
|
|
127
83
|
envPathEntries = process.env.PATH?.split(path.delimiter) || [],
|
|
128
84
|
homeDir = os.homedir(),
|
|
@@ -166,18 +122,16 @@ export function buildLlamacppLaunchArgs({
|
|
|
166
122
|
command,
|
|
167
123
|
host = LLAMACPP_DEFAULT_HOST,
|
|
168
124
|
port = LLAMACPP_DEFAULT_PORT,
|
|
169
|
-
preloadModels = []
|
|
170
|
-
launchProfile = null
|
|
125
|
+
preloadModels = []
|
|
171
126
|
} = {}) {
|
|
172
127
|
const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
|
|
173
128
|
const args = [
|
|
174
129
|
normalizeString(command),
|
|
175
130
|
"--host", normalizeString(host) || LLAMACPP_DEFAULT_HOST,
|
|
176
|
-
"--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT))
|
|
177
|
-
...((Array.isArray(launchProfile?.args) ? launchProfile.args : []).filter(Boolean))
|
|
131
|
+
"--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT))
|
|
178
132
|
];
|
|
179
133
|
|
|
180
|
-
if (
|
|
134
|
+
if (firstModel?.modelPath) {
|
|
181
135
|
args.push("-m", firstModel.modelPath);
|
|
182
136
|
if (Number.isFinite(Number(firstModel.contextWindow)) && Number(firstModel.contextWindow) > 0) {
|
|
183
137
|
args.push("-c", String(Math.floor(Number(firstModel.contextWindow))));
|
|
@@ -187,31 +141,6 @@ export function buildLlamacppLaunchArgs({
|
|
|
187
141
|
return args.filter(Boolean);
|
|
188
142
|
}
|
|
189
143
|
|
|
190
|
-
export async function spawnManagedLlamacppRuntime({
|
|
191
|
-
command,
|
|
192
|
-
host = LLAMACPP_DEFAULT_HOST,
|
|
193
|
-
port = LLAMACPP_DEFAULT_PORT,
|
|
194
|
-
launchProfile
|
|
195
|
-
} = {}, {
|
|
196
|
-
spawnImpl = spawn
|
|
197
|
-
} = {}) {
|
|
198
|
-
const args = buildLlamacppLaunchArgs({
|
|
199
|
-
command,
|
|
200
|
-
host,
|
|
201
|
-
port,
|
|
202
|
-
launchProfile
|
|
203
|
-
});
|
|
204
|
-
const child = spawnImpl(args[0], args.slice(1), { stdio: "ignore" });
|
|
205
|
-
return {
|
|
206
|
-
pid: child?.pid,
|
|
207
|
-
child,
|
|
208
|
-
host,
|
|
209
|
-
port,
|
|
210
|
-
baseUrl: `http://${host}:${port}/v1`,
|
|
211
|
-
args
|
|
212
|
-
};
|
|
213
|
-
}
|
|
214
|
-
|
|
215
144
|
export function parseLlamacppValidationOutput(output = "") {
|
|
216
145
|
const text = String(output || "").trim();
|
|
217
146
|
const lowered = text.toLowerCase();
|
|
@@ -270,142 +199,78 @@ async function startConfiguredRuntime(config, {
|
|
|
270
199
|
requireAutostart = true
|
|
271
200
|
} = {}, {
|
|
272
201
|
spawnSyncImpl = spawnSync,
|
|
273
|
-
spawnImpl = spawn
|
|
274
|
-
system = undefined,
|
|
275
|
-
listListeningPids = undefined,
|
|
276
|
-
stopProcessByPid = undefined
|
|
202
|
+
spawnImpl = spawn
|
|
277
203
|
} = {}) {
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
return { ok: true, skipped: true, reason: "autostart-disabled" };
|
|
283
|
-
}
|
|
204
|
+
const runtime = readConfiguredLlamacppRuntime(config);
|
|
205
|
+
if (requireAutostart && !runtime.startWithRouter) {
|
|
206
|
+
return { ok: true, skipped: true, reason: "autostart-disabled" };
|
|
207
|
+
}
|
|
284
208
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
209
|
+
if (!runtime.command) {
|
|
210
|
+
const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
|
|
211
|
+
error(errorMessage);
|
|
212
|
+
return { ok: false, errorMessage };
|
|
213
|
+
}
|
|
290
214
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
: null;
|
|
300
|
-
const args = buildLlamacppLaunchArgs({
|
|
301
|
-
command: runtime.command,
|
|
302
|
-
host: runtime.host,
|
|
303
|
-
port: runtime.port,
|
|
304
|
-
preloadModels,
|
|
305
|
-
launchProfile
|
|
306
|
-
});
|
|
307
|
-
const variantKey = normalizeString(firstModel?.variant?.key || firstModel?.variantId) || "default";
|
|
308
|
-
const profileHash = buildRuntimeProfileHash({
|
|
309
|
-
command: runtime.command,
|
|
310
|
-
host: runtime.host,
|
|
311
|
-
port: runtime.port,
|
|
312
|
-
args: args.slice(1)
|
|
313
|
-
});
|
|
314
|
-
const listListeningPidsFn = typeof listListeningPids === "function"
|
|
315
|
-
? listListeningPids
|
|
316
|
-
: (port) => listListeningPidsForPort(port, { spawnSync: spawnSyncImpl });
|
|
317
|
-
const stopProcessByPidFn = typeof stopProcessByPid === "function"
|
|
318
|
-
? stopProcessByPid
|
|
319
|
-
: (pid) => stopProcessByPidForRuntime(pid);
|
|
320
|
-
await managedLlamacppRuntimeRegistry.reconcile({
|
|
321
|
-
listListeningPids: async (port) => normalizeListeningPidResult(await listListeningPidsFn(port)),
|
|
322
|
-
stopProcessByPid: async (pid) => stopProcessByPidFn(pid)
|
|
323
|
-
});
|
|
215
|
+
if (managedLlamacppRuntime
|
|
216
|
+
&& managedLlamacppRuntime.command === runtime.command
|
|
217
|
+
&& managedLlamacppRuntime.host === runtime.host
|
|
218
|
+
&& managedLlamacppRuntime.port === runtime.port
|
|
219
|
+
&& managedLlamacppRuntime.child?.exitCode === null
|
|
220
|
+
&& managedLlamacppRuntime.child?.killed !== true) {
|
|
221
|
+
return { ok: true, alreadyRunning: true, runtime: managedLlamacppRuntime };
|
|
222
|
+
}
|
|
324
223
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
&& isManagedRuntimeAlive(instance)
|
|
331
|
-
));
|
|
332
|
-
if (existing) {
|
|
333
|
-
return { ok: true, alreadyRunning: true, runtime: existing };
|
|
334
|
-
}
|
|
224
|
+
const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
|
|
225
|
+
if (!validation.ok) {
|
|
226
|
+
error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
|
|
227
|
+
return validation;
|
|
228
|
+
}
|
|
335
229
|
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
230
|
+
const preloadModels = buildPreloadModels(config);
|
|
231
|
+
const args = buildLlamacppLaunchArgs({
|
|
232
|
+
command: runtime.command,
|
|
233
|
+
host: runtime.host,
|
|
234
|
+
port: runtime.port,
|
|
235
|
+
preloadModels
|
|
236
|
+
});
|
|
341
237
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
});
|
|
367
|
-
}
|
|
368
|
-
|
|
369
|
-
const settleResolve = (value) => {
|
|
370
|
-
if (settled) return;
|
|
371
|
-
settled = true;
|
|
372
|
-
resolve(value);
|
|
373
|
-
};
|
|
374
|
-
const settleReject = (reason) => {
|
|
375
|
-
if (settled) return;
|
|
376
|
-
settled = true;
|
|
377
|
-
reject(reason);
|
|
378
|
-
};
|
|
379
|
-
|
|
380
|
-
child.once("spawn", () => {
|
|
381
|
-
if (typeof child.unref === "function") child.unref();
|
|
382
|
-
settleResolve({
|
|
383
|
-
pid: child?.pid,
|
|
384
|
-
child,
|
|
385
|
-
command: runtime.command,
|
|
386
|
-
host: runtime.host,
|
|
387
|
-
port,
|
|
388
|
-
args: allocatedArgs,
|
|
389
|
-
baseUrl: `http://${runtime.host}:${port}/v1`
|
|
390
|
-
});
|
|
391
|
-
});
|
|
392
|
-
child.once("error", (spawnError) => {
|
|
393
|
-
settleReject(spawnError);
|
|
394
|
-
});
|
|
395
|
-
}),
|
|
396
|
-
waitForHealthy: async (instance) => instance
|
|
238
|
+
return new Promise((resolve) => {
|
|
239
|
+
let settled = false;
|
|
240
|
+
const child = spawnImpl(args[0], args.slice(1), {
|
|
241
|
+
stdio: "ignore"
|
|
242
|
+
});
|
|
243
|
+
|
|
244
|
+
const finish = (result) => {
|
|
245
|
+
if (settled) return;
|
|
246
|
+
settled = true;
|
|
247
|
+
resolve(result);
|
|
248
|
+
};
|
|
249
|
+
|
|
250
|
+
child.once("spawn", () => {
|
|
251
|
+
managedLlamacppRuntime = {
|
|
252
|
+
child,
|
|
253
|
+
command: runtime.command,
|
|
254
|
+
host: runtime.host,
|
|
255
|
+
port: runtime.port,
|
|
256
|
+
args
|
|
257
|
+
};
|
|
258
|
+
child.once("exit", () => {
|
|
259
|
+
if (managedLlamacppRuntime?.child === child) {
|
|
260
|
+
managedLlamacppRuntime = null;
|
|
261
|
+
}
|
|
397
262
|
});
|
|
263
|
+
if (typeof child.unref === "function") child.unref();
|
|
264
|
+
line(`Started llama.cpp runtime on http://${runtime.host}:${runtime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
|
|
265
|
+
finish({ ok: true, runtime: managedLlamacppRuntime, validation });
|
|
266
|
+
});
|
|
398
267
|
|
|
399
|
-
|
|
400
|
-
return { ok: true, runtime: managedRuntime, validation };
|
|
401
|
-
} catch (spawnError) {
|
|
268
|
+
child.once("error", (spawnError) => {
|
|
402
269
|
const errorMessage = spawnError instanceof Error ? spawnError.message : String(spawnError);
|
|
403
270
|
error(`Failed starting llama.cpp runtime: ${errorMessage}`);
|
|
404
|
-
|
|
405
|
-
}
|
|
406
|
-
}
|
|
407
|
-
inFlightConfiguredStartCount = Math.max(0, inFlightConfiguredStartCount - 1);
|
|
408
|
-
}
|
|
271
|
+
finish({ ok: false, errorMessage });
|
|
272
|
+
});
|
|
273
|
+
});
|
|
409
274
|
}
|
|
410
275
|
|
|
411
276
|
export async function ensureConfiguredLlamacppRuntimeStarted(config, callbacks = {}, deps = {}) {
|
|
@@ -422,66 +287,23 @@ export async function startConfiguredLlamacppRuntime(config, callbacks = {}, dep
|
|
|
422
287
|
}, deps);
|
|
423
288
|
}
|
|
424
289
|
|
|
425
|
-
export function getManagedLlamacppRuntimeSnapshot() {
|
|
426
|
-
return managedLlamacppRuntimeRegistry.snapshot().map((instance) => {
|
|
427
|
-
const { child: _child, ...rest } = instance || {};
|
|
428
|
-
return JSON.parse(JSON.stringify(rest));
|
|
429
|
-
});
|
|
430
|
-
}
|
|
431
|
-
|
|
432
290
|
export async function stopManagedLlamacppRuntime({
|
|
433
291
|
line = () => {},
|
|
434
292
|
error = () => {}
|
|
435
293
|
} = {}) {
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
}
|
|
439
|
-
if (typeof managedLlamacppRuntimeRegistry.waitForInFlightStarts === "function") {
|
|
440
|
-
await managedLlamacppRuntimeRegistry.waitForInFlightStarts();
|
|
441
|
-
}
|
|
442
|
-
const instances = managedLlamacppRuntimeRegistry.snapshot();
|
|
443
|
-
if (instances.length === 0) {
|
|
294
|
+
const active = managedLlamacppRuntime;
|
|
295
|
+
if (!active?.child) {
|
|
444
296
|
return { ok: true, skipped: true, reason: "not-running" };
|
|
445
297
|
}
|
|
446
298
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
}
|
|
457
|
-
}
|
|
458
|
-
if (!isManagedRuntimeAlive(instance)) {
|
|
459
|
-
await managedLlamacppRuntimeRegistry.untrackInstance(instance?.instanceId);
|
|
460
|
-
} else {
|
|
461
|
-
pendingExitCount += 1;
|
|
462
|
-
}
|
|
463
|
-
} catch (stopError) {
|
|
464
|
-
const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
|
|
465
|
-
failures.push(errorMessage);
|
|
466
|
-
error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
|
|
467
|
-
}
|
|
468
|
-
}
|
|
469
|
-
|
|
470
|
-
if (stoppedCount > 0) {
|
|
471
|
-
if (pendingExitCount === 0) {
|
|
472
|
-
line(stoppedCount === 1 ? "Stopped managed llama.cpp runtime." : `Stopped ${stoppedCount} managed llama.cpp runtimes.`);
|
|
473
|
-
} else {
|
|
474
|
-
line(stoppedCount === 1
|
|
475
|
-
? "Stop signal sent to managed llama.cpp runtime; waiting for exit."
|
|
476
|
-
: `Stop signal sent to ${stoppedCount} managed llama.cpp runtimes; waiting for exits.`);
|
|
477
|
-
}
|
|
299
|
+
managedLlamacppRuntime = null;
|
|
300
|
+
try {
|
|
301
|
+
active.child.kill("SIGTERM");
|
|
302
|
+
line("Stopped managed llama.cpp runtime.");
|
|
303
|
+
return { ok: true };
|
|
304
|
+
} catch (stopError) {
|
|
305
|
+
const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
|
|
306
|
+
error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
|
|
307
|
+
return { ok: false, errorMessage };
|
|
478
308
|
}
|
|
479
|
-
|
|
480
|
-
const completed = failures.length === 0 && pendingExitCount === 0;
|
|
481
|
-
return {
|
|
482
|
-
ok: completed,
|
|
483
|
-
stoppedCount,
|
|
484
|
-
pendingExitCount,
|
|
485
|
-
...(failures.length > 0 ? { errorMessage: failures.join("; ") } : {})
|
|
486
|
-
};
|
|
487
309
|
}
|
|
@@ -212,42 +212,19 @@ export async function saveLocalModelVariant(config, draft, {
|
|
|
212
212
|
activeVariants,
|
|
213
213
|
totalMemoryBytes: system.totalMemoryBytes
|
|
214
214
|
});
|
|
215
|
-
|
|
215
|
+
if (!decision.allowed) {
|
|
216
216
|
throw new Error(decision.reason);
|
|
217
217
|
}
|
|
218
218
|
}
|
|
219
219
|
|
|
220
|
-
const normalizedMetadata = normalizeLocalModelsMetadata({
|
|
221
|
-
variants: {
|
|
222
|
-
draft: normalizedDraft
|
|
223
|
-
}
|
|
224
|
-
});
|
|
225
|
-
const normalizedVariantDraft = Object.values(normalizedMetadata.variants)[0] || {};
|
|
226
|
-
const previousVariant = isPlainObject(next.metadata.localModels.variants[key])
|
|
227
|
-
? next.metadata.localModels.variants[key]
|
|
228
|
-
: {};
|
|
229
|
-
|
|
230
220
|
next.metadata.localModels.variants[key] = {
|
|
231
|
-
...
|
|
221
|
+
...(isPlainObject(next.metadata.localModels.variants[key]) ? next.metadata.localModels.variants[key] : {}),
|
|
232
222
|
key,
|
|
233
223
|
baseModelId,
|
|
234
224
|
id: modelId,
|
|
235
225
|
name,
|
|
236
226
|
runtime,
|
|
237
227
|
preset: normalizeString(normalizedDraft.preset),
|
|
238
|
-
runtimeProfile: runtime === "llamacpp"
|
|
239
|
-
? normalizedVariantDraft.runtimeProfile
|
|
240
|
-
: undefined,
|
|
241
|
-
runtimeStatus: runtime === "llamacpp"
|
|
242
|
-
? (isPlainObject(previousVariant.runtimeStatus)
|
|
243
|
-
? previousVariant.runtimeStatus
|
|
244
|
-
: {
|
|
245
|
-
activeInstanceId: "",
|
|
246
|
-
lastFailure: null,
|
|
247
|
-
lastStartedAt: "",
|
|
248
|
-
lastHealthyAt: ""
|
|
249
|
-
})
|
|
250
|
-
: undefined,
|
|
251
228
|
enabled: normalizedDraft.enabled === true,
|
|
252
229
|
preload: normalizedDraft.preload === true,
|
|
253
230
|
contextWindow: Number.isFinite(Number(normalizedDraft.contextWindow)) ? Number(normalizedDraft.contextWindow) : undefined,
|
package/src/node/local-server.js
CHANGED
|
@@ -13,10 +13,6 @@ import { readActivityLogSettings } from "../shared/local-router-defaults.js";
|
|
|
13
13
|
import { appendActivityLogEntry, resolveActivityLogPath } from "./activity-log.js";
|
|
14
14
|
import { appendLargeRequestLogEntry, resolveLargeRequestLogPath } from "./large-request-log.js";
|
|
15
15
|
import { isLargeRequestLoggingEnabled } from "../runtime/handler/large-request-log.js";
|
|
16
|
-
import {
|
|
17
|
-
startConfiguredLlamacppRuntime,
|
|
18
|
-
stopManagedLlamacppRuntime
|
|
19
|
-
} from "./llamacpp-runtime.js";
|
|
20
16
|
|
|
21
17
|
const DEFAULT_CONFIG_RELOAD_DEBOUNCE_MS = 300;
|
|
22
18
|
const MAX_CONFIG_RELOAD_DEBOUNCE_MS = 5000;
|
|
@@ -38,10 +34,6 @@ function formatError(error) {
|
|
|
38
34
|
return error instanceof Error ? error.message : String(error);
|
|
39
35
|
}
|
|
40
36
|
|
|
41
|
-
function normalizeString(value) {
|
|
42
|
-
return typeof value === "string" ? value.trim() : "";
|
|
43
|
-
}
|
|
44
|
-
|
|
45
37
|
function createLiveConfigStore({
|
|
46
38
|
configPath,
|
|
47
39
|
watchConfig = true,
|
|
@@ -245,39 +237,6 @@ async function writeFetchResponseToNode(res, response) {
|
|
|
245
237
|
readable.pipe(res);
|
|
246
238
|
}
|
|
247
239
|
|
|
248
|
-
function buildVariantLlamacppRuntimeConfig(config, variantKey) {
|
|
249
|
-
const normalizedVariantKey = normalizeString(variantKey);
|
|
250
|
-
const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
|
|
251
|
-
const variants = config?.metadata?.localModels?.variants;
|
|
252
|
-
const library = config?.metadata?.localModels?.library;
|
|
253
|
-
const variant = variants?.[normalizedVariantKey];
|
|
254
|
-
if (!runtime || !variant || variant.runtime !== "llamacpp") return null;
|
|
255
|
-
|
|
256
|
-
const baseModelId = normalizeString(variant?.baseModelId);
|
|
257
|
-
const baseModel = library?.[baseModelId];
|
|
258
|
-
if (!baseModel) return null;
|
|
259
|
-
|
|
260
|
-
return {
|
|
261
|
-
metadata: {
|
|
262
|
-
localModels: {
|
|
263
|
-
runtime: {
|
|
264
|
-
llamacpp: { ...runtime }
|
|
265
|
-
},
|
|
266
|
-
library: {
|
|
267
|
-
[baseModelId]: { ...baseModel }
|
|
268
|
-
},
|
|
269
|
-
variants: {
|
|
270
|
-
[normalizedVariantKey]: {
|
|
271
|
-
...variant,
|
|
272
|
-
enabled: true,
|
|
273
|
-
preload: true
|
|
274
|
-
}
|
|
275
|
-
}
|
|
276
|
-
}
|
|
277
|
-
}
|
|
278
|
-
};
|
|
279
|
-
}
|
|
280
|
-
|
|
281
240
|
export async function startLocalRouteServer({
|
|
282
241
|
port = FIXED_LOCAL_ROUTER_PORT,
|
|
283
242
|
host = FIXED_LOCAL_ROUTER_HOST,
|
|
@@ -289,10 +248,7 @@ export async function startLocalRouteServer({
|
|
|
289
248
|
validateConfig,
|
|
290
249
|
onConfigReload,
|
|
291
250
|
onConfigReloadError,
|
|
292
|
-
requireAuth = false
|
|
293
|
-
createFetchHandlerImpl = createFetchHandler,
|
|
294
|
-
startConfiguredLlamacppRuntimeImpl = startConfiguredLlamacppRuntime,
|
|
295
|
-
stopManagedLlamacppRuntimeImpl = stopManagedLlamacppRuntime
|
|
251
|
+
requireAuth = false
|
|
296
252
|
} = {}) {
|
|
297
253
|
const reloadDebounceMs = resolveReloadDebounceMs(configReloadDebounceMs);
|
|
298
254
|
const resolvedActivityLogPath = resolveActivityLogPath(configPath, activityLogPath);
|
|
@@ -314,22 +270,9 @@ export async function startLocalRouteServer({
|
|
|
314
270
|
const initialConfig = await configStore.getConfig();
|
|
315
271
|
activityLogEnabled = readActivityLogSettings(initialConfig).enabled;
|
|
316
272
|
|
|
317
|
-
const fetchHandler =
|
|
273
|
+
const fetchHandler = createFetchHandler({
|
|
318
274
|
ignoreAuth: !requireAuth,
|
|
319
|
-
runtime: "node",
|
|
320
275
|
getConfig: () => configStore.getConfig(),
|
|
321
|
-
resolveLocalRuntimeBaseUrl: async ({ candidate }) => {
|
|
322
|
-
const variantKey = candidate?.model?.metadata?.localVariantKey;
|
|
323
|
-
const config = await configStore.getConfig();
|
|
324
|
-
const targetedConfig = buildVariantLlamacppRuntimeConfig(config, variantKey);
|
|
325
|
-
if (!targetedConfig) return "";
|
|
326
|
-
|
|
327
|
-
const started = await startConfiguredLlamacppRuntimeImpl(targetedConfig);
|
|
328
|
-
if (!started?.ok) {
|
|
329
|
-
throw new Error(started?.errorMessage || `Failed starting local runtime for ${normalizeString(variantKey) || "unknown variant"}.`);
|
|
330
|
-
}
|
|
331
|
-
return normalizeString(started?.runtime?.baseUrl);
|
|
332
|
-
},
|
|
333
276
|
defaultStateStoreBackend: "file",
|
|
334
277
|
onActivityLog: (entry) => {
|
|
335
278
|
if (!activityLogEnabled) return;
|
|
@@ -412,7 +355,6 @@ export async function startLocalRouteServer({
|
|
|
412
355
|
server.close = (callback) => {
|
|
413
356
|
shuttingDown = true;
|
|
414
357
|
Promise.resolve()
|
|
415
|
-
.then(() => stopManagedLlamacppRuntimeImpl().catch(() => {}))
|
|
416
358
|
.then(() => configStore.close())
|
|
417
359
|
.then(() => (typeof fetchHandler.close === "function" ? fetchHandler.close() : undefined))
|
|
418
360
|
.finally(() => {
|
|
@@ -310,6 +310,15 @@ function isTransientModelRuntimeError(result, message) {
|
|
|
310
310
|
return patterns.some((pattern) => pattern.test(text));
|
|
311
311
|
}
|
|
312
312
|
|
|
313
|
+
function isOutputLimitReachedMessage(message) {
|
|
314
|
+
const text = String(message || "").toLowerCase();
|
|
315
|
+
if (!text) return false;
|
|
316
|
+
return (
|
|
317
|
+
text.includes("max_tokens") &&
|
|
318
|
+
(text.includes("output limit") || text.includes("token limit") || text.includes("finish"))
|
|
319
|
+
);
|
|
320
|
+
}
|
|
321
|
+
|
|
313
322
|
function isRateLimitResult(result, message) {
|
|
314
323
|
const status = Number(result?.status || 0);
|
|
315
324
|
if (status === 429) return true;
|
|
@@ -377,6 +386,15 @@ function classifyModelProbeResult(format, result) {
|
|
|
377
386
|
};
|
|
378
387
|
}
|
|
379
388
|
|
|
389
|
+
if (isOutputLimitReachedMessage(message)) {
|
|
390
|
+
return {
|
|
391
|
+
supported: true,
|
|
392
|
+
confirmed: true,
|
|
393
|
+
outcome: "output-limit",
|
|
394
|
+
message: message || "Request reached model but the probe token budget was too small."
|
|
395
|
+
};
|
|
396
|
+
}
|
|
397
|
+
|
|
380
398
|
if (isUnsupportedModelMessage(message)) {
|
|
381
399
|
return {
|
|
382
400
|
supported: false,
|