@khanglvm/llm-router 2.6.0 → 2.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +0 -13
- package/README.md +0 -3
- package/package.json +1 -1
- package/src/node/coding-tool-config.js +15 -1
- package/src/node/huggingface-gguf.js +0 -12
- package/src/node/llamacpp-runtime.js +78 -256
- package/src/node/local-models-service.js +2 -25
- package/src/node/local-server.js +2 -60
- package/src/node/provider-probe.js +18 -0
- package/src/node/quota-probe-mapping.js +215 -0
- package/src/node/quota-probe-runner.js +234 -0
- package/src/node/web-console-client.js +33 -27
- package/src/node/web-console-server.js +107 -64
- package/src/node/web-console-styles.generated.js +1 -1
- package/src/node/web-console-ui/api-client.js +27 -0
- package/src/node/web-console-ui/local-models-utils.js +0 -33
- package/src/runtime/balancer.js +47 -4
- package/src/runtime/config.js +9 -4
- package/src/runtime/handler/fallback.js +7 -0
- package/src/runtime/handler/provider-call.js +18 -36
- package/src/runtime/handler/runtime-policy.js +1 -4
- package/src/runtime/local-models.js +0 -36
- package/src/runtime/quota-probe.js +179 -0
- package/src/node/llamacpp-managed-runtime.js +0 -202
- package/src/node/llamacpp-runtime-profile.js +0 -133
package/CHANGELOG.md
CHANGED

@@ -7,19 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-## [2.6.0] - 2026-04-23
-
-### Added
-- Local `llama.cpp` variants can now persist a per-model runtime profile, including auto-tuned presets and custom launch overrides, so each GGUF variant can run with settings that match its own size and context shape instead of sharing one global `llama-server` startup profile.
-- The Web UI now exposes managed `llama.cpp` runtime health for Local Models, including tracked instance counts, healthy/stale summaries, and persisted runtime-profile data for each saved variant.
-
-### Changed
-- Local variant requests are now resolved through a managed per-variant `llama.cpp` runtime layer that can reuse compatible instances, allocate fallback ports safely, and start the right runtime configuration for the specific model variant without exposing multi-process lifecycle management to the user.
-- Hugging Face GGUF search/download flows now surface file size plus estimated runtime memory guidance directly in the Local Models workflow, making it easier to choose a viable quantization before download.
-
-### Fixed
-- Managed `llama.cpp` runtimes now reconcile stale tracked instances before reuse, avoid reserving dead immediate-exit servers, and drain pending shutdown/startup edges more reliably so local per-model routing does not leave behind stale `llama-server` processes.
-
 ## [2.5.2] - 2026-04-23
 
 ### Fixed

package/README.md
CHANGED

@@ -44,9 +44,6 @@ Open `llr` and use the **Local Models** tab to manage local inference sources al
 - **Native macOS browsing** — use the built-in file picker to choose a single GGUF file, scan a folder recursively for GGUF models, or browse directly to a local `llama-server` binary
 - **Managed + attached model library** — stale or moved files stay visible instead of crashing the app, and can be repaired by locating the file again or removed cleanly
 - **Router-visible local variants** — create friendly model variants with bounded presets, context-window metadata, preload toggles, and Mac unified-memory fit guidance with clearer safe/tight recommendations
-- **Per-variant llama.cpp tuning** — each local variant can store its own runtime profile so balanced, throughput, long-context, low-memory, or custom launch overrides do not fight over one shared global `llama-server` config
-- **Managed per-model runtimes** — the router automatically starts, reuses, and stops the right `llama.cpp` instance for the requested local variant, with stale-runtime cleanup handled internally instead of asking the user to manage separate servers
-- **GGUF size + memory guidance** — Hugging Face search results now show model file size plus estimated runtime memory fit guidance before download, helping choose viable quantizations faster
 - **Alias-ready local routing** — once saved, local variants behave like normal router models and can be used in aliases, capability flags, and fallback chains
 
 For v1, the managed download flow only searches public Hugging Face GGUF files and the fit guidance is tuned for Macs with unified memory.

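Since saved local variants behave like normal router models, pairing one with a hosted fallback is the intended workflow. A purely hypothetical sketch of what that pairing could look like in a router config (the `aliases`/`fallbacks` field names and every id here are illustrative assumptions, not the package's documented schema):

    // Hypothetical config fragment; all names are illustrative only.
    {
      aliases: {
        "fast-local": "local/qwen3-8b-q4"      // a saved local variant id
      },
      fallbacks: {
        "fast-local": ["openai/gpt-5-mini"]    // hosted model as the backup leg
      }
    }
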
package/src/node/coding-tool-config.js
CHANGED

@@ -17,6 +17,7 @@ import {
   normalizeFactoryDroidReasoningEffort,
   resolveFactoryDroidRouterModelRef
 } from "../shared/coding-tool-bindings.js";
+import { LOCAL_RUNTIME_PROVIDER_TYPE } from "../runtime/local-models.js";
 
 const BACKUP_SUFFIX = ".llm_router_backup";
 const CODEX_PROVIDER_ID = "llm-router";

@@ -972,9 +973,11 @@ export async function patchClaudeCodeEffortLevel({
 const FACTORY_DROID_ROUTER_MARKER = "_llmRouterManaged";
 const FACTORY_DROID_OPENAI_PROVIDER = "openai";
 const FACTORY_DROID_ANTHROPIC_PROVIDER = "anthropic";
+const FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER = "generic-chat-completion-api";
 const FACTORY_DROID_ROUTER_PROVIDERS = Object.freeze([
   FACTORY_DROID_OPENAI_PROVIDER,
-  FACTORY_DROID_ANTHROPIC_PROVIDER
+  FACTORY_DROID_ANTHROPIC_PROVIDER,
+  FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER
 ]);
 
 function dedupeStrings(values = []) {

@@ -1116,6 +1119,17 @@ function resolveFactoryDroidRouteFormat(modelRef, config = {}, seen = new Set())
 }
 
 function resolveFactoryDroidCustomModelProvider(modelRef, config = {}) {
+  const normalizedModelRef = String(modelRef || "").trim();
+  if (normalizedModelRef.includes("/")) {
+    const separatorIndex = normalizedModelRef.indexOf("/");
+    const providerId = normalizedModelRef.slice(0, separatorIndex).trim();
+    const provider = (Array.isArray(config?.providers) ? config.providers : [])
+      .find((entry) => String(entry?.id || "").trim() === providerId);
+    if (String(provider?.type || "").trim().toLowerCase() === LOCAL_RUNTIME_PROVIDER_TYPE) {
+      return FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER;
+    }
+  }
+
   return mapFactoryDroidFormatToProvider(resolveFactoryDroidRouteFormat(modelRef, config))
     || FACTORY_DROID_OPENAI_PROVIDER;
 }

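The effect of the new guard: a Factory Droid custom-model ref whose prefix matches the id of a local-runtime provider is pinned to Factory's generic chat-completions provider instead of falling through to the OpenAI/Anthropic format mapping. A minimal sketch of the internal lookup (the provider id `local-llama` and the type string `"local-runtime"` are illustrative assumptions; the real value is `LOCAL_RUNTIME_PROVIDER_TYPE` from `../runtime/local-models.js`):

    // Assumed config shape, for illustration only.
    const config = {
      providers: [
        { id: "local-llama", type: "local-runtime" } // type string assumed
      ]
    };

    // Ref prefixed with a local-runtime provider id:
    resolveFactoryDroidCustomModelProvider("local-llama/my-variant", config);
    // -> "generic-chat-completion-api"

    // Any other ref still goes through resolveFactoryDroidRouteFormat,
    // defaulting to "openai" when no format maps.
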
package/src/node/huggingface-gguf.js
CHANGED

@@ -1,6 +1,5 @@
 import path from "node:path";
 import { promises as fs } from "node:fs";
-import { estimateLlamacppRuntimeBytes } from "./llamacpp-runtime-profile.js";
 
 const HUGGING_FACE_API_URL = "https://huggingface.co/api/models";
 const HUGGING_FACE_BASE_URL = "https://huggingface.co";

@@ -155,13 +154,6 @@ export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
       expectedContextWindow: systemInfo?.expectedContextWindow
     }, systemInfo);
     const quantization = parseQuantizationFromFileName(file);
-    const estimatedRuntimeBytes = sizeBytes
-      ? estimateLlamacppRuntimeBytes({
-          sizeBytes,
-          contextWindow: systemInfo?.expectedContextWindow,
-          preset: status.fit === "tight" ? "memory-safe" : "balanced"
-        })
-      : undefined;
     const fitScore = status.fit === "safe" ? 30 : status.fit === "tight" ? 15 : status.fit === "unknown" ? 8 : -20;
     const rankingScore = fitScore
       + (status.disabled ? -100 : 0)

@@ -174,10 +166,6 @@ export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
       file,
       quantization,
       sizeBytes,
-      estimatedRuntimeBytes,
-      memoryLabel: estimatedRuntimeBytes
-        ? `${(estimatedRuntimeBytes / (1024 ** 3)).toFixed(1)} GB runtime est.`
-        : "Runtime estimate unavailable",
      disabled: status.disabled,
      disabledReason: status.reason,
      fit: status.fit,

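With the runtime-memory estimate removed, ranking rests on the signals still visible above: a base fit score plus a heavy penalty for disabled files. A worked sketch of just those two terms (any further `rankingScore` terms fall outside this hunk):

    // Base score by fit status, as in shapeHuggingFaceGgufResults.
    const fitScore = (fit) =>
      fit === "safe" ? 30 : fit === "tight" ? 15 : fit === "unknown" ? 8 : -20;

    fitScore("safe");              // 30: comfortably fits
    fitScore("tight");             // 15: fits, but close to the limit
    fitScore("other") + (-100);    // -120: any other fit scores -20, and a
                                   // disabled file sinks to the bottom
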
package/src/node/llamacpp-runtime.js
CHANGED

@@ -2,11 +2,6 @@ import path from "node:path";
 import os from "node:os";
 import { existsSync } from "node:fs";
 import { spawn, spawnSync } from "node:child_process";
-import { setTimeout as delay } from "node:timers/promises";
-import { deriveLlamacppLaunchProfile } from "./llamacpp-runtime-profile.js";
-import { createLlamacppManagedRuntimeRegistry } from "./llamacpp-managed-runtime.js";
-import { listListeningPids as listListeningPidsForPort } from "./port-reclaim.js";
-import { stopProcessByPid as stopProcessByPidForRuntime } from "./instance-state.js";
 
 export const LLAMACPP_DEFAULT_HOST = "127.0.0.1";
 export const LLAMACPP_DEFAULT_PORT = 39391;

@@ -21,8 +16,8 @@ const COMMON_SOURCE_BUILD_PATHS = Object.freeze([
   "src/llama-cpp-turboquant/build/bin/llama-server",
   "src/llama.cpp-turboquant/build/bin/llama-server"
 ]);
-
-let
+
+let managedLlamacppRuntime = null;
 
 function isPlainObject(value) {
   return Boolean(value) && typeof value === "object" && !Array.isArray(value);

@@ -44,34 +39,6 @@ function normalizePathEntries(entries) {
     : [];
 }
 
-function buildRuntimeProfileHash({ command, host, port, args = [] } = {}) {
-  const normalizedArgs = Array.isArray(args) ? args.filter(Boolean) : [];
-  return `${normalizeString(command)}|${normalizeString(host)}|${String(normalizePort(port, LLAMACPP_DEFAULT_PORT))}|${normalizedArgs.join("\u001f")}`;
-}
-
-function isManagedRuntimeAlive(instance) {
-  const child = instance?.child;
-  if (!child) return false;
-  return child.exitCode === null && child.killed !== true;
-}
-
-function normalizeListeningPidResult(result) {
-  if (result && typeof result === "object" && result.ok === false) {
-    return { ok: false, pids: [] };
-  }
-  if (Array.isArray(result)) {
-    return result
-      .map((value) => Number(value))
-      .filter((pid) => Number.isInteger(pid) && pid > 0);
-  }
-  if (result && typeof result === "object" && Array.isArray(result.pids)) {
-    return result.pids
-      .map((value) => Number(value))
-      .filter((pid) => Number.isInteger(pid) && pid > 0);
-  }
-  return [];
-}
-
 function readConfiguredLlamacppRuntime(config) {
   const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
   if (!isPlainObject(runtime)) {

@@ -105,8 +72,6 @@ function buildPreloadModels(config) {
     if (!modelPath) continue;
     preloadModels.push({
       variantId: normalizeString(variant.id),
-      variant,
-      baseModel,
       modelPath,
       contextWindow: Number.isFinite(Number(variant.contextWindow)) ? Number(variant.contextWindow) : undefined
     });

@@ -114,15 +79,6 @@
   return preloadModels;
 }
 
-function detectLlamacppSystemProfile(system = {}) {
-  const totalMemoryBytes = Number(system?.totalMemoryBytes);
-  return {
-    platform: normalizeString(system?.platform) || process.platform,
-    unifiedMemory: system?.unifiedMemory === true || process.platform === "darwin",
-    totalMemoryBytes: Number.isFinite(totalMemoryBytes) && totalMemoryBytes > 0 ? totalMemoryBytes : os.totalmem()
-  };
-}
-
 export function detectLlamacppCandidates({
   envPathEntries = process.env.PATH?.split(path.delimiter) || [],
   homeDir = os.homedir(),

@@ -166,18 +122,16 @@ export function buildLlamacppLaunchArgs({
   command,
   host = LLAMACPP_DEFAULT_HOST,
   port = LLAMACPP_DEFAULT_PORT,
-  preloadModels = [],
-  launchProfile = null
+  preloadModels = []
 } = {}) {
   const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
   const args = [
     normalizeString(command),
     "--host", normalizeString(host) || LLAMACPP_DEFAULT_HOST,
-    "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT)),
-    ...((Array.isArray(launchProfile?.args) ? launchProfile.args : []).filter(Boolean))
+    "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT))
   ];
 
-  if (
+  if (firstModel?.modelPath) {
     args.push("-m", firstModel.modelPath);
     if (Number.isFinite(Number(firstModel.contextWindow)) && Number(firstModel.contextWindow) > 0) {
       args.push("-c", String(Math.floor(Number(firstModel.contextWindow))));

@@ -187,31 +141,6 @@ export function buildLlamacppLaunchArgs({
   return args.filter(Boolean);
 }
 
-export async function spawnManagedLlamacppRuntime({
-  command,
-  host = LLAMACPP_DEFAULT_HOST,
-  port = LLAMACPP_DEFAULT_PORT,
-  launchProfile
-} = {}, {
-  spawnImpl = spawn
-} = {}) {
-  const args = buildLlamacppLaunchArgs({
-    command,
-    host,
-    port,
-    launchProfile
-  });
-  const child = spawnImpl(args[0], args.slice(1), { stdio: "ignore" });
-  return {
-    pid: child?.pid,
-    child,
-    host,
-    port,
-    baseUrl: `http://${host}:${port}/v1`,
-    args
-  };
-}
-
 export function parseLlamacppValidationOutput(output = "") {
   const text = String(output || "").trim();
   const lowered = text.toLowerCase();

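After this change, `buildLlamacppLaunchArgs` produces a flat argv with no per-profile extras: the command, host/port (defaulting to the exported constants), and at most one preloaded model with an optional context window. A quick sketch of the output, using example values for the model path and context size:

    buildLlamacppLaunchArgs({
      command: "llama-server",
      preloadModels: [{ modelPath: "/models/qwen3-8b-q4.gguf", contextWindow: 8192 }]
    });
    // -> ["llama-server",
    //     "--host", "127.0.0.1",
    //     "--port", "39391",
    //     "-m", "/models/qwen3-8b-q4.gguf",
    //     "-c", "8192"]
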
@@ -270,142 +199,78 @@ async function startConfiguredRuntime(config, {
   requireAutostart = true
 } = {}, {
   spawnSyncImpl = spawnSync,
-  spawnImpl = spawn,
-  system = undefined,
-  listListeningPids = undefined,
-  stopProcessByPid = undefined
+  spawnImpl = spawn
 } = {}) {
-
-
-
-
-    return { ok: true, skipped: true, reason: "autostart-disabled" };
-  }
+  const runtime = readConfiguredLlamacppRuntime(config);
+  if (requireAutostart && !runtime.startWithRouter) {
+    return { ok: true, skipped: true, reason: "autostart-disabled" };
+  }
 
-
-
-
-
-
+  if (!runtime.command) {
+    const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
+    error(errorMessage);
+    return { ok: false, errorMessage };
+  }
 
-
-
-
-
-
-
-
-
-    : null;
-  const args = buildLlamacppLaunchArgs({
-    command: runtime.command,
-    host: runtime.host,
-    port: runtime.port,
-    preloadModels,
-    launchProfile
-  });
-  const variantKey = normalizeString(firstModel?.variant?.key || firstModel?.variantId) || "default";
-  const profileHash = buildRuntimeProfileHash({
-    command: runtime.command,
-    host: runtime.host,
-    port: runtime.port,
-    args: args.slice(1)
-  });
-  const listListeningPidsFn = typeof listListeningPids === "function"
-    ? listListeningPids
-    : (port) => listListeningPidsForPort(port, { spawnSync: spawnSyncImpl });
-  const stopProcessByPidFn = typeof stopProcessByPid === "function"
-    ? stopProcessByPid
-    : (pid) => stopProcessByPidForRuntime(pid);
-  await managedLlamacppRuntimeRegistry.reconcile({
-    listListeningPids: async (port) => normalizeListeningPidResult(await listListeningPidsFn(port)),
-    stopProcessByPid: async (pid) => stopProcessByPidFn(pid)
-  });
+  if (managedLlamacppRuntime
+    && managedLlamacppRuntime.command === runtime.command
+    && managedLlamacppRuntime.host === runtime.host
+    && managedLlamacppRuntime.port === runtime.port
+    && managedLlamacppRuntime.child?.exitCode === null
+    && managedLlamacppRuntime.child?.killed !== true) {
+    return { ok: true, alreadyRunning: true, runtime: managedLlamacppRuntime };
+  }
 
-
-
-
-
-
-      && isManagedRuntimeAlive(instance)
-    ));
-  if (existing) {
-    return { ok: true, alreadyRunning: true, runtime: existing };
-  }
+  const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
+  if (!validation.ok) {
+    error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
+    return validation;
+  }
 
-
-
-
-
-
+  const preloadModels = buildPreloadModels(config);
+  const args = buildLlamacppLaunchArgs({
+    command: runtime.command,
+    host: runtime.host,
+    port: runtime.port,
+    preloadModels
+  });
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    });
-  }
-
-  const settleResolve = (value) => {
-    if (settled) return;
-    settled = true;
-    resolve(value);
-  };
-  const settleReject = (reason) => {
-    if (settled) return;
-    settled = true;
-    reject(reason);
-  };
-
-  child.once("spawn", () => {
-    if (typeof child.unref === "function") child.unref();
-    settleResolve({
-      pid: child?.pid,
-      child,
-      command: runtime.command,
-      host: runtime.host,
-      port,
-      args: allocatedArgs,
-      baseUrl: `http://${runtime.host}:${port}/v1`
-    });
-  });
-  child.once("error", (spawnError) => {
-    settleReject(spawnError);
-  });
-  }),
-  waitForHealthy: async (instance) => instance
+  return new Promise((resolve) => {
+    let settled = false;
+    const child = spawnImpl(args[0], args.slice(1), {
+      stdio: "ignore"
+    });
+
+    const finish = (result) => {
+      if (settled) return;
+      settled = true;
+      resolve(result);
+    };
+
+    child.once("spawn", () => {
+      managedLlamacppRuntime = {
+        child,
+        command: runtime.command,
+        host: runtime.host,
+        port: runtime.port,
+        args
+      };
+      child.once("exit", () => {
+        if (managedLlamacppRuntime?.child === child) {
+          managedLlamacppRuntime = null;
+        }
       });
+      if (typeof child.unref === "function") child.unref();
+      line(`Started llama.cpp runtime on http://${runtime.host}:${runtime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
+      finish({ ok: true, runtime: managedLlamacppRuntime, validation });
+    });
 
-
-    return { ok: true, runtime: managedRuntime, validation };
-  } catch (spawnError) {
+    child.once("error", (spawnError) => {
       const errorMessage = spawnError instanceof Error ? spawnError.message : String(spawnError);
       error(`Failed starting llama.cpp runtime: ${errorMessage}`);
-
-    }
-  }
-  inFlightConfiguredStartCount = Math.max(0, inFlightConfiguredStartCount - 1);
-}
+      finish({ ok: false, errorMessage });
+    });
+  });
 }
 
 export async function ensureConfiguredLlamacppRuntimeStarted(config, callbacks = {}, deps = {}) {

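The rewritten `startConfiguredRuntime` reads a single runtime block from config metadata and keeps one module-level instance, so a repeat start with the same command/host/port short-circuits. A sketch of the config shape it reads (key path and `startWithRouter` per `readConfiguredLlamacppRuntime` and the autostart check above; the command path is an example value):

    const config = {
      metadata: {
        localModels: {
          runtime: {
            llamacpp: {
              command: "/usr/local/bin/llama-server", // example path
              host: "127.0.0.1",
              port: 39391,
              startWithRouter: true // gates autostart unless requireAutostart is false
            }
          }
        }
      }
    };

    // First call spawns the server; while the child is alive, an identical
    // second call returns { ok: true, alreadyRunning: true, runtime }.
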
@@ -422,66 +287,23 @@ export async function startConfiguredLlamacppRuntime(config, callbacks = {}, dep
   }, deps);
 }
 
-export function getManagedLlamacppRuntimeSnapshot() {
-  return managedLlamacppRuntimeRegistry.snapshot().map((instance) => {
-    const { child: _child, ...rest } = instance || {};
-    return JSON.parse(JSON.stringify(rest));
-  });
-}
-
 export async function stopManagedLlamacppRuntime({
   line = () => {},
   error = () => {}
 } = {}) {
-
-
-  }
-  if (typeof managedLlamacppRuntimeRegistry.waitForInFlightStarts === "function") {
-    await managedLlamacppRuntimeRegistry.waitForInFlightStarts();
-  }
-  const instances = managedLlamacppRuntimeRegistry.snapshot();
-  if (instances.length === 0) {
+  const active = managedLlamacppRuntime;
+  if (!active?.child) {
     return { ok: true, skipped: true, reason: "not-running" };
   }
 
-
-
-
-
-
-
-
-
-
-    }
-  }
-      if (!isManagedRuntimeAlive(instance)) {
-        await managedLlamacppRuntimeRegistry.untrackInstance(instance?.instanceId);
-      } else {
-        pendingExitCount += 1;
-      }
-    } catch (stopError) {
-      const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
-      failures.push(errorMessage);
-      error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
-    }
-  }
-
-  if (stoppedCount > 0) {
-    if (pendingExitCount === 0) {
-      line(stoppedCount === 1 ? "Stopped managed llama.cpp runtime." : `Stopped ${stoppedCount} managed llama.cpp runtimes.`);
-    } else {
-      line(stoppedCount === 1
-        ? "Stop signal sent to managed llama.cpp runtime; waiting for exit."
-        : `Stop signal sent to ${stoppedCount} managed llama.cpp runtimes; waiting for exits.`);
-    }
+  managedLlamacppRuntime = null;
+  try {
+    active.child.kill("SIGTERM");
+    line("Stopped managed llama.cpp runtime.");
+    return { ok: true };
+  } catch (stopError) {
+    const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
+    error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
+    return { ok: false, errorMessage };
   }
-
-  const completed = failures.length === 0 && pendingExitCount === 0;
-  return {
-    ok: completed,
-    stoppedCount,
-    pendingExitCount,
-    ...(failures.length > 0 ? { errorMessage: failures.join("; ") } : {})
-  };
 }

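The stop path is now symmetric with the single tracked instance: it clears the module-level reference first, then signals the child. A usage sketch of the result shapes visible in the hunk above:

    const result = await stopManagedLlamacppRuntime({
      line: (msg) => console.log(msg),
      error: (msg) => console.error(msg)
    });
    // -> { ok: true } after SIGTERM is sent,
    // -> { ok: true, skipped: true, reason: "not-running" } when nothing is tracked,
    // -> { ok: false, errorMessage } if kill() throws.
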
package/src/node/local-models-service.js
CHANGED

@@ -212,42 +212,19 @@ export async function saveLocalModelVariant(config, draft, {
     activeVariants,
     totalMemoryBytes: system.totalMemoryBytes
   });
-
+  if (!decision.allowed) {
     throw new Error(decision.reason);
   }
 }
 
-const normalizedMetadata = normalizeLocalModelsMetadata({
-  variants: {
-    draft: normalizedDraft
-  }
-});
-const normalizedVariantDraft = Object.values(normalizedMetadata.variants)[0] || {};
-const previousVariant = isPlainObject(next.metadata.localModels.variants[key])
-  ? next.metadata.localModels.variants[key]
-  : {};
-
 next.metadata.localModels.variants[key] = {
-  ...
+  ...(isPlainObject(next.metadata.localModels.variants[key]) ? next.metadata.localModels.variants[key] : {}),
   key,
   baseModelId,
   id: modelId,
   name,
   runtime,
   preset: normalizeString(normalizedDraft.preset),
-  runtimeProfile: runtime === "llamacpp"
-    ? normalizedVariantDraft.runtimeProfile
-    : undefined,
-  runtimeStatus: runtime === "llamacpp"
-    ? (isPlainObject(previousVariant.runtimeStatus)
-      ? previousVariant.runtimeStatus
-      : {
-        activeInstanceId: "",
-        lastFailure: null,
-        lastStartedAt: "",
-        lastHealthyAt: ""
-      })
-    : undefined,
   enabled: normalizedDraft.enabled === true,
   preload: normalizedDraft.preload === true,
   contextWindow: Number.isFinite(Number(normalizedDraft.contextWindow)) ? Number(normalizedDraft.contextWindow) : undefined,