@khanglvm/llm-router 2.6.0 → 2.6.1

package/CHANGELOG.md CHANGED
@@ -7,19 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
  ## [Unreleased]
 
- ## [2.6.0] - 2026-04-23
-
- ### Added
- - Local `llama.cpp` variants can now persist a per-model runtime profile, including auto-tuned presets and custom launch overrides, so each GGUF variant can run with settings that match its own size and context shape instead of sharing one global `llama-server` startup profile.
- - The Web UI now exposes managed `llama.cpp` runtime health for Local Models, including tracked instance counts, healthy/stale summaries, and persisted runtime-profile data for each saved variant.
-
- ### Changed
- - Local variant requests are now resolved through a managed per-variant `llama.cpp` runtime layer that can reuse compatible instances, allocate fallback ports safely, and start the right runtime configuration for the specific model variant without exposing multi-process lifecycle management to the user.
- - Hugging Face GGUF search/download flows now surface file size plus estimated runtime memory guidance directly in the Local Models workflow, making it easier to choose a viable quantization before download.
-
- ### Fixed
- - Managed `llama.cpp` runtimes now reconcile stale tracked instances before reuse, avoid reserving dead immediate-exit servers, and drain pending shutdown/startup edges more reliably so local per-model routing does not leave behind stale `llama-server` processes.
-
  ## [2.5.2] - 2026-04-23
 
  ### Fixed
package/README.md CHANGED
@@ -44,9 +44,6 @@ Open `llr` and use the **Local Models** tab to manage local inference sources al
  - **Native macOS browsing** — use the built-in file picker to choose a single GGUF file, scan a folder recursively for GGUF models, or browse directly to a local `llama-server` binary
  - **Managed + attached model library** — stale or moved files stay visible instead of crashing the app, and can be repaired by locating the file again or removed cleanly
  - **Router-visible local variants** — create friendly model variants with bounded presets, context-window metadata, preload toggles, and Mac unified-memory fit guidance with clearer safe/tight recommendations
- - **Per-variant llama.cpp tuning** — each local variant can store its own runtime profile so balanced, throughput, long-context, low-memory, or custom launch overrides do not fight over one shared global `llama-server` config
- - **Managed per-model runtimes** — the router automatically starts, reuses, and stops the right `llama.cpp` instance for the requested local variant, with stale-runtime cleanup handled internally instead of asking the user to manage separate servers
- - **GGUF size + memory guidance** — Hugging Face search results now show model file size plus estimated runtime memory fit guidance before download, helping choose viable quantizations faster
  - **Alias-ready local routing** — once saved, local variants behave like normal router models and can be used in aliases, capability flags, and fallback chains
 
  For v1, the managed download flow only searches public Hugging Face GGUF files and the fit guidance is tuned for Macs with unified memory.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@khanglvm/llm-router",
- "version": "2.6.0",
+ "version": "2.6.1",
  "description": "LLM Router: single gateway endpoint for multi-provider LLMs with unified OpenAI+Anthropic format and seamless fallback",
  "keywords": [
  "llm-router",
@@ -17,6 +17,7 @@ import {
  normalizeFactoryDroidReasoningEffort,
  resolveFactoryDroidRouterModelRef
  } from "../shared/coding-tool-bindings.js";
+ import { LOCAL_RUNTIME_PROVIDER_TYPE } from "../runtime/local-models.js";
 
  const BACKUP_SUFFIX = ".llm_router_backup";
  const CODEX_PROVIDER_ID = "llm-router";
@@ -972,9 +973,11 @@ export async function patchClaudeCodeEffortLevel({
  const FACTORY_DROID_ROUTER_MARKER = "_llmRouterManaged";
  const FACTORY_DROID_OPENAI_PROVIDER = "openai";
  const FACTORY_DROID_ANTHROPIC_PROVIDER = "anthropic";
+ const FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER = "generic-chat-completion-api";
  const FACTORY_DROID_ROUTER_PROVIDERS = Object.freeze([
  FACTORY_DROID_OPENAI_PROVIDER,
- FACTORY_DROID_ANTHROPIC_PROVIDER
+ FACTORY_DROID_ANTHROPIC_PROVIDER,
+ FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER
  ]);
 
  function dedupeStrings(values = []) {
@@ -1116,6 +1119,17 @@ function resolveFactoryDroidRouteFormat(modelRef, config = {}, seen = new Set())
  }
 
  function resolveFactoryDroidCustomModelProvider(modelRef, config = {}) {
+ const normalizedModelRef = String(modelRef || "").trim();
+ if (normalizedModelRef.includes("/")) {
+ const separatorIndex = normalizedModelRef.indexOf("/");
+ const providerId = normalizedModelRef.slice(0, separatorIndex).trim();
+ const provider = (Array.isArray(config?.providers) ? config.providers : [])
+ .find((entry) => String(entry?.id || "").trim() === providerId);
+ if (String(provider?.type || "").trim().toLowerCase() === LOCAL_RUNTIME_PROVIDER_TYPE) {
+ return FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER;
+ }
+ }
+
  return mapFactoryDroidFormatToProvider(resolveFactoryDroidRouteFormat(modelRef, config))
  || FACTORY_DROID_OPENAI_PROVIDER;
  }
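
The branch added above gives Factory Droid a third routing target: any `providerId/model` reference whose provider entry is a local runtime now resolves to the generic chat-completions provider. A minimal standalone sketch of that decision, using an assumed placeholder value for LOCAL_RUNTIME_PROVIDER_TYPE (the real constant comes from ../runtime/local-models.js and its value is not shown in this diff); the real function also tries mapFactoryDroidFormatToProvider before defaulting, which the sketch collapses to a plain fallback:

// Sketch only: mirrors the routing rule above outside the patcher module.
// "local-llamacpp" is an assumed placeholder for LOCAL_RUNTIME_PROVIDER_TYPE.
const LOCAL_RUNTIME_PROVIDER_TYPE = "local-llamacpp";

function resolveProviderForModelRef(modelRef, config = {}) {
  const ref = String(modelRef || "").trim();
  const separatorIndex = ref.indexOf("/");
  if (separatorIndex > -1) {
    const providerId = ref.slice(0, separatorIndex).trim();
    const provider = (Array.isArray(config?.providers) ? config.providers : [])
      .find((entry) => String(entry?.id || "").trim() === providerId);
    if (String(provider?.type || "").trim().toLowerCase() === LOCAL_RUNTIME_PROVIDER_TYPE) {
      return "generic-chat-completion-api"; // local variants speak plain chat completions
    }
  }
  return "openai"; // stands in for the format-based fallback in the real code
}

console.log(resolveProviderForModelRef("local/qwen3-8b", {
  providers: [{ id: "local", type: "local-llamacpp" }]
})); // => "generic-chat-completion-api"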
@@ -1,6 +1,5 @@
  import path from "node:path";
  import { promises as fs } from "node:fs";
- import { estimateLlamacppRuntimeBytes } from "./llamacpp-runtime-profile.js";
 
  const HUGGING_FACE_API_URL = "https://huggingface.co/api/models";
  const HUGGING_FACE_BASE_URL = "https://huggingface.co";
@@ -155,13 +154,6 @@ export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
  expectedContextWindow: systemInfo?.expectedContextWindow
  }, systemInfo);
  const quantization = parseQuantizationFromFileName(file);
- const estimatedRuntimeBytes = sizeBytes
- ? estimateLlamacppRuntimeBytes({
- sizeBytes,
- contextWindow: systemInfo?.expectedContextWindow,
- preset: status.fit === "tight" ? "memory-safe" : "balanced"
- })
- : undefined;
  const fitScore = status.fit === "safe" ? 30 : status.fit === "tight" ? 15 : status.fit === "unknown" ? 8 : -20;
  const rankingScore = fitScore
  + (status.disabled ? -100 : 0)
@@ -174,10 +166,6 @@ export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
  file,
  quantization,
  sizeBytes,
- estimatedRuntimeBytes,
- memoryLabel: estimatedRuntimeBytes
- ? `${(estimatedRuntimeBytes / (1024 ** 3)).toFixed(1)} GB runtime est.`
- : "Runtime estimate unavailable",
  disabled: status.disabled,
  disabledReason: status.reason,
  fit: status.fit,
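
With the runtime-byte estimate removed, ranking in shapeHuggingFaceGgufResults leans on the fit status for the terms visible in this hunk. A small sketch of those two scoring terms, taken directly from the surviving lines above (the full rankingScore adds further terms beyond this excerpt):

// Sketch of the two ranking terms shown above; later terms fall outside this hunk.
function fitScoreFor(status) {
  return status.fit === "safe" ? 30
    : status.fit === "tight" ? 15
    : status.fit === "unknown" ? 8
    : -20;
}

const partialRankingScore = (status) => fitScoreFor(status) + (status.disabled ? -100 : 0);

console.log(partialRankingScore({ fit: "safe", disabled: false })); // 30
console.log(partialRankingScore({ fit: "tight", disabled: true })); // -85: disabled files take a flat -100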
@@ -2,11 +2,6 @@ import path from "node:path";
  import os from "node:os";
  import { existsSync } from "node:fs";
  import { spawn, spawnSync } from "node:child_process";
- import { setTimeout as delay } from "node:timers/promises";
- import { deriveLlamacppLaunchProfile } from "./llamacpp-runtime-profile.js";
- import { createLlamacppManagedRuntimeRegistry } from "./llamacpp-managed-runtime.js";
- import { listListeningPids as listListeningPidsForPort } from "./port-reclaim.js";
- import { stopProcessByPid as stopProcessByPidForRuntime } from "./instance-state.js";
 
  export const LLAMACPP_DEFAULT_HOST = "127.0.0.1";
  export const LLAMACPP_DEFAULT_PORT = 39391;
@@ -21,8 +16,8 @@ const COMMON_SOURCE_BUILD_PATHS = Object.freeze([
  "src/llama-cpp-turboquant/build/bin/llama-server",
  "src/llama.cpp-turboquant/build/bin/llama-server"
  ]);
- const managedLlamacppRuntimeRegistry = createLlamacppManagedRuntimeRegistry();
- let inFlightConfiguredStartCount = 0;
+
+ let managedLlamacppRuntime = null;
 
  function isPlainObject(value) {
  return Boolean(value) && typeof value === "object" && !Array.isArray(value);
@@ -44,34 +39,6 @@ function normalizePathEntries(entries) {
  : [];
  }
 
- function buildRuntimeProfileHash({ command, host, port, args = [] } = {}) {
- const normalizedArgs = Array.isArray(args) ? args.filter(Boolean) : [];
- return `${normalizeString(command)}|${normalizeString(host)}|${String(normalizePort(port, LLAMACPP_DEFAULT_PORT))}|${normalizedArgs.join("\u001f")}`;
- }
-
- function isManagedRuntimeAlive(instance) {
- const child = instance?.child;
- if (!child) return false;
- return child.exitCode === null && child.killed !== true;
- }
-
- function normalizeListeningPidResult(result) {
- if (result && typeof result === "object" && result.ok === false) {
- return { ok: false, pids: [] };
- }
- if (Array.isArray(result)) {
- return result
- .map((value) => Number(value))
- .filter((pid) => Number.isInteger(pid) && pid > 0);
- }
- if (result && typeof result === "object" && Array.isArray(result.pids)) {
- return result.pids
- .map((value) => Number(value))
- .filter((pid) => Number.isInteger(pid) && pid > 0);
- }
- return [];
- }
-
  function readConfiguredLlamacppRuntime(config) {
  const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
  if (!isPlainObject(runtime)) {
@@ -105,8 +72,6 @@ function buildPreloadModels(config) {
  if (!modelPath) continue;
  preloadModels.push({
  variantId: normalizeString(variant.id),
- variant,
- baseModel,
  modelPath,
  contextWindow: Number.isFinite(Number(variant.contextWindow)) ? Number(variant.contextWindow) : undefined
  });
@@ -114,15 +79,6 @@
  return preloadModels;
  }
 
- function detectLlamacppSystemProfile(system = {}) {
- const totalMemoryBytes = Number(system?.totalMemoryBytes);
- return {
- platform: normalizeString(system?.platform) || process.platform,
- unifiedMemory: system?.unifiedMemory === true || process.platform === "darwin",
- totalMemoryBytes: Number.isFinite(totalMemoryBytes) && totalMemoryBytes > 0 ? totalMemoryBytes : os.totalmem()
- };
- }
-
  export function detectLlamacppCandidates({
  envPathEntries = process.env.PATH?.split(path.delimiter) || [],
  homeDir = os.homedir(),
@@ -166,18 +122,16 @@ export function buildLlamacppLaunchArgs({
  command,
  host = LLAMACPP_DEFAULT_HOST,
  port = LLAMACPP_DEFAULT_PORT,
- preloadModels = [],
- launchProfile = null
+ preloadModels = []
  } = {}) {
  const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
  const args = [
  normalizeString(command),
  "--host", normalizeString(host) || LLAMACPP_DEFAULT_HOST,
- "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT)),
- ...((Array.isArray(launchProfile?.args) ? launchProfile.args : []).filter(Boolean))
+ "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT))
  ];
 
- if (!launchProfile && firstModel?.modelPath) {
+ if (firstModel?.modelPath) {
  args.push("-m", firstModel.modelPath);
  if (Number.isFinite(Number(firstModel.contextWindow)) && Number(firstModel.contextWindow) > 0) {
  args.push("-c", String(Math.floor(Number(firstModel.contextWindow))));
@@ -187,31 +141,6 @@
  return args.filter(Boolean);
  }
 
- export async function spawnManagedLlamacppRuntime({
- command,
- host = LLAMACPP_DEFAULT_HOST,
- port = LLAMACPP_DEFAULT_PORT,
- launchProfile
- } = {}, {
- spawnImpl = spawn
- } = {}) {
- const args = buildLlamacppLaunchArgs({
- command,
- host,
- port,
- launchProfile
- });
- const child = spawnImpl(args[0], args.slice(1), { stdio: "ignore" });
- return {
- pid: child?.pid,
- child,
- host,
- port,
- baseUrl: `http://${host}:${port}/v1`,
- args
- };
- }
-
  export function parseLlamacppValidationOutput(output = "") {
  const text = String(output || "").trim();
  const lowered = text.toLowerCase();
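
After this simplification, buildLlamacppLaunchArgs derives its argument vector purely from the configured command/host/port and the first preload model. A sketch of the expected output, derived from the function body above; the model path and context window are illustrative assumptions:

// Assumes buildLlamacppLaunchArgs is imported from the runtime module shown above.
console.log(buildLlamacppLaunchArgs({
  command: "llama-server",
  host: "127.0.0.1",
  port: 39391,
  preloadModels: [{ modelPath: "/models/example-q4_k_m.gguf", contextWindow: 8192 }]
}));
// => ["llama-server", "--host", "127.0.0.1", "--port", "39391",
//     "-m", "/models/example-q4_k_m.gguf", "-c", "8192"]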
@@ -270,142 +199,78 @@ async function startConfiguredRuntime(config, {
  requireAutostart = true
  } = {}, {
  spawnSyncImpl = spawnSync,
- spawnImpl = spawn,
- system = undefined,
- listListeningPids = undefined,
- stopProcessByPid = undefined
+ spawnImpl = spawn
  } = {}) {
- inFlightConfiguredStartCount += 1;
- try {
- const runtime = readConfiguredLlamacppRuntime(config);
- if (requireAutostart && !runtime.startWithRouter) {
- return { ok: true, skipped: true, reason: "autostart-disabled" };
- }
+ const runtime = readConfiguredLlamacppRuntime(config);
+ if (requireAutostart && !runtime.startWithRouter) {
+ return { ok: true, skipped: true, reason: "autostart-disabled" };
+ }
 
- if (!runtime.command) {
- const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
- error(errorMessage);
- return { ok: false, errorMessage };
- }
+ if (!runtime.command) {
+ const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
+ error(errorMessage);
+ return { ok: false, errorMessage };
+ }
 
- const preloadModels = buildPreloadModels(config);
- const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
- const launchProfile = firstModel?.variant && firstModel?.baseModel
- ? deriveLlamacppLaunchProfile({
- variant: firstModel.variant,
- baseModel: firstModel.baseModel,
- system: detectLlamacppSystemProfile(system)
- })
- : null;
- const args = buildLlamacppLaunchArgs({
- command: runtime.command,
- host: runtime.host,
- port: runtime.port,
- preloadModels,
- launchProfile
- });
- const variantKey = normalizeString(firstModel?.variant?.key || firstModel?.variantId) || "default";
- const profileHash = buildRuntimeProfileHash({
- command: runtime.command,
- host: runtime.host,
- port: runtime.port,
- args: args.slice(1)
- });
- const listListeningPidsFn = typeof listListeningPids === "function"
- ? listListeningPids
- : (port) => listListeningPidsForPort(port, { spawnSync: spawnSyncImpl });
- const stopProcessByPidFn = typeof stopProcessByPid === "function"
- ? stopProcessByPid
- : (pid) => stopProcessByPidForRuntime(pid);
- await managedLlamacppRuntimeRegistry.reconcile({
- listListeningPids: async (port) => normalizeListeningPidResult(await listListeningPidsFn(port)),
- stopProcessByPid: async (pid) => stopProcessByPidFn(pid)
- });
+ if (managedLlamacppRuntime
+ && managedLlamacppRuntime.command === runtime.command
+ && managedLlamacppRuntime.host === runtime.host
+ && managedLlamacppRuntime.port === runtime.port
+ && managedLlamacppRuntime.child?.exitCode === null
+ && managedLlamacppRuntime.child?.killed !== true) {
+ return { ok: true, alreadyRunning: true, runtime: managedLlamacppRuntime };
+ }
 
- const existing = managedLlamacppRuntimeRegistry
- .snapshot()
- .find((instance) => (
- instance.variantKey === variantKey
- && instance.profileHash === profileHash
- && isManagedRuntimeAlive(instance)
- ));
- if (existing) {
- return { ok: true, alreadyRunning: true, runtime: existing };
- }
+ const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
+ if (!validation.ok) {
+ error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
+ return validation;
+ }
 
- const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
- if (!validation.ok) {
- error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
- return validation;
- }
+ const preloadModels = buildPreloadModels(config);
+ const args = buildLlamacppLaunchArgs({
+ command: runtime.command,
+ host: runtime.host,
+ port: runtime.port,
+ preloadModels
+ });
 
- try {
- const managedRuntime = await managedLlamacppRuntimeRegistry.ensureRuntimeForVariant({
- variantKey,
- profileHash,
- launchArgs: args.slice(1),
- preferredPort: runtime.port
- }, {
- spawnRuntime: async ({ port }) => new Promise((resolve, reject) => {
- let settled = false;
- const allocatedArgs = buildLlamacppLaunchArgs({
- command: runtime.command,
- host: runtime.host,
- port,
- preloadModels,
- launchProfile
- });
- const child = spawnImpl(allocatedArgs[0], allocatedArgs.slice(1), {
- stdio: "ignore"
- });
- const expectedInstanceId = `${variantKey}:${profileHash}:${port}`;
- if (child && child.__llamacppManagedExitHookAttached !== true) {
- child.__llamacppManagedExitHookAttached = true;
- child.once("exit", () => {
- void managedLlamacppRuntimeRegistry.untrackInstance(expectedInstanceId);
- });
- }
-
- const settleResolve = (value) => {
- if (settled) return;
- settled = true;
- resolve(value);
- };
- const settleReject = (reason) => {
- if (settled) return;
- settled = true;
- reject(reason);
- };
-
- child.once("spawn", () => {
- if (typeof child.unref === "function") child.unref();
- settleResolve({
- pid: child?.pid,
- child,
- command: runtime.command,
- host: runtime.host,
- port,
- args: allocatedArgs,
- baseUrl: `http://${runtime.host}:${port}/v1`
- });
- });
- child.once("error", (spawnError) => {
- settleReject(spawnError);
- });
- }),
- waitForHealthy: async (instance) => instance
+ return new Promise((resolve) => {
+ let settled = false;
+ const child = spawnImpl(args[0], args.slice(1), {
+ stdio: "ignore"
+ });
+
+ const finish = (result) => {
+ if (settled) return;
+ settled = true;
+ resolve(result);
+ };
+
+ child.once("spawn", () => {
+ managedLlamacppRuntime = {
+ child,
+ command: runtime.command,
+ host: runtime.host,
+ port: runtime.port,
+ args
+ };
+ child.once("exit", () => {
+ if (managedLlamacppRuntime?.child === child) {
+ managedLlamacppRuntime = null;
+ }
  });
+ if (typeof child.unref === "function") child.unref();
+ line(`Started llama.cpp runtime on http://${runtime.host}:${runtime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
+ finish({ ok: true, runtime: managedLlamacppRuntime, validation });
+ });
 
- line(`Started llama.cpp runtime on http://${managedRuntime.host}:${managedRuntime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
- return { ok: true, runtime: managedRuntime, validation };
- } catch (spawnError) {
+ child.once("error", (spawnError) => {
  const errorMessage = spawnError instanceof Error ? spawnError.message : String(spawnError);
  error(`Failed starting llama.cpp runtime: ${errorMessage}`);
- return { ok: false, errorMessage };
- }
- } finally {
- inFlightConfiguredStartCount = Math.max(0, inFlightConfiguredStartCount - 1);
- }
+ finish({ ok: false, errorMessage });
+ });
+ });
  }
 
  export async function ensureConfiguredLlamacppRuntimeStarted(config, callbacks = {}, deps = {}) {
@@ -422,66 +287,23 @@ export async function startConfiguredLlamacppRuntime(config, callbacks = {}, dep
  }, deps);
  }
 
- export function getManagedLlamacppRuntimeSnapshot() {
- return managedLlamacppRuntimeRegistry.snapshot().map((instance) => {
- const { child: _child, ...rest } = instance || {};
- return JSON.parse(JSON.stringify(rest));
- });
- }
-
  export async function stopManagedLlamacppRuntime({
  line = () => {},
  error = () => {}
  } = {}) {
- while (inFlightConfiguredStartCount > 0) {
- await delay(0);
- }
- if (typeof managedLlamacppRuntimeRegistry.waitForInFlightStarts === "function") {
- await managedLlamacppRuntimeRegistry.waitForInFlightStarts();
- }
- const instances = managedLlamacppRuntimeRegistry.snapshot();
- if (instances.length === 0) {
+ const active = managedLlamacppRuntime;
+ if (!active?.child) {
  return { ok: true, skipped: true, reason: "not-running" };
  }
 
- const failures = [];
- let stoppedCount = 0;
- let pendingExitCount = 0;
- for (const instance of instances) {
- try {
- if (instance?.owner === "llm-router" && typeof instance?.child?.kill === "function") {
- const killResult = instance.child.kill("SIGTERM");
- if (killResult !== false) {
- stoppedCount += 1;
- }
- }
- if (!isManagedRuntimeAlive(instance)) {
- await managedLlamacppRuntimeRegistry.untrackInstance(instance?.instanceId);
- } else {
- pendingExitCount += 1;
- }
- } catch (stopError) {
- const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
- failures.push(errorMessage);
- error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
- }
- }
-
- if (stoppedCount > 0) {
- if (pendingExitCount === 0) {
- line(stoppedCount === 1 ? "Stopped managed llama.cpp runtime." : `Stopped ${stoppedCount} managed llama.cpp runtimes.`);
- } else {
- line(stoppedCount === 1
- ? "Stop signal sent to managed llama.cpp runtime; waiting for exit."
- : `Stop signal sent to ${stoppedCount} managed llama.cpp runtimes; waiting for exits.`);
- }
+ managedLlamacppRuntime = null;
+ try {
+ active.child.kill("SIGTERM");
+ line("Stopped managed llama.cpp runtime.");
+ return { ok: true };
+ } catch (stopError) {
+ const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
+ error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
+ return { ok: false, errorMessage };
  }
-
- const completed = failures.length === 0 && pendingExitCount === 0;
- return {
- ok: completed,
- stoppedCount,
- pendingExitCount,
- ...(failures.length > 0 ? { errorMessage: failures.join("; ") } : {})
- };
  }
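
Taken together, the 2.6.1 start/stop path tracks exactly one child process. A usage sketch of the exported lifecycle helpers, assuming an import path for this module (the diff does not name the file) and a config shaped like what readConfiguredLlamacppRuntime reads above; illustrative, not the package's documented API:

// Import path is an assumption; the diff does not show this file's name.
import {
  ensureConfiguredLlamacppRuntimeStarted,
  stopManagedLlamacppRuntime
} from "./llamacpp-runtime.js";

// Config shape inferred from readConfiguredLlamacppRuntime:
// config.metadata.localModels.runtime.llamacpp.
const config = {
  metadata: {
    localModels: {
      runtime: {
        llamacpp: {
          command: "/usr/local/bin/llama-server",
          host: "127.0.0.1",
          port: 39391,
          startWithRouter: true
        }
      }
    }
  }
};

// Callbacks assumed to mirror the line/error hooks used inside startConfiguredRuntime.
const started = await ensureConfiguredLlamacppRuntimeStarted(config, {
  line: console.log,
  error: console.error
});
// started.ok, started.alreadyRunning, or started.skipped per the code above.

await stopManagedLlamacppRuntime({ line: console.log, error: console.error });
// Sends SIGTERM to the single tracked child; returns
// { ok: true, skipped: true, reason: "not-running" } when nothing is tracked.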
@@ -212,42 +212,19 @@ export async function saveLocalModelVariant(config, draft, {
  activeVariants,
  totalMemoryBytes: system.totalMemoryBytes
  });
- if (!decision.allowed) {
+ if (!decision.allowed) {
  throw new Error(decision.reason);
  }
  }
 
- const normalizedMetadata = normalizeLocalModelsMetadata({
- variants: {
- draft: normalizedDraft
- }
- });
- const normalizedVariantDraft = Object.values(normalizedMetadata.variants)[0] || {};
- const previousVariant = isPlainObject(next.metadata.localModels.variants[key])
- ? next.metadata.localModels.variants[key]
- : {};
-
  next.metadata.localModels.variants[key] = {
- ...previousVariant,
+ ...(isPlainObject(next.metadata.localModels.variants[key]) ? next.metadata.localModels.variants[key] : {}),
  key,
  baseModelId,
  id: modelId,
  name,
  runtime,
  preset: normalizeString(normalizedDraft.preset),
- runtimeProfile: runtime === "llamacpp"
- ? normalizedVariantDraft.runtimeProfile
- : undefined,
- runtimeStatus: runtime === "llamacpp"
- ? (isPlainObject(previousVariant.runtimeStatus)
- ? previousVariant.runtimeStatus
- : {
- activeInstanceId: "",
- lastFailure: null,
- lastStartedAt: "",
- lastHealthyAt: ""
- })
- : undefined,
  enabled: normalizedDraft.enabled === true,
  preload: normalizedDraft.preload === true,
  contextWindow: Number.isFinite(Number(normalizedDraft.contextWindow)) ? Number(normalizedDraft.contextWindow) : undefined,