@khanglvm/llm-router 2.5.2 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,11 @@ import path from "node:path";
2
2
  import os from "node:os";
3
3
  import { existsSync } from "node:fs";
4
4
  import { spawn, spawnSync } from "node:child_process";
5
+ import { setTimeout as delay } from "node:timers/promises";
6
+ import { deriveLlamacppLaunchProfile } from "./llamacpp-runtime-profile.js";
7
+ import { createLlamacppManagedRuntimeRegistry } from "./llamacpp-managed-runtime.js";
8
+ import { listListeningPids as listListeningPidsForPort } from "./port-reclaim.js";
9
+ import { stopProcessByPid as stopProcessByPidForRuntime } from "./instance-state.js";
5
10
 
6
11
  export const LLAMACPP_DEFAULT_HOST = "127.0.0.1";
7
12
  export const LLAMACPP_DEFAULT_PORT = 39391;
@@ -16,8 +21,8 @@ const COMMON_SOURCE_BUILD_PATHS = Object.freeze([
16
21
  "src/llama-cpp-turboquant/build/bin/llama-server",
17
22
  "src/llama.cpp-turboquant/build/bin/llama-server"
18
23
  ]);
19
-
20
- let managedLlamacppRuntime = null;
24
+ const managedLlamacppRuntimeRegistry = createLlamacppManagedRuntimeRegistry();
25
+ let inFlightConfiguredStartCount = 0;
21
26
 
22
27
  function isPlainObject(value) {
23
28
  return Boolean(value) && typeof value === "object" && !Array.isArray(value);
@@ -39,6 +44,34 @@ function normalizePathEntries(entries) {
39
44
  : [];
40
45
  }
41
46
 
47
+ function buildRuntimeProfileHash({ command, host, port, args = [] } = {}) {
48
+ const normalizedArgs = Array.isArray(args) ? args.filter(Boolean) : [];
49
+ return `${normalizeString(command)}|${normalizeString(host)}|${String(normalizePort(port, LLAMACPP_DEFAULT_PORT))}|${normalizedArgs.join("\u001f")}`;
50
+ }
51
+
52
+ function isManagedRuntimeAlive(instance) {
53
+ const child = instance?.child;
54
+ if (!child) return false;
55
+ return child.exitCode === null && child.killed !== true;
56
+ }
57
+
58
+ function normalizeListeningPidResult(result) {
59
+ if (result && typeof result === "object" && result.ok === false) {
60
+ return { ok: false, pids: [] };
61
+ }
62
+ if (Array.isArray(result)) {
63
+ return result
64
+ .map((value) => Number(value))
65
+ .filter((pid) => Number.isInteger(pid) && pid > 0);
66
+ }
67
+ if (result && typeof result === "object" && Array.isArray(result.pids)) {
68
+ return result.pids
69
+ .map((value) => Number(value))
70
+ .filter((pid) => Number.isInteger(pid) && pid > 0);
71
+ }
72
+ return [];
73
+ }
74
+
42
75
  function readConfiguredLlamacppRuntime(config) {
43
76
  const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
44
77
  if (!isPlainObject(runtime)) {
@@ -72,6 +105,8 @@ function buildPreloadModels(config) {
72
105
  if (!modelPath) continue;
73
106
  preloadModels.push({
74
107
  variantId: normalizeString(variant.id),
108
+ variant,
109
+ baseModel,
75
110
  modelPath,
76
111
  contextWindow: Number.isFinite(Number(variant.contextWindow)) ? Number(variant.contextWindow) : undefined
77
112
  });
@@ -79,6 +114,15 @@ function buildPreloadModels(config) {
79
114
  return preloadModels;
80
115
  }
81
116
 
117
+ function detectLlamacppSystemProfile(system = {}) {
118
+ const totalMemoryBytes = Number(system?.totalMemoryBytes);
119
+ return {
120
+ platform: normalizeString(system?.platform) || process.platform,
121
+ unifiedMemory: system?.unifiedMemory === true || process.platform === "darwin",
122
+ totalMemoryBytes: Number.isFinite(totalMemoryBytes) && totalMemoryBytes > 0 ? totalMemoryBytes : os.totalmem()
123
+ };
124
+ }
125
+
82
126
  export function detectLlamacppCandidates({
83
127
  envPathEntries = process.env.PATH?.split(path.delimiter) || [],
84
128
  homeDir = os.homedir(),
@@ -122,16 +166,18 @@ export function buildLlamacppLaunchArgs({
122
166
  command,
123
167
  host = LLAMACPP_DEFAULT_HOST,
124
168
  port = LLAMACPP_DEFAULT_PORT,
125
- preloadModels = []
169
+ preloadModels = [],
170
+ launchProfile = null
126
171
  } = {}) {
127
172
  const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
128
173
  const args = [
129
174
  normalizeString(command),
130
175
  "--host", normalizeString(host) || LLAMACPP_DEFAULT_HOST,
131
- "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT))
176
+ "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT)),
177
+ ...((Array.isArray(launchProfile?.args) ? launchProfile.args : []).filter(Boolean))
132
178
  ];
133
179
 
134
- if (firstModel?.modelPath) {
180
+ if (!launchProfile && firstModel?.modelPath) {
135
181
  args.push("-m", firstModel.modelPath);
136
182
  if (Number.isFinite(Number(firstModel.contextWindow)) && Number(firstModel.contextWindow) > 0) {
137
183
  args.push("-c", String(Math.floor(Number(firstModel.contextWindow))));
@@ -141,6 +187,31 @@ export function buildLlamacppLaunchArgs({
141
187
  return args.filter(Boolean);
142
188
  }
143
189
 
190
+ export async function spawnManagedLlamacppRuntime({
191
+ command,
192
+ host = LLAMACPP_DEFAULT_HOST,
193
+ port = LLAMACPP_DEFAULT_PORT,
194
+ launchProfile
195
+ } = {}, {
196
+ spawnImpl = spawn
197
+ } = {}) {
198
+ const args = buildLlamacppLaunchArgs({
199
+ command,
200
+ host,
201
+ port,
202
+ launchProfile
203
+ });
204
+ const child = spawnImpl(args[0], args.slice(1), { stdio: "ignore" });
205
+ return {
206
+ pid: child?.pid,
207
+ child,
208
+ host,
209
+ port,
210
+ baseUrl: `http://${host}:${port}/v1`,
211
+ args
212
+ };
213
+ }
214
+
144
215
  export function parseLlamacppValidationOutput(output = "") {
145
216
  const text = String(output || "").trim();
146
217
  const lowered = text.toLowerCase();
@@ -199,78 +270,142 @@ async function startConfiguredRuntime(config, {
199
270
  requireAutostart = true
200
271
  } = {}, {
201
272
  spawnSyncImpl = spawnSync,
202
- spawnImpl = spawn
273
+ spawnImpl = spawn,
274
+ system = undefined,
275
+ listListeningPids = undefined,
276
+ stopProcessByPid = undefined
203
277
  } = {}) {
204
- const runtime = readConfiguredLlamacppRuntime(config);
205
- if (requireAutostart && !runtime.startWithRouter) {
206
- return { ok: true, skipped: true, reason: "autostart-disabled" };
207
- }
208
-
209
- if (!runtime.command) {
210
- const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
211
- error(errorMessage);
212
- return { ok: false, errorMessage };
213
- }
214
-
215
- if (managedLlamacppRuntime
216
- && managedLlamacppRuntime.command === runtime.command
217
- && managedLlamacppRuntime.host === runtime.host
218
- && managedLlamacppRuntime.port === runtime.port
219
- && managedLlamacppRuntime.child?.exitCode === null
220
- && managedLlamacppRuntime.child?.killed !== true) {
221
- return { ok: true, alreadyRunning: true, runtime: managedLlamacppRuntime };
222
- }
223
-
224
- const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
225
- if (!validation.ok) {
226
- error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
227
- return validation;
228
- }
278
+ inFlightConfiguredStartCount += 1;
279
+ try {
280
+ const runtime = readConfiguredLlamacppRuntime(config);
281
+ if (requireAutostart && !runtime.startWithRouter) {
282
+ return { ok: true, skipped: true, reason: "autostart-disabled" };
283
+ }
229
284
 
230
- const preloadModels = buildPreloadModels(config);
231
- const args = buildLlamacppLaunchArgs({
232
- command: runtime.command,
233
- host: runtime.host,
234
- port: runtime.port,
235
- preloadModels
236
- });
285
+ if (!runtime.command) {
286
+ const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
287
+ error(errorMessage);
288
+ return { ok: false, errorMessage };
289
+ }
237
290
 
238
- return new Promise((resolve) => {
239
- let settled = false;
240
- const child = spawnImpl(args[0], args.slice(1), {
241
- stdio: "ignore"
291
+ const preloadModels = buildPreloadModels(config);
292
+ const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
293
+ const launchProfile = firstModel?.variant && firstModel?.baseModel
294
+ ? deriveLlamacppLaunchProfile({
295
+ variant: firstModel.variant,
296
+ baseModel: firstModel.baseModel,
297
+ system: detectLlamacppSystemProfile(system)
298
+ })
299
+ : null;
300
+ const args = buildLlamacppLaunchArgs({
301
+ command: runtime.command,
302
+ host: runtime.host,
303
+ port: runtime.port,
304
+ preloadModels,
305
+ launchProfile
306
+ });
307
+ const variantKey = normalizeString(firstModel?.variant?.key || firstModel?.variantId) || "default";
308
+ const profileHash = buildRuntimeProfileHash({
309
+ command: runtime.command,
310
+ host: runtime.host,
311
+ port: runtime.port,
312
+ args: args.slice(1)
313
+ });
314
+ const listListeningPidsFn = typeof listListeningPids === "function"
315
+ ? listListeningPids
316
+ : (port) => listListeningPidsForPort(port, { spawnSync: spawnSyncImpl });
317
+ const stopProcessByPidFn = typeof stopProcessByPid === "function"
318
+ ? stopProcessByPid
319
+ : (pid) => stopProcessByPidForRuntime(pid);
320
+ await managedLlamacppRuntimeRegistry.reconcile({
321
+ listListeningPids: async (port) => normalizeListeningPidResult(await listListeningPidsFn(port)),
322
+ stopProcessByPid: async (pid) => stopProcessByPidFn(pid)
242
323
  });
243
324
 
244
- const finish = (result) => {
245
- if (settled) return;
246
- settled = true;
247
- resolve(result);
248
- };
325
+ const existing = managedLlamacppRuntimeRegistry
326
+ .snapshot()
327
+ .find((instance) => (
328
+ instance.variantKey === variantKey
329
+ && instance.profileHash === profileHash
330
+ && isManagedRuntimeAlive(instance)
331
+ ));
332
+ if (existing) {
333
+ return { ok: true, alreadyRunning: true, runtime: existing };
334
+ }
249
335
 
250
- child.once("spawn", () => {
251
- managedLlamacppRuntime = {
252
- child,
253
- command: runtime.command,
254
- host: runtime.host,
255
- port: runtime.port,
256
- args
257
- };
258
- child.once("exit", () => {
259
- if (managedLlamacppRuntime?.child === child) {
260
- managedLlamacppRuntime = null;
261
- }
336
+ const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
337
+ if (!validation.ok) {
338
+ error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
339
+ return validation;
340
+ }
341
+
342
+ try {
343
+ const managedRuntime = await managedLlamacppRuntimeRegistry.ensureRuntimeForVariant({
344
+ variantKey,
345
+ profileHash,
346
+ launchArgs: args.slice(1),
347
+ preferredPort: runtime.port
348
+ }, {
349
+ spawnRuntime: async ({ port }) => new Promise((resolve, reject) => {
350
+ let settled = false;
351
+ const allocatedArgs = buildLlamacppLaunchArgs({
352
+ command: runtime.command,
353
+ host: runtime.host,
354
+ port,
355
+ preloadModels,
356
+ launchProfile
357
+ });
358
+ const child = spawnImpl(allocatedArgs[0], allocatedArgs.slice(1), {
359
+ stdio: "ignore"
360
+ });
361
+ const expectedInstanceId = `${variantKey}:${profileHash}:${port}`;
362
+ if (child && child.__llamacppManagedExitHookAttached !== true) {
363
+ child.__llamacppManagedExitHookAttached = true;
364
+ child.once("exit", () => {
365
+ void managedLlamacppRuntimeRegistry.untrackInstance(expectedInstanceId);
366
+ });
367
+ }
368
+
369
+ const settleResolve = (value) => {
370
+ if (settled) return;
371
+ settled = true;
372
+ resolve(value);
373
+ };
374
+ const settleReject = (reason) => {
375
+ if (settled) return;
376
+ settled = true;
377
+ reject(reason);
378
+ };
379
+
380
+ child.once("spawn", () => {
381
+ if (typeof child.unref === "function") child.unref();
382
+ settleResolve({
383
+ pid: child?.pid,
384
+ child,
385
+ command: runtime.command,
386
+ host: runtime.host,
387
+ port,
388
+ args: allocatedArgs,
389
+ baseUrl: `http://${runtime.host}:${port}/v1`
390
+ });
391
+ });
392
+ child.once("error", (spawnError) => {
393
+ settleReject(spawnError);
394
+ });
395
+ }),
396
+ waitForHealthy: async (instance) => instance
262
397
  });
263
- if (typeof child.unref === "function") child.unref();
264
- line(`Started llama.cpp runtime on http://${runtime.host}:${runtime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
265
- finish({ ok: true, runtime: managedLlamacppRuntime, validation });
266
- });
267
398
 
268
- child.once("error", (spawnError) => {
399
+ line(`Started llama.cpp runtime on http://${managedRuntime.host}:${managedRuntime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
400
+ return { ok: true, runtime: managedRuntime, validation };
401
+ } catch (spawnError) {
269
402
  const errorMessage = spawnError instanceof Error ? spawnError.message : String(spawnError);
270
403
  error(`Failed starting llama.cpp runtime: ${errorMessage}`);
271
- finish({ ok: false, errorMessage });
272
- });
273
- });
404
+ return { ok: false, errorMessage };
405
+ }
406
+ } finally {
407
+ inFlightConfiguredStartCount = Math.max(0, inFlightConfiguredStartCount - 1);
408
+ }
274
409
  }
275
410
 
276
411
  export async function ensureConfiguredLlamacppRuntimeStarted(config, callbacks = {}, deps = {}) {
@@ -287,23 +422,66 @@ export async function startConfiguredLlamacppRuntime(config, callbacks = {}, dep
287
422
  }, deps);
288
423
  }
289
424
 
425
+ export function getManagedLlamacppRuntimeSnapshot() {
426
+ return managedLlamacppRuntimeRegistry.snapshot().map((instance) => {
427
+ const { child: _child, ...rest } = instance || {};
428
+ return JSON.parse(JSON.stringify(rest));
429
+ });
430
+ }
431
+
290
432
  export async function stopManagedLlamacppRuntime({
291
433
  line = () => {},
292
434
  error = () => {}
293
435
  } = {}) {
294
- const active = managedLlamacppRuntime;
295
- if (!active?.child) {
436
+ while (inFlightConfiguredStartCount > 0) {
437
+ await delay(0);
438
+ }
439
+ if (typeof managedLlamacppRuntimeRegistry.waitForInFlightStarts === "function") {
440
+ await managedLlamacppRuntimeRegistry.waitForInFlightStarts();
441
+ }
442
+ const instances = managedLlamacppRuntimeRegistry.snapshot();
443
+ if (instances.length === 0) {
296
444
  return { ok: true, skipped: true, reason: "not-running" };
297
445
  }
298
446
 
299
- managedLlamacppRuntime = null;
300
- try {
301
- active.child.kill("SIGTERM");
302
- line("Stopped managed llama.cpp runtime.");
303
- return { ok: true };
304
- } catch (stopError) {
305
- const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
306
- error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
307
- return { ok: false, errorMessage };
447
+ const failures = [];
448
+ let stoppedCount = 0;
449
+ let pendingExitCount = 0;
450
+ for (const instance of instances) {
451
+ try {
452
+ if (instance?.owner === "llm-router" && typeof instance?.child?.kill === "function") {
453
+ const killResult = instance.child.kill("SIGTERM");
454
+ if (killResult !== false) {
455
+ stoppedCount += 1;
456
+ }
457
+ }
458
+ if (!isManagedRuntimeAlive(instance)) {
459
+ await managedLlamacppRuntimeRegistry.untrackInstance(instance?.instanceId);
460
+ } else {
461
+ pendingExitCount += 1;
462
+ }
463
+ } catch (stopError) {
464
+ const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
465
+ failures.push(errorMessage);
466
+ error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
467
+ }
468
+ }
469
+
470
+ if (stoppedCount > 0) {
471
+ if (pendingExitCount === 0) {
472
+ line(stoppedCount === 1 ? "Stopped managed llama.cpp runtime." : `Stopped ${stoppedCount} managed llama.cpp runtimes.`);
473
+ } else {
474
+ line(stoppedCount === 1
475
+ ? "Stop signal sent to managed llama.cpp runtime; waiting for exit."
476
+ : `Stop signal sent to ${stoppedCount} managed llama.cpp runtimes; waiting for exits.`);
477
+ }
308
478
  }
479
+
480
+ const completed = failures.length === 0 && pendingExitCount === 0;
481
+ return {
482
+ ok: completed,
483
+ stoppedCount,
484
+ pendingExitCount,
485
+ ...(failures.length > 0 ? { errorMessage: failures.join("; ") } : {})
486
+ };
309
487
  }
@@ -212,19 +212,42 @@ export async function saveLocalModelVariant(config, draft, {
212
212
  activeVariants,
213
213
  totalMemoryBytes: system.totalMemoryBytes
214
214
  });
215
- if (!decision.allowed) {
215
+ if (!decision.allowed) {
216
216
  throw new Error(decision.reason);
217
217
  }
218
218
  }
219
219
 
220
+ const normalizedMetadata = normalizeLocalModelsMetadata({
221
+ variants: {
222
+ draft: normalizedDraft
223
+ }
224
+ });
225
+ const normalizedVariantDraft = Object.values(normalizedMetadata.variants)[0] || {};
226
+ const previousVariant = isPlainObject(next.metadata.localModels.variants[key])
227
+ ? next.metadata.localModels.variants[key]
228
+ : {};
229
+
220
230
  next.metadata.localModels.variants[key] = {
221
- ...(isPlainObject(next.metadata.localModels.variants[key]) ? next.metadata.localModels.variants[key] : {}),
231
+ ...previousVariant,
222
232
  key,
223
233
  baseModelId,
224
234
  id: modelId,
225
235
  name,
226
236
  runtime,
227
237
  preset: normalizeString(normalizedDraft.preset),
238
+ runtimeProfile: runtime === "llamacpp"
239
+ ? normalizedVariantDraft.runtimeProfile
240
+ : undefined,
241
+ runtimeStatus: runtime === "llamacpp"
242
+ ? (isPlainObject(previousVariant.runtimeStatus)
243
+ ? previousVariant.runtimeStatus
244
+ : {
245
+ activeInstanceId: "",
246
+ lastFailure: null,
247
+ lastStartedAt: "",
248
+ lastHealthyAt: ""
249
+ })
250
+ : undefined,
228
251
  enabled: normalizedDraft.enabled === true,
229
252
  preload: normalizedDraft.preload === true,
230
253
  contextWindow: Number.isFinite(Number(normalizedDraft.contextWindow)) ? Number(normalizedDraft.contextWindow) : undefined,
@@ -13,6 +13,10 @@ import { readActivityLogSettings } from "../shared/local-router-defaults.js";
13
13
  import { appendActivityLogEntry, resolveActivityLogPath } from "./activity-log.js";
14
14
  import { appendLargeRequestLogEntry, resolveLargeRequestLogPath } from "./large-request-log.js";
15
15
  import { isLargeRequestLoggingEnabled } from "../runtime/handler/large-request-log.js";
16
+ import {
17
+ startConfiguredLlamacppRuntime,
18
+ stopManagedLlamacppRuntime
19
+ } from "./llamacpp-runtime.js";
16
20
 
17
21
  const DEFAULT_CONFIG_RELOAD_DEBOUNCE_MS = 300;
18
22
  const MAX_CONFIG_RELOAD_DEBOUNCE_MS = 5000;
@@ -34,6 +38,10 @@ function formatError(error) {
34
38
  return error instanceof Error ? error.message : String(error);
35
39
  }
36
40
 
41
+ function normalizeString(value) {
42
+ return typeof value === "string" ? value.trim() : "";
43
+ }
44
+
37
45
  function createLiveConfigStore({
38
46
  configPath,
39
47
  watchConfig = true,
@@ -237,6 +245,39 @@ async function writeFetchResponseToNode(res, response) {
237
245
  readable.pipe(res);
238
246
  }
239
247
 
248
+ function buildVariantLlamacppRuntimeConfig(config, variantKey) {
249
+ const normalizedVariantKey = normalizeString(variantKey);
250
+ const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
251
+ const variants = config?.metadata?.localModels?.variants;
252
+ const library = config?.metadata?.localModels?.library;
253
+ const variant = variants?.[normalizedVariantKey];
254
+ if (!runtime || !variant || variant.runtime !== "llamacpp") return null;
255
+
256
+ const baseModelId = normalizeString(variant?.baseModelId);
257
+ const baseModel = library?.[baseModelId];
258
+ if (!baseModel) return null;
259
+
260
+ return {
261
+ metadata: {
262
+ localModels: {
263
+ runtime: {
264
+ llamacpp: { ...runtime }
265
+ },
266
+ library: {
267
+ [baseModelId]: { ...baseModel }
268
+ },
269
+ variants: {
270
+ [normalizedVariantKey]: {
271
+ ...variant,
272
+ enabled: true,
273
+ preload: true
274
+ }
275
+ }
276
+ }
277
+ }
278
+ };
279
+ }
280
+
240
281
  export async function startLocalRouteServer({
241
282
  port = FIXED_LOCAL_ROUTER_PORT,
242
283
  host = FIXED_LOCAL_ROUTER_HOST,
@@ -248,7 +289,10 @@ export async function startLocalRouteServer({
248
289
  validateConfig,
249
290
  onConfigReload,
250
291
  onConfigReloadError,
251
- requireAuth = false
292
+ requireAuth = false,
293
+ createFetchHandlerImpl = createFetchHandler,
294
+ startConfiguredLlamacppRuntimeImpl = startConfiguredLlamacppRuntime,
295
+ stopManagedLlamacppRuntimeImpl = stopManagedLlamacppRuntime
252
296
  } = {}) {
253
297
  const reloadDebounceMs = resolveReloadDebounceMs(configReloadDebounceMs);
254
298
  const resolvedActivityLogPath = resolveActivityLogPath(configPath, activityLogPath);
@@ -270,9 +314,22 @@ export async function startLocalRouteServer({
270
314
  const initialConfig = await configStore.getConfig();
271
315
  activityLogEnabled = readActivityLogSettings(initialConfig).enabled;
272
316
 
273
- const fetchHandler = createFetchHandler({
317
+ const fetchHandler = createFetchHandlerImpl({
274
318
  ignoreAuth: !requireAuth,
319
+ runtime: "node",
275
320
  getConfig: () => configStore.getConfig(),
321
+ resolveLocalRuntimeBaseUrl: async ({ candidate }) => {
322
+ const variantKey = candidate?.model?.metadata?.localVariantKey;
323
+ const config = await configStore.getConfig();
324
+ const targetedConfig = buildVariantLlamacppRuntimeConfig(config, variantKey);
325
+ if (!targetedConfig) return "";
326
+
327
+ const started = await startConfiguredLlamacppRuntimeImpl(targetedConfig);
328
+ if (!started?.ok) {
329
+ throw new Error(started?.errorMessage || `Failed starting local runtime for ${normalizeString(variantKey) || "unknown variant"}.`);
330
+ }
331
+ return normalizeString(started?.runtime?.baseUrl);
332
+ },
276
333
  defaultStateStoreBackend: "file",
277
334
  onActivityLog: (entry) => {
278
335
  if (!activityLogEnabled) return;
@@ -355,6 +412,7 @@ export async function startLocalRouteServer({
355
412
  server.close = (callback) => {
356
413
  shuttingDown = true;
357
414
  Promise.resolve()
415
+ .then(() => stopManagedLlamacppRuntimeImpl().catch(() => {}))
358
416
  .then(() => configStore.close())
359
417
  .then(() => (typeof fetchHandler.close === "function" ? fetchHandler.close() : undefined))
360
418
  .finally(() => {