@khanglvm/llm-router 2.6.0 → 2.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,11 +2,6 @@ import path from "node:path";
  import os from "node:os";
  import { existsSync } from "node:fs";
  import { spawn, spawnSync } from "node:child_process";
- import { setTimeout as delay } from "node:timers/promises";
- import { deriveLlamacppLaunchProfile } from "./llamacpp-runtime-profile.js";
- import { createLlamacppManagedRuntimeRegistry } from "./llamacpp-managed-runtime.js";
- import { listListeningPids as listListeningPidsForPort } from "./port-reclaim.js";
- import { stopProcessByPid as stopProcessByPidForRuntime } from "./instance-state.js";

  export const LLAMACPP_DEFAULT_HOST = "127.0.0.1";
  export const LLAMACPP_DEFAULT_PORT = 39391;
@@ -21,8 +16,8 @@ const COMMON_SOURCE_BUILD_PATHS = Object.freeze([
  "src/llama-cpp-turboquant/build/bin/llama-server",
  "src/llama.cpp-turboquant/build/bin/llama-server"
  ]);
- const managedLlamacppRuntimeRegistry = createLlamacppManagedRuntimeRegistry();
- let inFlightConfiguredStartCount = 0;
+
+ let managedLlamacppRuntime = null;

  function isPlainObject(value) {
  return Boolean(value) && typeof value === "object" && !Array.isArray(value);
@@ -44,34 +39,6 @@ function normalizePathEntries(entries) {
  : [];
  }

- function buildRuntimeProfileHash({ command, host, port, args = [] } = {}) {
- const normalizedArgs = Array.isArray(args) ? args.filter(Boolean) : [];
- return `${normalizeString(command)}|${normalizeString(host)}|${String(normalizePort(port, LLAMACPP_DEFAULT_PORT))}|${normalizedArgs.join("\u001f")}`;
- }
-
- function isManagedRuntimeAlive(instance) {
- const child = instance?.child;
- if (!child) return false;
- return child.exitCode === null && child.killed !== true;
- }
-
- function normalizeListeningPidResult(result) {
- if (result && typeof result === "object" && result.ok === false) {
- return { ok: false, pids: [] };
- }
- if (Array.isArray(result)) {
- return result
- .map((value) => Number(value))
- .filter((pid) => Number.isInteger(pid) && pid > 0);
- }
- if (result && typeof result === "object" && Array.isArray(result.pids)) {
- return result.pids
- .map((value) => Number(value))
- .filter((pid) => Number.isInteger(pid) && pid > 0);
- }
- return [];
- }
-
  function readConfiguredLlamacppRuntime(config) {
  const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
  if (!isPlainObject(runtime)) {
@@ -105,8 +72,6 @@ function buildPreloadModels(config) {
  if (!modelPath) continue;
  preloadModels.push({
  variantId: normalizeString(variant.id),
- variant,
- baseModel,
  modelPath,
  contextWindow: Number.isFinite(Number(variant.contextWindow)) ? Number(variant.contextWindow) : undefined
  });
@@ -114,15 +79,6 @@ function buildPreloadModels(config) {
  return preloadModels;
  }

- function detectLlamacppSystemProfile(system = {}) {
- const totalMemoryBytes = Number(system?.totalMemoryBytes);
- return {
- platform: normalizeString(system?.platform) || process.platform,
- unifiedMemory: system?.unifiedMemory === true || process.platform === "darwin",
- totalMemoryBytes: Number.isFinite(totalMemoryBytes) && totalMemoryBytes > 0 ? totalMemoryBytes : os.totalmem()
- };
- }
-
  export function detectLlamacppCandidates({
  envPathEntries = process.env.PATH?.split(path.delimiter) || [],
  homeDir = os.homedir(),
@@ -166,18 +122,16 @@ export function buildLlamacppLaunchArgs({
  command,
  host = LLAMACPP_DEFAULT_HOST,
  port = LLAMACPP_DEFAULT_PORT,
- preloadModels = [],
- launchProfile = null
+ preloadModels = []
  } = {}) {
  const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
  const args = [
  normalizeString(command),
  "--host", normalizeString(host) || LLAMACPP_DEFAULT_HOST,
- "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT)),
- ...((Array.isArray(launchProfile?.args) ? launchProfile.args : []).filter(Boolean))
+ "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT))
  ];

- if (!launchProfile && firstModel?.modelPath) {
+ if (firstModel?.modelPath) {
  args.push("-m", firstModel.modelPath);
  if (Number.isFinite(Number(firstModel.contextWindow)) && Number(firstModel.contextWindow) > 0) {
  args.push("-c", String(Math.floor(Number(firstModel.contextWindow))));
@@ -187,31 +141,6 @@ export function buildLlamacppLaunchArgs({
  return args.filter(Boolean);
  }

- export async function spawnManagedLlamacppRuntime({
- command,
- host = LLAMACPP_DEFAULT_HOST,
- port = LLAMACPP_DEFAULT_PORT,
- launchProfile
- } = {}, {
- spawnImpl = spawn
- } = {}) {
- const args = buildLlamacppLaunchArgs({
- command,
- host,
- port,
- launchProfile
- });
- const child = spawnImpl(args[0], args.slice(1), { stdio: "ignore" });
- return {
- pid: child?.pid,
- child,
- host,
- port,
- baseUrl: `http://${host}:${port}/v1`,
- args
- };
- }
-
  export function parseLlamacppValidationOutput(output = "") {
  const text = String(output || "").trim();
  const lowered = text.toLowerCase();
@@ -270,142 +199,78 @@ async function startConfiguredRuntime(config, {
  requireAutostart = true
  } = {}, {
  spawnSyncImpl = spawnSync,
- spawnImpl = spawn,
- system = undefined,
- listListeningPids = undefined,
- stopProcessByPid = undefined
+ spawnImpl = spawn
  } = {}) {
- inFlightConfiguredStartCount += 1;
- try {
- const runtime = readConfiguredLlamacppRuntime(config);
- if (requireAutostart && !runtime.startWithRouter) {
- return { ok: true, skipped: true, reason: "autostart-disabled" };
- }
+ const runtime = readConfiguredLlamacppRuntime(config);
+ if (requireAutostart && !runtime.startWithRouter) {
+ return { ok: true, skipped: true, reason: "autostart-disabled" };
+ }

- if (!runtime.command) {
- const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
- error(errorMessage);
- return { ok: false, errorMessage };
- }
+ if (!runtime.command) {
+ const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
+ error(errorMessage);
+ return { ok: false, errorMessage };
+ }

- const preloadModels = buildPreloadModels(config);
- const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
- const launchProfile = firstModel?.variant && firstModel?.baseModel
- ? deriveLlamacppLaunchProfile({
- variant: firstModel.variant,
- baseModel: firstModel.baseModel,
- system: detectLlamacppSystemProfile(system)
- })
- : null;
- const args = buildLlamacppLaunchArgs({
- command: runtime.command,
- host: runtime.host,
- port: runtime.port,
- preloadModels,
- launchProfile
- });
- const variantKey = normalizeString(firstModel?.variant?.key || firstModel?.variantId) || "default";
- const profileHash = buildRuntimeProfileHash({
- command: runtime.command,
- host: runtime.host,
- port: runtime.port,
- args: args.slice(1)
- });
- const listListeningPidsFn = typeof listListeningPids === "function"
- ? listListeningPids
- : (port) => listListeningPidsForPort(port, { spawnSync: spawnSyncImpl });
- const stopProcessByPidFn = typeof stopProcessByPid === "function"
- ? stopProcessByPid
- : (pid) => stopProcessByPidForRuntime(pid);
- await managedLlamacppRuntimeRegistry.reconcile({
- listListeningPids: async (port) => normalizeListeningPidResult(await listListeningPidsFn(port)),
- stopProcessByPid: async (pid) => stopProcessByPidFn(pid)
- });
+ if (managedLlamacppRuntime
+ && managedLlamacppRuntime.command === runtime.command
+ && managedLlamacppRuntime.host === runtime.host
+ && managedLlamacppRuntime.port === runtime.port
+ && managedLlamacppRuntime.child?.exitCode === null
+ && managedLlamacppRuntime.child?.killed !== true) {
+ return { ok: true, alreadyRunning: true, runtime: managedLlamacppRuntime };
+ }

- const existing = managedLlamacppRuntimeRegistry
- .snapshot()
- .find((instance) => (
- instance.variantKey === variantKey
- && instance.profileHash === profileHash
- && isManagedRuntimeAlive(instance)
- ));
- if (existing) {
- return { ok: true, alreadyRunning: true, runtime: existing };
- }
+ const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
+ if (!validation.ok) {
+ error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
+ return validation;
+ }

- const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
- if (!validation.ok) {
- error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
- return validation;
- }
+ const preloadModels = buildPreloadModels(config);
+ const args = buildLlamacppLaunchArgs({
+ command: runtime.command,
+ host: runtime.host,
+ port: runtime.port,
+ preloadModels
+ });

- try {
- const managedRuntime = await managedLlamacppRuntimeRegistry.ensureRuntimeForVariant({
- variantKey,
- profileHash,
- launchArgs: args.slice(1),
- preferredPort: runtime.port
- }, {
- spawnRuntime: async ({ port }) => new Promise((resolve, reject) => {
- let settled = false;
- const allocatedArgs = buildLlamacppLaunchArgs({
- command: runtime.command,
- host: runtime.host,
- port,
- preloadModels,
- launchProfile
- });
- const child = spawnImpl(allocatedArgs[0], allocatedArgs.slice(1), {
- stdio: "ignore"
- });
- const expectedInstanceId = `${variantKey}:${profileHash}:${port}`;
- if (child && child.__llamacppManagedExitHookAttached !== true) {
- child.__llamacppManagedExitHookAttached = true;
- child.once("exit", () => {
- void managedLlamacppRuntimeRegistry.untrackInstance(expectedInstanceId);
- });
- }
-
- const settleResolve = (value) => {
- if (settled) return;
- settled = true;
- resolve(value);
- };
- const settleReject = (reason) => {
- if (settled) return;
- settled = true;
- reject(reason);
- };
-
- child.once("spawn", () => {
- if (typeof child.unref === "function") child.unref();
- settleResolve({
- pid: child?.pid,
- child,
- command: runtime.command,
- host: runtime.host,
- port,
- args: allocatedArgs,
- baseUrl: `http://${runtime.host}:${port}/v1`
- });
- });
- child.once("error", (spawnError) => {
- settleReject(spawnError);
- });
- }),
- waitForHealthy: async (instance) => instance
+ return new Promise((resolve) => {
+ let settled = false;
+ const child = spawnImpl(args[0], args.slice(1), {
+ stdio: "ignore"
+ });
+
+ const finish = (result) => {
+ if (settled) return;
+ settled = true;
+ resolve(result);
+ };
+
+ child.once("spawn", () => {
+ managedLlamacppRuntime = {
+ child,
+ command: runtime.command,
+ host: runtime.host,
+ port: runtime.port,
+ args
+ };
+ child.once("exit", () => {
+ if (managedLlamacppRuntime?.child === child) {
+ managedLlamacppRuntime = null;
+ }
  });
+ if (typeof child.unref === "function") child.unref();
+ line(`Started llama.cpp runtime on http://${runtime.host}:${runtime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
+ finish({ ok: true, runtime: managedLlamacppRuntime, validation });
+ });

- line(`Started llama.cpp runtime on http://${managedRuntime.host}:${managedRuntime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
- return { ok: true, runtime: managedRuntime, validation };
- } catch (spawnError) {
+ child.once("error", (spawnError) => {
  const errorMessage = spawnError instanceof Error ? spawnError.message : String(spawnError);
  error(`Failed starting llama.cpp runtime: ${errorMessage}`);
- return { ok: false, errorMessage };
- }
- } finally {
- inFlightConfiguredStartCount = Math.max(0, inFlightConfiguredStartCount - 1);
- }
+ finish({ ok: false, errorMessage });
+ });
+ });
  }

  export async function ensureConfiguredLlamacppRuntimeStarted(config, callbacks = {}, deps = {}) {
@@ -422,66 +287,23 @@ export async function startConfiguredLlamacppRuntime(config, callbacks = {}, dep
  }, deps);
  }

- export function getManagedLlamacppRuntimeSnapshot() {
- return managedLlamacppRuntimeRegistry.snapshot().map((instance) => {
- const { child: _child, ...rest } = instance || {};
- return JSON.parse(JSON.stringify(rest));
- });
- }
-
  export async function stopManagedLlamacppRuntime({
  line = () => {},
  error = () => {}
  } = {}) {
- while (inFlightConfiguredStartCount > 0) {
- await delay(0);
- }
- if (typeof managedLlamacppRuntimeRegistry.waitForInFlightStarts === "function") {
- await managedLlamacppRuntimeRegistry.waitForInFlightStarts();
- }
- const instances = managedLlamacppRuntimeRegistry.snapshot();
- if (instances.length === 0) {
+ const active = managedLlamacppRuntime;
+ if (!active?.child) {
  return { ok: true, skipped: true, reason: "not-running" };
  }

- const failures = [];
- let stoppedCount = 0;
- let pendingExitCount = 0;
- for (const instance of instances) {
- try {
- if (instance?.owner === "llm-router" && typeof instance?.child?.kill === "function") {
- const killResult = instance.child.kill("SIGTERM");
- if (killResult !== false) {
- stoppedCount += 1;
- }
- }
- if (!isManagedRuntimeAlive(instance)) {
- await managedLlamacppRuntimeRegistry.untrackInstance(instance?.instanceId);
- } else {
- pendingExitCount += 1;
- }
- } catch (stopError) {
- const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
- failures.push(errorMessage);
- error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
- }
- }
-
- if (stoppedCount > 0) {
- if (pendingExitCount === 0) {
- line(stoppedCount === 1 ? "Stopped managed llama.cpp runtime." : `Stopped ${stoppedCount} managed llama.cpp runtimes.`);
- } else {
- line(stoppedCount === 1
- ? "Stop signal sent to managed llama.cpp runtime; waiting for exit."
- : `Stop signal sent to ${stoppedCount} managed llama.cpp runtimes; waiting for exits.`);
- }
+ managedLlamacppRuntime = null;
+ try {
+ active.child.kill("SIGTERM");
+ line("Stopped managed llama.cpp runtime.");
+ return { ok: true };
+ } catch (stopError) {
+ const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
+ error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
+ return { ok: false, errorMessage };
  }
-
- const completed = failures.length === 0 && pendingExitCount === 0;
- return {
- ok: completed,
- stoppedCount,
- pendingExitCount,
- ...(failures.length > 0 ? { errorMessage: failures.join("; ") } : {})
- };
  }
@@ -212,42 +212,19 @@ export async function saveLocalModelVariant(config, draft, {
  activeVariants,
  totalMemoryBytes: system.totalMemoryBytes
  });
- if (!decision.allowed) {
+ if (!decision.allowed) {
  throw new Error(decision.reason);
  }
  }

- const normalizedMetadata = normalizeLocalModelsMetadata({
- variants: {
- draft: normalizedDraft
- }
- });
- const normalizedVariantDraft = Object.values(normalizedMetadata.variants)[0] || {};
- const previousVariant = isPlainObject(next.metadata.localModels.variants[key])
- ? next.metadata.localModels.variants[key]
- : {};
-
  next.metadata.localModels.variants[key] = {
- ...previousVariant,
+ ...(isPlainObject(next.metadata.localModels.variants[key]) ? next.metadata.localModels.variants[key] : {}),
  key,
  baseModelId,
  id: modelId,
  name,
  runtime,
  preset: normalizeString(normalizedDraft.preset),
- runtimeProfile: runtime === "llamacpp"
- ? normalizedVariantDraft.runtimeProfile
- : undefined,
- runtimeStatus: runtime === "llamacpp"
- ? (isPlainObject(previousVariant.runtimeStatus)
- ? previousVariant.runtimeStatus
- : {
- activeInstanceId: "",
- lastFailure: null,
- lastStartedAt: "",
- lastHealthyAt: ""
- })
- : undefined,
  enabled: normalizedDraft.enabled === true,
  preload: normalizedDraft.preload === true,
  contextWindow: Number.isFinite(Number(normalizedDraft.contextWindow)) ? Number(normalizedDraft.contextWindow) : undefined,
@@ -13,10 +13,6 @@ import { readActivityLogSettings } from "../shared/local-router-defaults.js";
  import { appendActivityLogEntry, resolveActivityLogPath } from "./activity-log.js";
  import { appendLargeRequestLogEntry, resolveLargeRequestLogPath } from "./large-request-log.js";
  import { isLargeRequestLoggingEnabled } from "../runtime/handler/large-request-log.js";
- import {
- startConfiguredLlamacppRuntime,
- stopManagedLlamacppRuntime
- } from "./llamacpp-runtime.js";

  const DEFAULT_CONFIG_RELOAD_DEBOUNCE_MS = 300;
  const MAX_CONFIG_RELOAD_DEBOUNCE_MS = 5000;
@@ -38,10 +34,6 @@ function formatError(error) {
  return error instanceof Error ? error.message : String(error);
  }

- function normalizeString(value) {
- return typeof value === "string" ? value.trim() : "";
- }
-
  function createLiveConfigStore({
  configPath,
  watchConfig = true,
@@ -245,39 +237,6 @@ async function writeFetchResponseToNode(res, response) {
  readable.pipe(res);
  }

- function buildVariantLlamacppRuntimeConfig(config, variantKey) {
- const normalizedVariantKey = normalizeString(variantKey);
- const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
- const variants = config?.metadata?.localModels?.variants;
- const library = config?.metadata?.localModels?.library;
- const variant = variants?.[normalizedVariantKey];
- if (!runtime || !variant || variant.runtime !== "llamacpp") return null;
-
- const baseModelId = normalizeString(variant?.baseModelId);
- const baseModel = library?.[baseModelId];
- if (!baseModel) return null;
-
- return {
- metadata: {
- localModels: {
- runtime: {
- llamacpp: { ...runtime }
- },
- library: {
- [baseModelId]: { ...baseModel }
- },
- variants: {
- [normalizedVariantKey]: {
- ...variant,
- enabled: true,
- preload: true
- }
- }
- }
- }
- };
- }
-
  export async function startLocalRouteServer({
  port = FIXED_LOCAL_ROUTER_PORT,
  host = FIXED_LOCAL_ROUTER_HOST,
@@ -289,10 +248,7 @@ export async function startLocalRouteServer({
  validateConfig,
  onConfigReload,
  onConfigReloadError,
- requireAuth = false,
- createFetchHandlerImpl = createFetchHandler,
- startConfiguredLlamacppRuntimeImpl = startConfiguredLlamacppRuntime,
- stopManagedLlamacppRuntimeImpl = stopManagedLlamacppRuntime
+ requireAuth = false
  } = {}) {
  const reloadDebounceMs = resolveReloadDebounceMs(configReloadDebounceMs);
  const resolvedActivityLogPath = resolveActivityLogPath(configPath, activityLogPath);
@@ -314,22 +270,9 @@ export async function startLocalRouteServer({
  const initialConfig = await configStore.getConfig();
  activityLogEnabled = readActivityLogSettings(initialConfig).enabled;

- const fetchHandler = createFetchHandlerImpl({
+ const fetchHandler = createFetchHandler({
  ignoreAuth: !requireAuth,
- runtime: "node",
  getConfig: () => configStore.getConfig(),
- resolveLocalRuntimeBaseUrl: async ({ candidate }) => {
- const variantKey = candidate?.model?.metadata?.localVariantKey;
- const config = await configStore.getConfig();
- const targetedConfig = buildVariantLlamacppRuntimeConfig(config, variantKey);
- if (!targetedConfig) return "";
-
- const started = await startConfiguredLlamacppRuntimeImpl(targetedConfig);
- if (!started?.ok) {
- throw new Error(started?.errorMessage || `Failed starting local runtime for ${normalizeString(variantKey) || "unknown variant"}.`);
- }
- return normalizeString(started?.runtime?.baseUrl);
- },
  defaultStateStoreBackend: "file",
  onActivityLog: (entry) => {
  if (!activityLogEnabled) return;
@@ -412,7 +355,6 @@ export async function startLocalRouteServer({
  server.close = (callback) => {
  shuttingDown = true;
  Promise.resolve()
- .then(() => stopManagedLlamacppRuntimeImpl().catch(() => {}))
  .then(() => configStore.close())
  .then(() => (typeof fetchHandler.close === "function" ? fetchHandler.close() : undefined))
  .finally(() => {
@@ -310,6 +310,15 @@ function isTransientModelRuntimeError(result, message) {
  return patterns.some((pattern) => pattern.test(text));
  }

+ function isOutputLimitReachedMessage(message) {
+ const text = String(message || "").toLowerCase();
+ if (!text) return false;
+ return (
+ text.includes("max_tokens") &&
+ (text.includes("output limit") || text.includes("token limit") || text.includes("finish"))
+ );
+ }
+
  function isRateLimitResult(result, message) {
  const status = Number(result?.status || 0);
  if (status === 429) return true;
@@ -377,6 +386,15 @@ function classifyModelProbeResult(format, result) {
  };
  }

+ if (isOutputLimitReachedMessage(message)) {
+ return {
+ supported: true,
+ confirmed: true,
+ outcome: "output-limit",
+ message: message || "Request reached model but the probe token budget was too small."
+ };
+ }
+
  if (isUnsupportedModelMessage(message)) {
  return {
  supported: false,