npm - @khanglvm/llm-router - Versions diffs - 2.5.1 → 2.6.0 - Mend

@khanglvm/llm-router 2.5.1 → 2.6.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17)

package/CHANGELOG.md +18 -0
package/README.md +5 -2
package/package.json +1 -1
package/src/node/dev-command.js +114 -0
package/src/node/huggingface-gguf.js +12 -0
package/src/node/llamacpp-managed-runtime.js +202 -0
package/src/node/llamacpp-runtime-profile.js +133 -0
package/src/node/llamacpp-runtime.js +256 -78
package/src/node/local-models-service.js +25 -2
package/src/node/local-server.js +60 -2
package/src/node/web-console-client.js +20 -20
package/src/node/web-console-server.js +64 -8
package/src/node/web-console-styles.generated.js +1 -1
package/src/node/web-console-ui/local-models-utils.js +33 -0
package/src/runtime/handler/provider-call.js +36 -18
package/src/runtime/handler/runtime-policy.js +4 -1
package/src/runtime/local-models.js +36 -0

package/src/node/llamacpp-runtime.js CHANGED Viewed

@@ -2,6 +2,11 @@ import path from "node:path";
 import os from "node:os";
 import { existsSync } from "node:fs";
 import { spawn, spawnSync } from "node:child_process";
+import { setTimeout as delay } from "node:timers/promises";
+import { deriveLlamacppLaunchProfile } from "./llamacpp-runtime-profile.js";
+import { createLlamacppManagedRuntimeRegistry } from "./llamacpp-managed-runtime.js";
+import { listListeningPids as listListeningPidsForPort } from "./port-reclaim.js";
+import { stopProcessByPid as stopProcessByPidForRuntime } from "./instance-state.js";
 export const LLAMACPP_DEFAULT_HOST = "127.0.0.1";
 export const LLAMACPP_DEFAULT_PORT = 39391;
@@ -16,8 +21,8 @@ const COMMON_SOURCE_BUILD_PATHS = Object.freeze([
   "src/llama-cpp-turboquant/build/bin/llama-server",
   "src/llama.cpp-turboquant/build/bin/llama-server"
 ]);
-let managedLlamacppRuntime = null;
+const managedLlamacppRuntimeRegistry = createLlamacppManagedRuntimeRegistry();
+let inFlightConfiguredStartCount = 0;
 function isPlainObject(value) {
   return Boolean(value) && typeof value === "object" && !Array.isArray(value);
@@ -39,6 +44,34 @@ function normalizePathEntries(entries) {
     : [];
 }
+function buildRuntimeProfileHash({ command, host, port, args = [] } = {}) {
+  const normalizedArgs = Array.isArray(args) ? args.filter(Boolean) : [];
+  return `${normalizeString(command)}|${normalizeString(host)}|${String(normalizePort(port, LLAMACPP_DEFAULT_PORT))}|${normalizedArgs.join("\u001f")}`;
+}
+function isManagedRuntimeAlive(instance) {
+  const child = instance?.child;
+  if (!child) return false;
+  return child.exitCode === null && child.killed !== true;
+}
+function normalizeListeningPidResult(result) {
+  if (result && typeof result === "object" && result.ok === false) {
+    return { ok: false, pids: [] };
+  }
+  if (Array.isArray(result)) {
+    return result
+      .map((value) => Number(value))
+      .filter((pid) => Number.isInteger(pid) && pid > 0);
+  }
+  if (result && typeof result === "object" && Array.isArray(result.pids)) {
+    return result.pids
+      .map((value) => Number(value))
+      .filter((pid) => Number.isInteger(pid) && pid > 0);
+  }
+  return [];
+}
 function readConfiguredLlamacppRuntime(config) {
   const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
   if (!isPlainObject(runtime)) {
@@ -72,6 +105,8 @@ function buildPreloadModels(config) {
     if (!modelPath) continue;
     preloadModels.push({
       variantId: normalizeString(variant.id),
+      variant,
+      baseModel,
       modelPath,
       contextWindow: Number.isFinite(Number(variant.contextWindow)) ? Number(variant.contextWindow) : undefined
     });
@@ -79,6 +114,15 @@ function buildPreloadModels(config) {
   return preloadModels;
 }
+function detectLlamacppSystemProfile(system = {}) {
+  const totalMemoryBytes = Number(system?.totalMemoryBytes);
+  return {
+    platform: normalizeString(system?.platform) || process.platform,
+    unifiedMemory: system?.unifiedMemory === true || process.platform === "darwin",
+    totalMemoryBytes: Number.isFinite(totalMemoryBytes) && totalMemoryBytes > 0 ? totalMemoryBytes : os.totalmem()
+  };
+}
 export function detectLlamacppCandidates({
   envPathEntries = process.env.PATH?.split(path.delimiter) || [],
   homeDir = os.homedir(),
@@ -122,16 +166,18 @@ export function buildLlamacppLaunchArgs({
   command,
   host = LLAMACPP_DEFAULT_HOST,
   port = LLAMACPP_DEFAULT_PORT,
-  preloadModels = []
+  preloadModels = [],
+  launchProfile = null
 } = {}) {
   const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
   const args = [
     normalizeString(command),
     "--host", normalizeString(host) || LLAMACPP_DEFAULT_HOST,
-    "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT))
+    "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT)),
+    ...((Array.isArray(launchProfile?.args) ? launchProfile.args : []).filter(Boolean))
   ];
-  if (firstModel?.modelPath) {
+  if (!launchProfile && firstModel?.modelPath) {
     args.push("-m", firstModel.modelPath);
     if (Number.isFinite(Number(firstModel.contextWindow)) && Number(firstModel.contextWindow) > 0) {
       args.push("-c", String(Math.floor(Number(firstModel.contextWindow))));
@@ -141,6 +187,31 @@ export function buildLlamacppLaunchArgs({
   return args.filter(Boolean);
 }
+export async function spawnManagedLlamacppRuntime({
+  command,
+  host = LLAMACPP_DEFAULT_HOST,
+  port = LLAMACPP_DEFAULT_PORT,
+  launchProfile
+} = {}, {
+  spawnImpl = spawn
+} = {}) {
+  const args = buildLlamacppLaunchArgs({
+    command,
+    host,
+    port,
+    launchProfile
+  });
+  const child = spawnImpl(args[0], args.slice(1), { stdio: "ignore" });
+  return {
+    pid: child?.pid,
+    child,
+    host,
+    port,
+    baseUrl: `http://${host}:${port}/v1`,
+    args
+  };
+}
 export function parseLlamacppValidationOutput(output = "") {
   const text = String(output || "").trim();
   const lowered = text.toLowerCase();
@@ -199,78 +270,142 @@ async function startConfiguredRuntime(config, {
   requireAutostart = true
 } = {}, {
   spawnSyncImpl = spawnSync,
-  spawnImpl = spawn
+  spawnImpl = spawn,
+  system = undefined,
+  listListeningPids = undefined,
+  stopProcessByPid = undefined
 } = {}) {
-  const runtime = readConfiguredLlamacppRuntime(config);
-  if (requireAutostart && !runtime.startWithRouter) {
-    return { ok: true, skipped: true, reason: "autostart-disabled" };
-  }
-  if (!runtime.command) {
-    const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
-    error(errorMessage);
-    return { ok: false, errorMessage };
-  }
-  if (managedLlamacppRuntime
-    && managedLlamacppRuntime.command === runtime.command
-    && managedLlamacppRuntime.host === runtime.host
-    && managedLlamacppRuntime.port === runtime.port
-    && managedLlamacppRuntime.child?.exitCode === null
-    && managedLlamacppRuntime.child?.killed !== true) {
-    return { ok: true, alreadyRunning: true, runtime: managedLlamacppRuntime };
-  }
-  const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
-  if (!validation.ok) {
-    error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
-    return validation;
-  }
+  inFlightConfiguredStartCount += 1;
+  try {
+    const runtime = readConfiguredLlamacppRuntime(config);
+    if (requireAutostart && !runtime.startWithRouter) {
+      return { ok: true, skipped: true, reason: "autostart-disabled" };
+    }
-  const preloadModels = buildPreloadModels(config);
-  const args = buildLlamacppLaunchArgs({
-    command: runtime.command,
-    host: runtime.host,
-    port: runtime.port,
-    preloadModels
-  });
+    if (!runtime.command) {
+      const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
+      error(errorMessage);
+      return { ok: false, errorMessage };
+    }
-  return new Promise((resolve) => {
-    let settled = false;
-    const child = spawnImpl(args[0], args.slice(1), {
-      stdio: "ignore"
+    const preloadModels = buildPreloadModels(config);
+    const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
+    const launchProfile = firstModel?.variant && firstModel?.baseModel
+      ? deriveLlamacppLaunchProfile({
+        variant: firstModel.variant,
+        baseModel: firstModel.baseModel,
+        system: detectLlamacppSystemProfile(system)
+      })
+      : null;
+    const args = buildLlamacppLaunchArgs({
+      command: runtime.command,
+      host: runtime.host,
+      port: runtime.port,
+      preloadModels,
+      launchProfile
+    });
+    const variantKey = normalizeString(firstModel?.variant?.key || firstModel?.variantId) || "default";
+    const profileHash = buildRuntimeProfileHash({
+      command: runtime.command,
+      host: runtime.host,
+      port: runtime.port,
+      args: args.slice(1)
+    });
+    const listListeningPidsFn = typeof listListeningPids === "function"
+      ? listListeningPids
+      : (port) => listListeningPidsForPort(port, { spawnSync: spawnSyncImpl });
+    const stopProcessByPidFn = typeof stopProcessByPid === "function"
+      ? stopProcessByPid
+      : (pid) => stopProcessByPidForRuntime(pid);
+    await managedLlamacppRuntimeRegistry.reconcile({
+      listListeningPids: async (port) => normalizeListeningPidResult(await listListeningPidsFn(port)),
+      stopProcessByPid: async (pid) => stopProcessByPidFn(pid)
     });
-    const finish = (result) => {
-      if (settled) return;
-      settled = true;
-      resolve(result);
-    };
+    const existing = managedLlamacppRuntimeRegistry
+      .snapshot()
+      .find((instance) => (
+        instance.variantKey === variantKey
+        && instance.profileHash === profileHash
+        && isManagedRuntimeAlive(instance)
+      ));
+    if (existing) {
+      return { ok: true, alreadyRunning: true, runtime: existing };
+    }
-    child.once("spawn", () => {
-      managedLlamacppRuntime = {
-        child,
-        command: runtime.command,
-        host: runtime.host,
-        port: runtime.port,
-        args
-      };
-      child.once("exit", () => {
-        if (managedLlamacppRuntime?.child === child) {
-          managedLlamacppRuntime = null;
-        }
+    const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
+    if (!validation.ok) {
+      error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
+      return validation;
+    }
+    try {
+      const managedRuntime = await managedLlamacppRuntimeRegistry.ensureRuntimeForVariant({
+        variantKey,
+        profileHash,
+        launchArgs: args.slice(1),
+        preferredPort: runtime.port
+      }, {
+        spawnRuntime: async ({ port }) => new Promise((resolve, reject) => {
+          let settled = false;
+          const allocatedArgs = buildLlamacppLaunchArgs({
+            command: runtime.command,
+            host: runtime.host,
+            port,
+            preloadModels,
+            launchProfile
+          });
+          const child = spawnImpl(allocatedArgs[0], allocatedArgs.slice(1), {
+            stdio: "ignore"
+          });
+          const expectedInstanceId = `${variantKey}:${profileHash}:${port}`;
+          if (child && child.__llamacppManagedExitHookAttached !== true) {
+            child.__llamacppManagedExitHookAttached = true;
+            child.once("exit", () => {
+              void managedLlamacppRuntimeRegistry.untrackInstance(expectedInstanceId);
+            });
+          }
+          const settleResolve = (value) => {
+            if (settled) return;
+            settled = true;
+            resolve(value);
+          };
+          const settleReject = (reason) => {
+            if (settled) return;
+            settled = true;
+            reject(reason);
+          };
+          child.once("spawn", () => {
+            if (typeof child.unref === "function") child.unref();
+            settleResolve({
+              pid: child?.pid,
+              child,
+              command: runtime.command,
+              host: runtime.host,
+              port,
+              args: allocatedArgs,
+              baseUrl: `http://${runtime.host}:${port}/v1`
+            });
+          });
+          child.once("error", (spawnError) => {
+            settleReject(spawnError);
+          });
+        }),
+        waitForHealthy: async (instance) => instance
       });
-      if (typeof child.unref === "function") child.unref();
-      line(`Started llama.cpp runtime on http://${runtime.host}:${runtime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
-      finish({ ok: true, runtime: managedLlamacppRuntime, validation });
-    });
-    child.once("error", (spawnError) => {
+      line(`Started llama.cpp runtime on http://${managedRuntime.host}:${managedRuntime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
+      return { ok: true, runtime: managedRuntime, validation };
+    } catch (spawnError) {
       const errorMessage = spawnError instanceof Error ? spawnError.message : String(spawnError);
       error(`Failed starting llama.cpp runtime: ${errorMessage}`);
-      finish({ ok: false, errorMessage });
-    });
-  });
+      return { ok: false, errorMessage };
+    }
+  } finally {
+    inFlightConfiguredStartCount = Math.max(0, inFlightConfiguredStartCount - 1);
+  }
 }
 export async function ensureConfiguredLlamacppRuntimeStarted(config, callbacks = {}, deps = {}) {
@@ -287,23 +422,66 @@ export async function startConfiguredLlamacppRuntime(config, callbacks = {}, dep
   }, deps);
 }
+export function getManagedLlamacppRuntimeSnapshot() {
+  return managedLlamacppRuntimeRegistry.snapshot().map((instance) => {
+    const { child: _child, ...rest } = instance || {};
+    return JSON.parse(JSON.stringify(rest));
+  });
+}
 export async function stopManagedLlamacppRuntime({
   line = () => {},
   error = () => {}
 } = {}) {
-  const active = managedLlamacppRuntime;
-  if (!active?.child) {
+  while (inFlightConfiguredStartCount > 0) {
+    await delay(0);
+  }
+  if (typeof managedLlamacppRuntimeRegistry.waitForInFlightStarts === "function") {
+    await managedLlamacppRuntimeRegistry.waitForInFlightStarts();
+  }
+  const instances = managedLlamacppRuntimeRegistry.snapshot();
+  if (instances.length === 0) {
     return { ok: true, skipped: true, reason: "not-running" };
   }
-  managedLlamacppRuntime = null;
-  try {
-    active.child.kill("SIGTERM");
-    line("Stopped managed llama.cpp runtime.");
-    return { ok: true };
-  } catch (stopError) {
-    const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
-    error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
-    return { ok: false, errorMessage };
+  const failures = [];
+  let stoppedCount = 0;
+  let pendingExitCount = 0;
+  for (const instance of instances) {
+    try {
+      if (instance?.owner === "llm-router" && typeof instance?.child?.kill === "function") {
+        const killResult = instance.child.kill("SIGTERM");
+        if (killResult !== false) {
+          stoppedCount += 1;
+        }
+      }
+      if (!isManagedRuntimeAlive(instance)) {
+        await managedLlamacppRuntimeRegistry.untrackInstance(instance?.instanceId);
+      } else {
+        pendingExitCount += 1;
+      }
+    } catch (stopError) {
+      const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
+      failures.push(errorMessage);
+      error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
+    }
+  }
+  if (stoppedCount > 0) {
+    if (pendingExitCount === 0) {
+      line(stoppedCount === 1 ? "Stopped managed llama.cpp runtime." : `Stopped ${stoppedCount} managed llama.cpp runtimes.`);
+    } else {
+      line(stoppedCount === 1
+        ? "Stop signal sent to managed llama.cpp runtime; waiting for exit."
+        : `Stop signal sent to ${stoppedCount} managed llama.cpp runtimes; waiting for exits.`);
+    }
   }
+  const completed = failures.length === 0 && pendingExitCount === 0;
+  return {
+    ok: completed,
+    stoppedCount,
+    pendingExitCount,
+    ...(failures.length > 0 ? { errorMessage: failures.join("; ") } : {})
+  };
 }

package/src/node/local-models-service.js CHANGED Viewed

@@ -212,19 +212,42 @@ export async function saveLocalModelVariant(config, draft, {
       activeVariants,
       totalMemoryBytes: system.totalMemoryBytes
     });
-    if (!decision.allowed) {
+  if (!decision.allowed) {
       throw new Error(decision.reason);
     }
   }
+  const normalizedMetadata = normalizeLocalModelsMetadata({
+    variants: {
+      draft: normalizedDraft
+    }
+  });
+  const normalizedVariantDraft = Object.values(normalizedMetadata.variants)[0] || {};
+  const previousVariant = isPlainObject(next.metadata.localModels.variants[key])
+    ? next.metadata.localModels.variants[key]
+    : {};
   next.metadata.localModels.variants[key] = {
-    ...(isPlainObject(next.metadata.localModels.variants[key]) ? next.metadata.localModels.variants[key] : {}),
+    ...previousVariant,
     key,
     baseModelId,
     id: modelId,
     name,
     runtime,
     preset: normalizeString(normalizedDraft.preset),
+    runtimeProfile: runtime === "llamacpp"
+      ? normalizedVariantDraft.runtimeProfile
+      : undefined,
+    runtimeStatus: runtime === "llamacpp"
+      ? (isPlainObject(previousVariant.runtimeStatus)
+        ? previousVariant.runtimeStatus
+        : {
+          activeInstanceId: "",
+          lastFailure: null,
+          lastStartedAt: "",
+          lastHealthyAt: ""
+        })
+      : undefined,
     enabled: normalizedDraft.enabled === true,
     preload: normalizedDraft.preload === true,
     contextWindow: Number.isFinite(Number(normalizedDraft.contextWindow)) ? Number(normalizedDraft.contextWindow) : undefined,

package/src/node/local-server.js CHANGED Viewed

@@ -13,6 +13,10 @@ import { readActivityLogSettings } from "../shared/local-router-defaults.js";
 import { appendActivityLogEntry, resolveActivityLogPath } from "./activity-log.js";
 import { appendLargeRequestLogEntry, resolveLargeRequestLogPath } from "./large-request-log.js";
 import { isLargeRequestLoggingEnabled } from "../runtime/handler/large-request-log.js";
+import {
+  startConfiguredLlamacppRuntime,
+  stopManagedLlamacppRuntime
+} from "./llamacpp-runtime.js";
 const DEFAULT_CONFIG_RELOAD_DEBOUNCE_MS = 300;
 const MAX_CONFIG_RELOAD_DEBOUNCE_MS = 5000;
@@ -34,6 +38,10 @@ function formatError(error) {
   return error instanceof Error ? error.message : String(error);
 }
+function normalizeString(value) {
+  return typeof value === "string" ? value.trim() : "";
+}
 function createLiveConfigStore({
   configPath,
   watchConfig = true,
@@ -237,6 +245,39 @@ async function writeFetchResponseToNode(res, response) {
   readable.pipe(res);
 }
+function buildVariantLlamacppRuntimeConfig(config, variantKey) {
+  const normalizedVariantKey = normalizeString(variantKey);
+  const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
+  const variants = config?.metadata?.localModels?.variants;
+  const library = config?.metadata?.localModels?.library;
+  const variant = variants?.[normalizedVariantKey];
+  if (!runtime || !variant || variant.runtime !== "llamacpp") return null;
+  const baseModelId = normalizeString(variant?.baseModelId);
+  const baseModel = library?.[baseModelId];
+  if (!baseModel) return null;
+  return {
+    metadata: {
+      localModels: {
+        runtime: {
+          llamacpp: { ...runtime }
+        },
+        library: {
+          [baseModelId]: { ...baseModel }
+        },
+        variants: {
+          [normalizedVariantKey]: {
+            ...variant,
+            enabled: true,
+            preload: true
+          }
+        }
+      }
+    }
+  };
+}
 export async function startLocalRouteServer({
   port = FIXED_LOCAL_ROUTER_PORT,
   host = FIXED_LOCAL_ROUTER_HOST,
@@ -248,7 +289,10 @@ export async function startLocalRouteServer({
   validateConfig,
   onConfigReload,
   onConfigReloadError,
-  requireAuth = false
+  requireAuth = false,
+  createFetchHandlerImpl = createFetchHandler,
+  startConfiguredLlamacppRuntimeImpl = startConfiguredLlamacppRuntime,
+  stopManagedLlamacppRuntimeImpl = stopManagedLlamacppRuntime
 } = {}) {
   const reloadDebounceMs = resolveReloadDebounceMs(configReloadDebounceMs);
   const resolvedActivityLogPath = resolveActivityLogPath(configPath, activityLogPath);
@@ -270,9 +314,22 @@ export async function startLocalRouteServer({
   const initialConfig = await configStore.getConfig();
   activityLogEnabled = readActivityLogSettings(initialConfig).enabled;
-  const fetchHandler = createFetchHandler({
+  const fetchHandler = createFetchHandlerImpl({
     ignoreAuth: !requireAuth,
+    runtime: "node",
     getConfig: () => configStore.getConfig(),
+    resolveLocalRuntimeBaseUrl: async ({ candidate }) => {
+      const variantKey = candidate?.model?.metadata?.localVariantKey;
+      const config = await configStore.getConfig();
+      const targetedConfig = buildVariantLlamacppRuntimeConfig(config, variantKey);
+      if (!targetedConfig) return "";
+      const started = await startConfiguredLlamacppRuntimeImpl(targetedConfig);
+      if (!started?.ok) {
+        throw new Error(started?.errorMessage || `Failed starting local runtime for ${normalizeString(variantKey) || "unknown variant"}.`);
+      }
+      return normalizeString(started?.runtime?.baseUrl);
+    },
     defaultStateStoreBackend: "file",
     onActivityLog: (entry) => {
       if (!activityLogEnabled) return;
@@ -355,6 +412,7 @@ export async function startLocalRouteServer({
   server.close = (callback) => {
     shuttingDown = true;
     Promise.resolve()
+      .then(() => stopManagedLlamacppRuntimeImpl().catch(() => {}))
       .then(() => configStore.close())
       .then(() => (typeof fetchHandler.close === "function" ? fetchHandler.close() : undefined))
       .finally(() => {