@khanglvm/llm-router 2.4.1 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/README.md +12 -0
- package/package.json +1 -1
- package/src/node/huggingface-gguf.js +273 -0
- package/src/node/llamacpp-runtime.js +309 -0
- package/src/node/local-model-browser.js +132 -0
- package/src/node/local-model-capacity.js +39 -0
- package/src/node/local-models-service.js +238 -0
- package/src/node/start-command.js +12 -0
- package/src/node/web-console-client.js +27 -27
- package/src/node/web-console-server.js +575 -0
- package/src/node/web-console-styles.generated.js +1 -1
- package/src/node/web-console-ui/api-client.js +94 -0
- package/src/node/web-console-ui/local-models-utils.js +138 -0
- package/src/runtime/config.js +22 -7
- package/src/runtime/handler/provider-translation.js +5 -5
- package/src/runtime/local-models.js +168 -0
- package/src/translator/response/openai-to-claude.js +70 -9
package/CHANGELOG.md
CHANGED

@@ -7,6 +7,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [2.5.1] - 2026-04-23
+
+### Fixed
+- Relaxed the live Claude Code publish smoke check so short affirmative routed replies such as `OK` or `好的` no longer fail `npm publish` when the end-to-end router path is otherwise healthy.
+
+## [2.5.0] - 2026-04-23
+
+### Added
+- Local Models can now use a native macOS file/folder picker to attach GGUF files in place, scan a selected folder recursively for GGUF artifacts, and browse directly to a local `llama-server` runtime binary.
+
+### Changed
+- Hugging Face GGUF search results for Local Models now rank quantizations more intelligently, show tighter Mac memory-fit guidance, and call out better long-context download choices for 64 GB Macs.
+- `llama.cpp` runtime detection now searches common local source-build locations in addition to `PATH` and Homebrew installs, and server validation now recognizes more `llama-server` help output variants including TurboQuant builds.
+
+### Fixed
+- OpenAI-to-Claude response translation now preserves Anthropic-compatible usage metadata such as `speed`, `service_tier`, cache counters, and tool-usage fields so Claude Code no longer trips over missing `usage.speed` on routed responses.
+
 ## [2.4.1] - 2026-04-19
 
 ### Fixed
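The usage-metadata fix above lands in `package/src/translator/response/openai-to-claude.js` (+70 -9), whose body is not shown in this diff. A hedged sketch of the passthrough idea; field names beyond the two the changelog names (`speed`, `service_tier`) are assumptions based on Anthropic's usage shape, not the package's actual code:

// Illustrative only; not the package's actual translator code.
// Copies Anthropic-compatible usage fields from the upstream response
// onto the translated usage object so fields like `speed` survive.
function preserveAnthropicUsageFields(upstreamUsage = {}, translatedUsage = {}) {
  const passthroughKeys = [
    "speed",                        // named in the changelog
    "service_tier",                 // named in the changelog
    "cache_creation_input_tokens",  // assumed "cache counters"
    "cache_read_input_tokens",      // assumed "cache counters"
    "server_tool_use"               // assumed "tool-usage fields"
  ];
  const merged = { ...translatedUsage };
  for (const key of passthroughKeys) {
    if (upstreamUsage[key] !== undefined && merged[key] === undefined) {
      merged[key] = upstreamUsage[key];
    }
  }
  return merged;
}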
package/README.md
CHANGED

@@ -36,6 +36,18 @@ llr ai-help # agent-oriented setup brief
 - **Deployable** — run locally or deploy to Cloudflare Workers
 - **AI-agent friendly** — full CLI parity with `llr config --operation=...` so agents can configure everything programmatically
 
+## Local Models
+
+Open `llr` and use the **Local Models** tab to manage local inference sources alongside hosted providers.
+
+- **`llama.cpp` runtime** — detect or point at a local `llama-server`, attach GGUF files in place, or download public GGUF artifacts into the router-managed library under `~/.llm-router/local-models`
+- **Native macOS browsing** — use the built-in file picker to choose a single GGUF file, scan a folder recursively for GGUF models, or browse directly to a local `llama-server` binary
+- **Managed + attached model library** — stale or moved files stay visible instead of crashing the app, and can be repaired by locating the file again or removed cleanly
+- **Router-visible local variants** — create friendly model variants with bounded presets, context-window metadata, preload toggles, and Mac unified-memory fit guidance with clearer safe/tight recommendations
+- **Alias-ready local routing** — once saved, local variants behave like normal router models and can be used in aliases, capability flags, and fallback chains
+
+For v1, the managed download flow only searches public Hugging Face GGUF files and the fit guidance is tuned for Macs with unified memory.
+
 ## Local Runtime Reliability
 
 `llr start` keeps a small supervisor bound to the fixed local router port and runs the real router backend behind it on an internal loopback port.
package/src/node/huggingface-gguf.js
ADDED

@@ -0,0 +1,273 @@
+import path from "node:path";
+import { promises as fs } from "node:fs";
+
+const HUGGING_FACE_API_URL = "https://huggingface.co/api/models";
+const HUGGING_FACE_BASE_URL = "https://huggingface.co";
+const POTENTIAL_MODEL_ARTIFACT_PATTERN = /\.(gguf|safetensors|bin|pth|pt)$/i;
+const DEFAULT_EXPECTED_CONTEXT_WINDOW = 200000;
+
+function normalizeString(value) {
+  return typeof value === "string" ? value.trim() : "";
+}
+
+function normalizePositiveNumber(value) {
+  const parsed = Number(value);
+  if (!Number.isFinite(parsed) || parsed <= 0) return undefined;
+  return parsed;
+}
+
+function parseQuantizationFromFileName(fileName) {
+  const match = String(fileName || "").match(/(UD-[A-Z0-9_]+|IQ\d+_[A-Z]+|Q\d+_[A-Z0-9]+|Q\d+_0|MXFP4_MOE|BF16|F16|F32)/i);
+  return match ? match[1].toUpperCase() : "";
+}
+
+function scoreQuantization(fileName) {
+  const quantization = parseQuantizationFromFileName(fileName);
+  if (!quantization) return 0;
+  if (quantization.startsWith("Q5")) return 6;
+  if (quantization.startsWith("IQ")) return 5;
+  if (quantization === "Q4_K_M" || quantization === "Q4_K_S" || quantization.startsWith("Q4")) return 4;
+  if (quantization.startsWith("Q6")) return 3;
+  if (quantization.startsWith("Q8")) return 2;
+  if (quantization === "BF16" || quantization === "F16" || quantization === "F32") return 1;
+  return 1;
+}
+
+function buildCompatibilityBadges(fileName, fit, recommendation = "") {
+  const badges = [];
+  if (/\.gguf$/i.test(fileName)) badges.push("GGUF");
+  badges.push("llama.cpp");
+  if (fit === "safe") badges.push("Mac OK");
+  else if (fit === "tight") badges.push("Mac Tight");
+  else badges.push("Mac review");
+  if (/best fit/i.test(recommendation)) badges.push("Best fit");
+  return badges;
+}
+
+function isPotentialModelArtifact(fileName) {
+  return POTENTIAL_MODEL_ARTIFACT_PATTERN.test(String(fileName || ""));
+}
+
+function encodePathSegments(rawPath) {
+  return String(rawPath || "")
+    .split("/")
+    .filter(Boolean)
+    .map((segment) => encodeURIComponent(segment))
+    .join("/");
+}
+
+function extractHuggingFaceFiles(models = []) {
+  const files = [];
+  for (const model of Array.isArray(models) ? models : []) {
+    const repo = normalizeString(model?.id || model?.modelId);
+    if (!repo) continue;
+    for (const sibling of Array.isArray(model?.siblings) ? model.siblings : []) {
+      const file = normalizeString(sibling?.rfilename);
+      if (!file || !isPotentialModelArtifact(file)) continue;
+      files.push({
+        repo,
+        file,
+        size: normalizePositiveNumber(sibling?.size) ?? normalizePositiveNumber(sibling?.lfs?.size),
+        downloads: normalizePositiveNumber(model?.downloads) || 0,
+        likes: normalizePositiveNumber(model?.likes) || 0,
+        gguf: model?.gguf || undefined,
+        private: model?.private === true,
+        gated: model?.gated === true
+      });
+    }
+  }
+  return files;
+}
+
+export function classifyGgufCandidateForMac(candidate, { totalMemoryBytes } = {}) {
+  const fileName = normalizeString(candidate?.file || candidate?.rfilename);
+  const sizeBytes = normalizePositiveNumber(candidate?.sizeBytes ?? candidate?.size);
+  const expectedContextWindow = normalizePositiveNumber(candidate?.expectedContextWindow) || DEFAULT_EXPECTED_CONTEXT_WINDOW;
+
+  if (!/\.gguf$/i.test(fileName)) {
+    return {
+      fit: "unsupported",
+      disabled: true,
+      reason: "Not a GGUF file",
+      recommendation: "Unsupported for llama.cpp in v1."
+    };
+  }
+
+  if (sizeBytes && totalMemoryBytes && sizeBytes > Number(totalMemoryBytes) * 0.85) {
+    return {
+      fit: "over-budget",
+      disabled: true,
+      reason: "Too large for this Mac",
+      recommendation: "Skip this one on a 64 GB Mac."
+    };
+  }
+
+  if (!sizeBytes || !totalMemoryBytes) {
+    return {
+      fit: "unknown",
+      disabled: false,
+      reason: "",
+      recommendation: "Review memory fit manually before download."
+    };
+  }
+
+  const memoryRatio = sizeBytes / Number(totalMemoryBytes);
+  const quantScore = scoreQuantization(fileName);
+
+  if (expectedContextWindow >= 200000 && memoryRatio >= 0.5) {
+    return {
+      fit: "tight",
+      disabled: false,
+      reason: "200K context will be tight on this Mac",
+      recommendation: quantScore >= 2
+        ? "200K context needs review on a 64 GB Mac."
+        : "Large context and heavy quantization choice need review."
+    };
+  }
+
+  if (memoryRatio >= 0.4) {
+    return {
+      fit: "tight",
+      disabled: false,
+      reason: "Fits, but leaves limited unified memory headroom",
+      recommendation: "Reasonable fit, but memory headroom will be tight."
+    };
+  }
+
+  return {
+    fit: "safe",
+    disabled: false,
+    reason: "",
+    recommendation: quantScore >= 4
+      ? "Best fit for a 64 GB Mac and long-context testing."
+      : "Fits this Mac comfortably."
+  };
+}
+
+export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
+  const results = (Array.isArray(files) ? files : []).map((entry) => {
+    const file = normalizeString(entry?.file || entry?.rfilename);
+    const sizeBytes = normalizePositiveNumber(entry?.sizeBytes ?? entry?.size);
+    const status = classifyGgufCandidateForMac({
+      file,
+      sizeBytes,
+      expectedContextWindow: systemInfo?.expectedContextWindow
+    }, systemInfo);
+    const quantization = parseQuantizationFromFileName(file);
+    const fitScore = status.fit === "safe" ? 30 : status.fit === "tight" ? 15 : status.fit === "unknown" ? 8 : -20;
+    const rankingScore = fitScore
+      + (status.disabled ? -100 : 0)
+      + (scoreQuantization(file) * 10)
+      + Math.min(15, Math.log10(Number(entry?.downloads || 0) + 1) * 4)
+      + Math.min(8, Math.log10(Number(entry?.likes || 0) + 1) * 3)
+      - Math.min(12, (sizeBytes || 0) / (1024 ** 3));
+    return {
+      repo: normalizeString(entry?.repo || entry?.id || entry?.modelId),
+      file,
+      quantization,
+      sizeBytes,
+      disabled: status.disabled,
+      disabledReason: status.reason,
+      fit: status.fit,
+      recommendation: status.recommendation,
+      badges: buildCompatibilityBadges(file, status.fit, status.recommendation),
+      rankingScore
+    };
+  });
+
+  return results.sort((left, right) => {
+    if (right.rankingScore !== left.rankingScore) return right.rankingScore - left.rankingScore;
+    return String(left.file || "").localeCompare(String(right.file || ""));
+  });
+}
+
+export async function searchHuggingFaceGgufCandidates(query, {
+  limit = 20,
+  totalMemoryBytes,
+  expectedContextWindow = DEFAULT_EXPECTED_CONTEXT_WINDOW,
+  fetchImpl = fetch
+} = {}) {
+  const search = normalizeString(query);
+  const url = new URL(HUGGING_FACE_API_URL);
+  if (search) url.searchParams.set("search", search);
+  url.searchParams.set("limit", String(Math.max(1, Math.min(50, Number(limit) || 20))));
+  for (const field of ["siblings", "gguf", "downloads", "likes", "gated", "private"]) {
+    url.searchParams.append("expand[]", field);
+  }
+
+  const response = await fetchImpl(url, {
+    headers: {
+      accept: "application/json"
+    }
+  });
+  if (!response.ok) {
+    throw new Error(`Hugging Face search failed (${response.status}).`);
+  }
+
+  const payload = await response.json();
+  return shapeHuggingFaceGgufResults(
+    extractHuggingFaceFiles(payload),
+    { totalMemoryBytes, expectedContextWindow }
+  );
+}
+
+export function buildHuggingFaceFileDownloadUrl(repo, file) {
+  const normalizedRepo = encodePathSegments(repo);
+  const normalizedFile = encodePathSegments(file);
+  return `${HUGGING_FACE_BASE_URL}/${normalizedRepo}/resolve/main/${normalizedFile}?download=true`;
+}
+
+export async function downloadManagedHuggingFaceGguf({
+  repo,
+  file,
+  destinationPath
+} = {}, {
+  fetchImpl = fetch,
+  onProgress = () => {}
+} = {}) {
+  const targetRepo = normalizeString(repo);
+  const targetFile = normalizeString(file);
+  const outputPath = normalizeString(destinationPath);
+  if (!targetRepo || !targetFile || !outputPath) {
+    throw new Error("repo, file, and destinationPath are required.");
+  }
+
+  const url = buildHuggingFaceFileDownloadUrl(targetRepo, targetFile);
+  const response = await fetchImpl(url, {
+    headers: {
+      accept: "application/octet-stream"
+    }
+  });
+  if (!response.ok || !response.body) {
+    throw new Error(`Hugging Face download failed (${response.status}).`);
+  }
+
+  await fs.mkdir(path.dirname(outputPath), { recursive: true });
+  const tempPath = `${outputPath}.part`;
+  const fileHandle = await fs.open(tempPath, "w");
+  const totalBytes = normalizePositiveNumber(response.headers.get("content-length"));
+  let receivedBytes = 0;
+
+  try {
+    const reader = response.body.getReader();
+    while (true) {
+      const { value, done } = await reader.read();
+      if (done) break;
+      const chunk = value || new Uint8Array();
+      if (chunk.byteLength > 0) {
+        await fileHandle.write(chunk);
+        receivedBytes += chunk.byteLength;
+        onProgress({ receivedBytes, totalBytes });
+      }
+    }
+  } finally {
+    await fileHandle.close();
+  }
+
+  await fs.rename(tempPath, outputPath);
+  return {
+    filePath: outputPath,
+    sizeBytes: receivedBytes || totalBytes || undefined,
+    downloadUrl: url
+  };
+}
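A minimal usage sketch for the module above (not part of the package): the 64 GB memory figure, the search query, and the destination path are illustrative, with the path following the `~/.llm-router/local-models` convention from the README:

import path from "node:path";
import os from "node:os";
import {
  searchHuggingFaceGgufCandidates,
  downloadManagedHuggingFaceGguf
} from "./huggingface-gguf.js";

// Rank public GGUF files for a 64 GB unified-memory Mac (value is illustrative).
const results = await searchHuggingFaceGgufCandidates("qwen", {
  limit: 20,
  totalMemoryBytes: 64 * 1024 ** 3,
  expectedContextWindow: 200000
});

// Take the highest-ranked candidate that is not disabled and download it
// into the router-managed library, logging rough progress along the way.
const best = results.find((candidate) => !candidate.disabled);
if (best) {
  const destinationPath = path.join(
    os.homedir(), ".llm-router", "local-models", path.basename(best.file)
  );
  await downloadManagedHuggingFaceGguf({ repo: best.repo, file: best.file, destinationPath }, {
    onProgress: ({ receivedBytes, totalBytes }) => {
      if (totalBytes) console.log(`${Math.round((receivedBytes / totalBytes) * 100)}%`);
    }
  });
}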
package/src/node/llamacpp-runtime.js
ADDED

@@ -0,0 +1,309 @@
+import path from "node:path";
+import os from "node:os";
+import { existsSync } from "node:fs";
+import { spawn, spawnSync } from "node:child_process";
+
+export const LLAMACPP_DEFAULT_HOST = "127.0.0.1";
+export const LLAMACPP_DEFAULT_PORT = 39391;
+const LLAMACPP_EXECUTABLE = "llama-server";
+const FALLBACK_LLAMACPP_PATHS = Object.freeze([
+  "/opt/homebrew/bin/llama-server",
+  "/usr/local/bin/llama-server"
+]);
+const COMMON_SOURCE_BUILD_PATHS = Object.freeze([
+  "src/llama-cpp/build/bin/llama-server",
+  "src/llama.cpp/build/bin/llama-server",
+  "src/llama-cpp-turboquant/build/bin/llama-server",
+  "src/llama.cpp-turboquant/build/bin/llama-server"
+]);
+
+let managedLlamacppRuntime = null;
+
+function isPlainObject(value) {
+  return Boolean(value) && typeof value === "object" && !Array.isArray(value);
+}
+
+function normalizeString(value) {
+  return typeof value === "string" ? value.trim() : "";
+}
+
+function normalizePort(value, fallback = LLAMACPP_DEFAULT_PORT) {
+  const parsed = Number(value);
+  if (!Number.isInteger(parsed) || parsed <= 0 || parsed > 65535) return fallback;
+  return parsed;
+}
+
+function normalizePathEntries(entries) {
+  return Array.isArray(entries)
+    ? entries.map((entry) => normalizeString(entry)).filter(Boolean)
+    : [];
+}
+
+function readConfiguredLlamacppRuntime(config) {
+  const runtime = config?.metadata?.localModels?.runtime?.llamacpp;
+  if (!isPlainObject(runtime)) {
+    return {
+      startWithRouter: false,
+      command: "",
+      host: LLAMACPP_DEFAULT_HOST,
+      port: LLAMACPP_DEFAULT_PORT
+    };
+  }
+
+  return {
+    startWithRouter: runtime.startWithRouter === true,
+    command: normalizeString(runtime.selectedCommand || runtime.manualCommand || runtime.command || runtime.path),
+    host: normalizeString(runtime.host) || LLAMACPP_DEFAULT_HOST,
+    port: normalizePort(runtime.port, LLAMACPP_DEFAULT_PORT)
+  };
+}
+
+function buildPreloadModels(config) {
+  const library = config?.metadata?.localModels?.library;
+  const variants = config?.metadata?.localModels?.variants;
+  if (!isPlainObject(library) || !isPlainObject(variants)) return [];
+
+  const preloadModels = [];
+  for (const variant of Object.values(variants)) {
+    if (!isPlainObject(variant)) continue;
+    if (variant.runtime !== "llamacpp" || variant.preload !== true || variant.enabled !== true) continue;
+    const baseModel = library[variant.baseModelId];
+    const modelPath = normalizeString(baseModel?.path);
+    if (!modelPath) continue;
+    preloadModels.push({
+      variantId: normalizeString(variant.id),
+      modelPath,
+      contextWindow: Number.isFinite(Number(variant.contextWindow)) ? Number(variant.contextWindow) : undefined
+    });
+  }
+  return preloadModels;
+}
+
+export function detectLlamacppCandidates({
+  envPathEntries = process.env.PATH?.split(path.delimiter) || [],
+  homeDir = os.homedir(),
+  existingPaths = null
+} = {}) {
+  const seen = new Set();
+  const candidates = [];
+  const searchTargets = [
+    ...normalizePathEntries(envPathEntries).map((entry) => ({
+      path: path.join(entry, LLAMACPP_EXECUTABLE),
+      source: "path"
+    })),
+    ...FALLBACK_LLAMACPP_PATHS.map((entry) => ({
+      path: entry,
+      source: "homebrew"
+    })),
+    ...COMMON_SOURCE_BUILD_PATHS.map((entry) => ({
+      path: path.join(homeDir, entry),
+      source: "source-build"
+    }))
+  ];
+
+  for (const target of searchTargets) {
+    const candidatePath = normalizeString(target.path);
+    if (seen.has(candidatePath)) continue;
+    seen.add(candidatePath);
+    const exists = existingPaths instanceof Set ? existingPaths.has(candidatePath) : existsSync(candidatePath);
+    if (!exists) continue;
+    candidates.push({
+      id: candidatePath,
+      label: candidatePath,
+      path: candidatePath,
+      source: target.source
+    });
+  }
+
+  return candidates;
+}
+
+export function buildLlamacppLaunchArgs({
+  command,
+  host = LLAMACPP_DEFAULT_HOST,
+  port = LLAMACPP_DEFAULT_PORT,
+  preloadModels = []
+} = {}) {
+  const firstModel = Array.isArray(preloadModels) ? preloadModels[0] : null;
+  const args = [
+    normalizeString(command),
+    "--host", normalizeString(host) || LLAMACPP_DEFAULT_HOST,
+    "--port", String(normalizePort(port, LLAMACPP_DEFAULT_PORT))
+  ];
+
+  if (firstModel?.modelPath) {
+    args.push("-m", firstModel.modelPath);
+    if (Number.isFinite(Number(firstModel.contextWindow)) && Number(firstModel.contextWindow) > 0) {
+      args.push("-c", String(Math.floor(Number(firstModel.contextWindow))));
+    }
+  }
+
+  return args.filter(Boolean);
+}
+
+export function parseLlamacppValidationOutput(output = "") {
+  const text = String(output || "").trim();
+  const lowered = text.toLowerCase();
+  const supportsHost = /(^|\s)--host(\s|$)/m.test(text);
+  const supportsPort = /(^|\s)--port(\s|$)/m.test(text);
+  const referencesModelFlag = /(^|\s)(-m,\s+)?--model(\s|$)/m.test(text);
+  const looksLikeServerHelp = supportsHost && supportsPort && referencesModelFlag;
+  const kind = lowered.includes("llama-server") || looksLikeServerHelp ? "server" : "";
+
+  return {
+    ok: Boolean(kind) && supportsHost && supportsPort,
+    kind,
+    supportsHost,
+    supportsPort,
+    isTurboQuant: lowered.includes("turboquant") || /\bturbo[234]\b/.test(lowered)
+  };
+}
+
+export function validateLlamacppCommand(command, { spawnSyncImpl = spawnSync } = {}) {
+  const target = normalizeString(command);
+  if (!target) {
+    return {
+      ok: false,
+      errorMessage: "No llama.cpp command is configured."
+    };
+  }
+
+  const result = spawnSyncImpl(target, ["--help"], {
+    encoding: "utf8"
+  });
+  if (result?.error) {
+    return {
+      ok: false,
+      errorMessage: result.error instanceof Error ? result.error.message : String(result.error)
+    };
+  }
+
+  const parsed = parseLlamacppValidationOutput(`${result?.stdout || ""}\n${result?.stderr || ""}`);
+  if (!parsed.ok) {
+    return {
+      ok: false,
+      errorMessage: `Command '${target}' does not appear to be a compatible llama-server binary.`,
+      ...parsed
+    };
+  }
+
+  return {
+    ok: true,
+    ...parsed
+  };
+}
+
+async function startConfiguredRuntime(config, {
+  line = () => {},
+  error = () => {},
+  requireAutostart = true
+} = {}, {
+  spawnSyncImpl = spawnSync,
+  spawnImpl = spawn
+} = {}) {
+  const runtime = readConfiguredLlamacppRuntime(config);
+  if (requireAutostart && !runtime.startWithRouter) {
+    return { ok: true, skipped: true, reason: "autostart-disabled" };
+  }
+
+  if (!runtime.command) {
+    const errorMessage = "llama.cpp autostart is enabled, but no runtime command is configured.";
+    error(errorMessage);
+    return { ok: false, errorMessage };
+  }
+
+  if (managedLlamacppRuntime
+    && managedLlamacppRuntime.command === runtime.command
+    && managedLlamacppRuntime.host === runtime.host
+    && managedLlamacppRuntime.port === runtime.port
+    && managedLlamacppRuntime.child?.exitCode === null
+    && managedLlamacppRuntime.child?.killed !== true) {
+    return { ok: true, alreadyRunning: true, runtime: managedLlamacppRuntime };
+  }
+
+  const validation = validateLlamacppCommand(runtime.command, { spawnSyncImpl });
+  if (!validation.ok) {
+    error(validation.errorMessage || `Failed validating llama.cpp runtime '${runtime.command}'.`);
+    return validation;
+  }
+
+  const preloadModels = buildPreloadModels(config);
+  const args = buildLlamacppLaunchArgs({
+    command: runtime.command,
+    host: runtime.host,
+    port: runtime.port,
+    preloadModels
+  });
+
+  return new Promise((resolve) => {
+    let settled = false;
+    const child = spawnImpl(args[0], args.slice(1), {
+      stdio: "ignore"
+    });
+
+    const finish = (result) => {
+      if (settled) return;
+      settled = true;
+      resolve(result);
+    };
+
+    child.once("spawn", () => {
+      managedLlamacppRuntime = {
+        child,
+        command: runtime.command,
+        host: runtime.host,
+        port: runtime.port,
+        args
+      };
+      child.once("exit", () => {
+        if (managedLlamacppRuntime?.child === child) {
+          managedLlamacppRuntime = null;
+        }
+      });
+      if (typeof child.unref === "function") child.unref();
+      line(`Started llama.cpp runtime on http://${runtime.host}:${runtime.port}${validation.isTurboQuant ? " (TurboQuant detected)" : ""}.`);
+      finish({ ok: true, runtime: managedLlamacppRuntime, validation });
+    });
+
+    child.once("error", (spawnError) => {
+      const errorMessage = spawnError instanceof Error ? spawnError.message : String(spawnError);
+      error(`Failed starting llama.cpp runtime: ${errorMessage}`);
+      finish({ ok: false, errorMessage });
+    });
+  });
+}
+
+export async function ensureConfiguredLlamacppRuntimeStarted(config, callbacks = {}, deps = {}) {
+  return startConfiguredRuntime(config, {
+    ...callbacks,
+    requireAutostart: true
+  }, deps);
+}
+
+export async function startConfiguredLlamacppRuntime(config, callbacks = {}, deps = {}) {
+  return startConfiguredRuntime(config, {
+    ...callbacks,
+    requireAutostart: false
+  }, deps);
+}
+
+export async function stopManagedLlamacppRuntime({
+  line = () => {},
+  error = () => {}
+} = {}) {
+  const active = managedLlamacppRuntime;
+  if (!active?.child) {
+    return { ok: true, skipped: true, reason: "not-running" };
+  }
+
+  managedLlamacppRuntime = null;
+  try {
+    active.child.kill("SIGTERM");
+    line("Stopped managed llama.cpp runtime.");
+    return { ok: true };
+  } catch (stopError) {
+    const errorMessage = stopError instanceof Error ? stopError.message : String(stopError);
+    error(`Failed stopping llama.cpp runtime: ${errorMessage}`);
+    return { ok: false, errorMessage };
+  }
+}
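A companion sketch for the runtime module (not part of the package); the GGUF path and context size passed as a preload model are hypothetical:

import {
  detectLlamacppCandidates,
  validateLlamacppCommand,
  buildLlamacppLaunchArgs
} from "./llamacpp-runtime.js";

// Scan PATH, Homebrew locations, and common source-build directories.
const [candidate] = detectLlamacppCandidates();
if (candidate) {
  // Runs `llama-server --help` and checks for --host/--port/--model support.
  const validation = validateLlamacppCommand(candidate.path);
  if (validation.ok) {
    const args = buildLlamacppLaunchArgs({
      command: candidate.path,
      preloadModels: [{
        modelPath: "/path/to/model.gguf", // hypothetical GGUF path
        contextWindow: 131072             // hypothetical context size
      }]
    });
    console.log(args.join(" "));
    // e.g. <command> --host 127.0.0.1 --port 39391 -m /path/to/model.gguf -c 131072
  }
}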