npm - metrillm-mcp - Versions diffs - 0.2.0 → 0.2.2 - Mend

metrillm-mcp 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js CHANGED Viewed

@@ -28,10 +28,22 @@ import { Ollama } from "ollama";
 // ../src/utils.ts
 import vm from "vm";
-import { execFile } from "child_process";
+import { execFile, spawn } from "child_process";
 function openUrl(url) {
-  const cmd = process.platform === "darwin" ? "open" : process.platform === "win32" ? "start" : "xdg-open";
-  execFile(cmd, [url]);
+  if (process.platform === "win32") {
+    const child2 = spawn("cmd", ["/c", "start", "", url], {
+      windowsHide: true,
+      stdio: "ignore"
+    });
+    child2.on("error", () => {
+    });
+    child2.unref();
+    return;
+  }
+  const cmd = process.platform === "darwin" ? "open" : "xdg-open";
+  const child = execFile(cmd, [url]);
+  child.on("error", () => {
+  });
 }
 function avg(nums) {
   if (nums.length === 0) return 0;
@@ -110,7 +122,8 @@ function stripThinkTags(text) {
 }
 function hasThinkingContent(response, thinkingField) {
   if (thinkingField && thinkingField.trim().length > 0) return true;
-  return /<think(?:ing)?[\s>]/i.test(response);
+  if (/<think(?:ing)?[\s>]/i.test(response)) return true;
+  return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response);
 }
 function estimateTokenCount(text) {
   if (!text) return 0;
@@ -514,7 +527,8 @@ function extractCodeBlock(text, preferredFunctionName) {
 var client = new Ollama();
 var DEFAULT_OLLAMA_HOST = "http://127.0.0.1:11434";
 var OLLAMA_INIT_TIMEOUT_MS = 12e4;
-var STREAM_STALL_TIMEOUT_MS = 3e4;
+var DEFAULT_STREAM_STALL_TIMEOUT_MS = 3e4;
+var SHARED_STREAM_STALL_TIMEOUT_ENV = "METRILLM_STREAM_STALL_TIMEOUT_MS";
 function getOllamaBaseUrl() {
   const configured = process.env.OLLAMA_HOST?.trim();
   if (!configured) return DEFAULT_OLLAMA_HOST;
@@ -564,35 +578,81 @@ var defaultKeepAlive;
 /**
  * Stores the keep-alive value in the module-level `defaultKeepAlive` variable.
  * @param {*} keepAlive - Value to remember; it is assigned as-is, without validation.
  */
 function setDefaultKeepAlive(keepAlive) {
   defaultKeepAlive = keepAlive;
 }
+/**
+ * Reports whether the caller supplied any optional sampling parameter.
+ * @param {object} [options] - Generation options; may be undefined or null.
+ * @returns {boolean} True when `top_p` or `seed` is anything other than undefined.
+ */
+function hasSamplingOverrides(options) {
+  const samplingKeys = ["top_p", "seed"];
+  return samplingKeys.some((key) => options?.[key] !== void 0);
+}
+/**
+ * Heuristically decides whether an error complains about a sampling option.
+ * The lower-cased message must both name a sampling option (`seed`, `top_p`,
+ * `topp`) and contain one of the rejection keywords.
+ * @param {unknown} err - Thrown value; non-Error values are stringified.
+ * @returns {boolean} True when both conditions hold.
+ */
+function isUnsupportedSamplingOptionError(err) {
+  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
+  const namesSamplingOption = /\b(seed|top_p|topp)\b/.test(text);
+  if (!namesSamplingOption) return false;
+  const signalsRejection = /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(text);
+  return signalsRejection;
+}
+/**
+ * Parses a string made only of decimal digits into a safe integer.
+ * @param {string} value - Candidate text (no sign, whitespace, or decimals allowed).
+ * @returns {number|null} The parsed integer, or null when the text is not
+ *   all digits or exceeds the safe-integer range.
+ */
+function parseNonNegativeInt(value) {
+  const isDigitsOnly = /^\d+$/.test(value);
+  if (!isDigitsOnly) return null;
+  const numeric = Number.parseInt(value, 10);
+  return Number.isSafeInteger(numeric) && numeric >= 0 ? numeric : null;
+}
+/**
+ * Resolves the stream stall timeout, in milliseconds.
+ *
+ * Precedence: an explicit `override` wins; otherwise the environment variable
+ * named by `SHARED_STREAM_STALL_TIMEOUT_ENV` is consulted; otherwise the
+ * default applies. A value of zero means "no stall timeout" and is reported
+ * as `undefined`.
+ *
+ * @param {number} [override] - Per-call timeout. Non-finite or negative
+ *   values fall back to the default; fractional values are truncated.
+ * @returns {number|undefined} Whole milliseconds, or undefined when disabled.
+ */
+function resolveStreamStallTimeoutMs(override) {
+  if (override !== void 0) {
+    if (!Number.isFinite(override) || override < 0) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
+    // Truncate BEFORE the zero check: a sub-millisecond override such as 0.5
+    // would otherwise slip past `=== 0`, truncate to 0, and arm a 0 ms timer
+    // that aborts the stream immediately instead of disabling the timeout.
+    const truncated = Math.trunc(override);
+    return truncated === 0 ? void 0 : truncated;
+  }
+  const configured = process.env[SHARED_STREAM_STALL_TIMEOUT_ENV]?.trim();
+  if (!configured) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
+  const parsed = parseNonNegativeInt(configured);
+  if (parsed === null) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
+  return parsed === 0 ? void 0 : parsed;
+}
+/**
+ * Assembles the streaming request object passed to `client.generate`.
+ * @param {string} model - Model name.
+ * @param {string} prompt - Prompt text.
+ * @param {object} [options] - Optional `keep_alive`, `think`, `temperature`,
+ *   `top_p`, `seed`, and `num_predict`.
+ * @param {boolean} includeSampling - When falsy, `top_p` and `seed` are
+ *   left out even if the caller supplied them.
+ * @returns {object} Request with `stream: true`, temperature defaulting to 0
+ *   and `num_predict` defaulting to 512.
+ */
+function buildGenerateRequest(model, prompt, options, includeSampling) {
+  // Keys are added in a fixed order so the serialized request is stable.
+  const modelOptions = { temperature: options?.temperature ?? 0 };
+  if (includeSampling && options?.top_p !== void 0) modelOptions.top_p = options.top_p;
+  if (includeSampling && options?.seed !== void 0) modelOptions.seed = options.seed;
+  modelOptions.num_predict = options?.num_predict ?? 512;
+
+  const request = {
+    model,
+    prompt,
+    stream: true,
+    keep_alive: options?.keep_alive ?? defaultKeepAlive
+  };
+  if (options?.think !== void 0) request.think = options.think;
+  request.options = modelOptions;
+  return request;
+}
 /**
  * Runs a generation without streaming callbacks.
  * Delegates to `generateStream`, passing `void 0` as the callbacks argument.
  * @param {string} model - Forwarded unchanged.
  * @param {string} prompt - Forwarded unchanged.
  * @param {object} [options] - Forwarded unchanged.
  * @returns {Promise<*>} Whatever `generateStream` resolves to.
  */
 async function generate(model, prompt, options) {
   return generateStream(model, prompt, void 0, options);
 }
 async function generateStream(model, prompt, callbacks, options) {
-  const stream = await withTimeout(
-    client.generate({
-      model,
-      prompt,
-      stream: true,
-      keep_alive: options?.keep_alive ?? defaultKeepAlive,
-      ...options?.think !== void 0 ? { think: options.think } : {},
-      options: {
-        temperature: options?.temperature ?? 0,
-        num_predict: options?.num_predict ?? 512
-      }
-    }),
+  const stallTimeoutMs = resolveStreamStallTimeoutMs(options?.stall_timeout_ms);
+  let abortedByStallTimeout = false;
+  const initializeStream = (includeSampling) => withTimeout(
+    client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
     OLLAMA_INIT_TIMEOUT_MS,
     "Ollama generate initialization"
   );
+  let stream;
+  try {
+    stream = await initializeStream(true);
+  } catch (err) {
+    if (hasSamplingOverrides(options) && isUnsupportedSamplingOptionError(err)) {
+      stream = await initializeStream(false);
+    } else {
+      throw err;
+    }
+  }
   let fullResponse = "";
   let fullThinking = "";
   let result = null;
   let firstChunkSeen = false;
   let stallTimer = null;
   const resetStallTimer = () => {
+    if (stallTimeoutMs === void 0) return;
     if (stallTimer) clearTimeout(stallTimer);
     stallTimer = setTimeout(() => {
+      abortedByStallTimeout = true;
       client.abort();
-    }, STREAM_STALL_TIMEOUT_MS);
+    }, stallTimeoutMs);
   };
   try {
     resetStallTimer();
@@ -627,6 +687,9 @@ async function generateStream(model, prompt, callbacks, options) {
     if (stallTimer) clearTimeout(stallTimer);
   }
   if (!result) {
+    if (abortedByStallTimeout && stallTimeoutMs !== void 0) {
+      throw new Error(`Ollama stream timed out after ${stallTimeoutMs}ms`);
+    }
     throw new Error("Stream ended without done signal");
   }
   callbacks?.onDone?.(result);
@@ -656,42 +719,191 @@ function abortOngoingRequests() {
 import os from "os";
 import path from "path";
 import { promises as fs } from "fs";
+import { execFile as execFile2 } from "child_process";
 var DEFAULT_LM_STUDIO_BASE_URL = "http://127.0.0.1:1234";
 var LM_STUDIO_INIT_TIMEOUT_MS = 15e3;
 var LM_STUDIO_METADATA_TIMEOUT_MS = 2e3;
-var DEFAULT_STREAM_STALL_TIMEOUT_MS = 18e4;
+var DEFAULT_STREAM_STALL_TIMEOUT_MS2 = 3e4;
+var LM_STUDIO_CLI_TIMEOUT_MS = 8e3;
+var SHARED_STREAM_STALL_TIMEOUT_ENV2 = "METRILLM_STREAM_STALL_TIMEOUT_MS";
 var DEFAULT_LM_STUDIO_HOME_DIR = path.join(os.homedir(), ".lmstudio");
 var DEFAULT_LM_STUDIO_MODELS_DIR = path.join(DEFAULT_LM_STUDIO_HOME_DIR, "models");
 var LM_STUDIO_HOME_DIR_ENV = "LM_STUDIO_HOME_DIR";
 var LM_STUDIO_MODELS_DIR_ENV = "LM_STUDIO_MODELS_DIR";
+var LM_STUDIO_CLI_PATH_ENV = "LM_STUDIO_CLI_PATH";
 var defaultKeepAlive2;
 var activeAbortControllers = /* @__PURE__ */ new Set();
 var directorySizeCache = /* @__PURE__ */ new Map();
 var modelDefinitionCache = /* @__PURE__ */ new Map();
-function buildThinkingConfig(think) {
-  if (think === void 0) return {};
-  const effort = think ? "high" : "low";
+var NON_THINKING_SYSTEM_PROMPT = [
+  "You are in non-thinking mode for benchmark reproducibility.",
+  "Return only the final answer.",
+  "Do not output internal reasoning, chain-of-thought, or scratchpad.",
+  "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process."
+].join(" ");
+/**
+ * Detects plain-text markers of leaked reasoning in a response.
+ * @param {string} response - Model response text.
+ * @returns {boolean} True when the text begins with a "Thinking process:" /
+ *   "Thought process:" heading, or contains a bracketed THINK/THINKING tag
+ *   (opening or closing, any letter case).
+ */
+function hasThinkingLeakText(response) {
+  const startsWithProcessHeading = /^\s*(?:thinking|thought)\s+process\s*:/i.test(response);
+  if (startsWithProcessHeading) return true;
+  return /\[(?:\/)?THINK(?:ING)?\]/i.test(response);
+}
+/**
+ * Throws when non-thinking mode was requested but the output still contains
+ * reasoning. Does nothing unless `think` is exactly `false`.
+ * @param {string} model - Model name, interpolated into the error message.
+ * @param {boolean} [think] - Requested thinking mode.
+ * @param {string} response - Visible response text.
+ * @param {string} reasoning - Separately reported reasoning text.
+ * @throws {Error} When reasoning is non-blank, the response contains a
+ *   `<think>`/`<thinking>` tag, or `hasThinkingLeakText` matches.
+ */
+function assertThinkingModeRespected(model, think, response, reasoning) {
+  const nonThinkingRequested = think === false;
+  if (!nonThinkingRequested) return;
+  const leaked = reasoning.trim().length > 0 || /<think(?:ing)?[\s>]/i.test(response) || hasThinkingLeakText(response);
+  if (!leaked) return;
+  const guidance = [
+    `LM Studio model "${model}" still emitted thinking content while non-thinking mode is requested.`,
+    "In LM Studio, add this at the top of the model chat template: {%- set enable_thinking = false %}.",
+    "If this model does not expose a Prompt/Chat Template editor in LM Studio (e.g. some GPT-OSS builds), non-thinking mode cannot be enforced from the API.",
+    "Use --thinking for this model, or benchmark a model/runtime that supports explicit non-thinking control.",
+    "Then eject/reload the model and run the benchmark again."
+  ];
+  throw new Error(guidance.join(" "));
+}
+/**
+ * Maps the boolean thinking flag to the value sent as the request's
+ * `reasoning` field.
+ * @param {boolean} [think] - Requested thinking mode.
+ * @returns {string|undefined} "high" only when `think` is exactly true.
+ */
+function buildNativeThinkingOption(think) {
+  return think === true ? "high" : void 0;
+}
+/**
+ * Reports whether `top_p` or `seed` was supplied in the options.
+ * @param {object} [options] - Generation options; may be undefined or null.
+ * @returns {boolean} True when either field is not undefined.
+ */
+function hasSamplingOverrides2(options) {
+  if (options?.top_p !== void 0) return true;
+  return options?.seed !== void 0;
+}
+/**
+ * Decides whether an HTTP error response looks like a rejected sampling option.
+ * @param {number} status - HTTP status; only 400 and 422 qualify.
+ * @param {string} text - Response body text.
+ * @returns {boolean} True when the lower-cased body names `seed`, `top_p`,
+ *   or `topp` and also contains a rejection keyword.
+ */
+function isUnsupportedSamplingMessage(status, text) {
+  const isValidationStatus = status === 400 || status === 422;
+  if (!isValidationStatus) return false;
+  const normalized = text.toLowerCase();
+  if (!/\b(seed|top_p|topp)\b/.test(normalized)) return false;
+  return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(normalized);
+}
+/**
+ * Pulls a readable message out of an error response body.
+ * @param {string} body - Raw response body.
+ * @returns {string} The trimmed `error.message` when the body is JSON with a
+ *   non-blank string at that path; otherwise the trimmed body itself (an
+ *   empty string for a blank body).
+ */
+function extractLMStudioErrorMessage(body) {
+  const raw = body.trim();
+  if (raw.length === 0) return "";
+  let candidate;
+  try {
+    candidate = JSON.parse(raw).error?.message;
+  } catch {
+    // Body is not JSON (or parsed to null): fall back to the raw text.
+    return raw;
+  }
+  if (typeof candidate === "string") {
+    const cleaned = candidate.trim();
+    if (cleaned.length > 0) return cleaned;
+  }
+  return raw;
+}
+/**
+ * Recognizes a model-load failure attributed to resource limits.
+ * @param {string} message - Backend error message.
+ * @returns {boolean} True when the message (case-insensitively) contains
+ *   "failed to load model" plus at least one resource-related phrase.
+ */
+function isModelLoadGuardrailError(message) {
+  const text = message.toLowerCase();
+  const resourceHints = [
+    "insufficient system resources",
+    "overload your system",
+    "loading guardrails"
+  ];
+  return text.includes("failed to load model") && resourceHints.some((hint) => text.includes(hint));
+}
+/**
+ * Builds the Error for a failed LM Studio HTTP request.
+ * @param {string} kind - Request label used in the generic message.
+ * @param {string} model - Model name used in the guardrail message.
+ * @param {number} status - HTTP status code.
+ * @param {string} statusText - HTTP status text.
+ * @param {string} body - Raw response body.
+ * @returns {Error} A guardrail-specific error with remediation advice when
+ *   the backend message matches `isModelLoadGuardrailError`; otherwise a
+ *   generic "failed (status statusText)" error with the backend message
+ *   appended when present.
+ */
+function buildLMStudioRequestError(kind, model, status, statusText, body) {
+  const detail = extractLMStudioErrorMessage(body);
+  if (!isModelLoadGuardrailError(detail)) {
+    const tail = detail ? ` ${detail}` : "";
+    return new Error(`LM Studio ${kind} failed (${status} ${statusText})${tail}`.trim());
+  }
+  const parts = [
+    `LM Studio could not load model "${model}" due to insufficient system resources (model loading guardrails).`,
+    "In LM Studio: unload other models, reduce loaded context length, or relax model loading guardrails in Settings.",
+    `Backend error: ${detail}`
+  ];
+  return new Error(parts.join(" "));
+}
+function buildNativeChatBody(model, prompt, options, stream, includeSampling) {
+  const reasoning = buildNativeThinkingOption(options?.think);
   return {
-    include_reasoning: think,
-    reasoning_effort: effort,
-    reasoning: { effort }
+    model,
+    input: prompt,
+    temperature: options?.temperature ?? 0,
+    ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
+    ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
+    max_tokens: options?.num_predict ?? 512,
+    stream,
+    ...reasoning !== void 0 ? { reasoning } : {},
+    ...options?.think === false ? { system_prompt: NON_THINKING_SYSTEM_PROMPT } : {}
   };
 }
-function parseNonNegativeInt(value) {
+/**
+ * Validates a numeric statistic.
+ * @param {unknown} value - Candidate statistic.
+ * @returns {number|undefined} The value itself when it is a finite,
+ *   non-negative number; otherwise undefined.
+ */
+function getNativeStatNumber(value) {
+  const usable = typeof value === "number" && Number.isFinite(value) && value >= 0;
+  return usable ? value : void 0;
+}
+/**
+ * Recursively extracts text from a string, array, or object value.
+ * Arrays are concatenated; for objects the first non-empty result among the
+ * `text`, `content`, `delta`, and `value` properties (in that order) wins.
+ * @param {unknown} value - Value to flatten.
+ * @param {number} [depth=0] - Current recursion depth; anything beyond 3
+ *   yields an empty string.
+ * @returns {string} Extracted text, or "" when nothing usable is found.
+ */
+function flattenNativeText(value, depth = 0) {
+  if (value == null || depth > 3) return "";
+  if (typeof value === "string") return value;
+  const nextDepth = depth + 1;
+  if (Array.isArray(value)) {
+    let joined = "";
+    for (const part of value) joined += flattenNativeText(part, nextDepth);
+    return joined;
+  }
+  if (typeof value !== "object") return "";
+  for (const key of ["text", "content", "delta", "value"]) {
+    const flattened = flattenNativeText(value[key], nextDepth);
+    if (flattened) return flattened;
+  }
+  return "";
+}
+/**
+ * Splits an output array into response text and reasoning text.
+ * Each object item contributes the trimmed, flattened text of its `text`
+ * (or, when that is nullish, `content`) property. Items whose `type`
+ * contains "reason" (case-insensitive) go to `reasoning`; all others go to
+ * `response`. Non-object items and items with no text are skipped.
+ * @param {unknown} output4 - Expected to be an array of output items.
+ * @returns {{response: string, reasoning: string}} Concatenated texts; both
+ *   empty when the input is not an array.
+ */
+function collectNativeOutput(output4) {
+  const collected = { response: "", reasoning: "" };
+  if (!Array.isArray(output4)) return collected;
+  for (const entry of output4) {
+    if (entry === null || typeof entry !== "object") continue;
+    const text = flattenNativeText(entry.text ?? entry.content).trim();
+    if (text.length === 0) continue;
+    const kind = asNonEmptyString(entry.type)?.toLowerCase() ?? "";
+    const bucket = kind.includes("reason") ? "reasoning" : "response";
+    collected[bucket] += text;
+  }
+  return collected;
+}
+/**
+ * Finds the statistics object in a response payload.
+ * @param {unknown} payload - Parsed response payload.
+ * @returns {*} `payload.stats` when truthy, otherwise `payload.result?.stats`;
+ *   undefined when the payload is not an object.
+ */
+function extractNativeStats(payload) {
+  if (payload === null || typeof payload !== "object") return void 0;
+  return payload.stats || payload.result?.stats;
+}
+/**
+ * Extracts the aggregated response and reasoning text from a payload.
+ * `payload.result.output` is preferred; `payload.output` is used only when
+ * the former produced neither response nor reasoning text.
+ * @param {unknown} payload - Parsed response payload.
+ * @returns {{response: string, reasoning: string}} Collected texts; both
+ *   empty when the payload is not an object.
+ */
+function extractNativeResponse(payload) {
+  const isObject = typeof payload === "object" && payload !== null;
+  if (!isObject) {
+    return { response: "", reasoning: "" };
+  }
+  const nested = collectNativeOutput(payload.result?.output);
+  const nestedHasText = Boolean(nested.response || nested.reasoning);
+  return nestedHasText ? nested : collectNativeOutput(payload.output);
+}
+/**
+ * Extracts the incremental text carried by one streamed event payload.
+ * Text is taken from the first non-empty of `delta`, `content`, `text`.
+ * It is reported as reasoning when the event `type` contains "reason"
+ * (case-insensitive) and as response text for every other type.
+ * @param {unknown} payload - Parsed event payload.
+ * @returns {{response: string, reasoning: string}} At most one field is
+ *   non-empty; both are empty when there is no text or no object payload.
+ */
+function extractNativeDelta(payload) {
+  if (payload === null || typeof payload !== "object") {
+    return { response: "", reasoning: "" };
+  }
+  const kind = asNonEmptyString(payload.type)?.toLowerCase() ?? "";
+  const text = flattenNativeText(payload.delta) || flattenNativeText(payload.content) || flattenNativeText(payload.text);
+  if (!text) {
+    return { response: "", reasoning: "" };
+  }
+  // Every non-reasoning event type is treated the same way: as response text.
+  return kind.includes("reason") ? { response: "", reasoning: text } : { response: text, reasoning: "" };
+}
+function parseNonNegativeInt2(value) {
   if (!/^\d+$/.test(value)) return null;
   const parsed = Number.parseInt(value, 10);
   if (!Number.isSafeInteger(parsed) || parsed < 0) return null;
   return parsed;
 }
-function resolveStreamStallTimeoutMs(override) {
+function resolveStreamStallTimeoutMs2(override) {
   if (override !== void 0) {
-    if (!Number.isFinite(override) || override < 0) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
+    if (!Number.isFinite(override) || override < 0) return DEFAULT_STREAM_STALL_TIMEOUT_MS2;
     return override === 0 ? void 0 : Math.trunc(override);
   }
-  const configured = process.env.LM_STUDIO_STREAM_STALL_TIMEOUT_MS?.trim();
-  if (!configured) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
-  const parsed = parseNonNegativeInt(configured);
-  if (parsed === null) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
+  const configured = process.env[SHARED_STREAM_STALL_TIMEOUT_ENV2]?.trim();
+  if (!configured) return DEFAULT_STREAM_STALL_TIMEOUT_MS2;
+  const parsed = parseNonNegativeInt2(configured);
+  if (parsed === null) return DEFAULT_STREAM_STALL_TIMEOUT_MS2;
   return parsed === 0 ? void 0 : parsed;
 }
 function getLMStudioBaseUrl() {
@@ -714,25 +926,29 @@ function getLMStudioHeaders() {
   }
   return headers;
 }
-function extractUsage(payload) {
-  if (typeof payload !== "object" || payload === null) return void 0;
-  const usage = payload.usage;
-  if (!usage) return void 0;
-  return usage;
-}
-function extractChoice2(payload) {
-  if (typeof payload !== "object" || payload === null) return void 0;
-  const choices = payload.choices;
-  if (!choices || choices.length === 0) return void 0;
-  return choices[0];
-}
-function extractContent(choice) {
-  const content = choice?.delta?.content ?? choice?.message?.content;
-  return typeof content === "string" ? content : "";
+/**
+ * Normalizes a reported token count.
+ * @param {unknown} value - Reported count.
+ * @returns {number} The value truncated to an integer when it is a finite
+ *   positive number; otherwise 0.
+ */
+function getUsageTokenCount(value) {
+  const isCountable = typeof value === "number" && Number.isFinite(value) && value > 0;
+  return isCountable ? Math.trunc(value) : 0;
+}
+/**
+ * Estimates a token count for text when no count was reported.
+ * Takes the larger of `estimateTokenCount(text)` and a character heuristic
+ * (one token per Han/Hiragana/Katakana/Hangul character plus one per four
+ * remaining non-whitespace characters, rounded up), with a floor of 1.
+ * @param {string} text - Generated text.
+ * @returns {number} 0 for blank text; otherwise an estimate of at least 1.
+ */
+function estimateCompletionTokensFallback(text) {
+  const trimmedText = text.trim();
+  if (trimmedText.length === 0) return 0;
+  const cjkPattern = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/gu;
+  const cjkChars = trimmedText.match(cjkPattern)?.length ?? 0;
+  const otherChars = trimmedText.replace(cjkPattern, "").replace(/\s+/g, "").length;
+  const charBasedEstimate = cjkChars + Math.ceil(otherChars / 4);
+  return Math.max(1, estimateTokenCount(trimmedText), charBasedEstimate);
 }
-function extractReasoning(choice) {
-  const reasoning = choice?.delta?.reasoning_content ?? choice?.delta?.reasoning ?? choice?.message?.reasoning_content ?? choice?.message?.reasoning;
-  return typeof reasoning === "string" ? reasoning : "";
+/**
+ * Chooses the completion token count to report.
+ * @param {unknown} reportedTokenCount - Count reported by the backend, if any.
+ * @param {string} response - Response text.
+ * @param {string} reasoning - Reasoning text.
+ * @returns {number} The normalized reported count when positive; otherwise
+ *   an estimate computed from the reasoning and response joined by a space.
+ */
+function resolveCompletionTokenCount(reportedTokenCount, response, reasoning) {
+  const trusted = getUsageTokenCount(reportedTokenCount);
+  return trusted > 0 ? trusted : estimateCompletionTokensFallback(`${reasoning} ${response}`);
 }
 function asNonEmptyString(value) {
   if (typeof value !== "string") return void 0;
@@ -761,7 +977,7 @@ async function pathIsDirectory(targetPath) {
   try {
     const stat = await fs.stat(targetPath);
     return stat.isDirectory();
-  } catch {
+  } catch (_err) {
     return false;
   }
 }
@@ -982,11 +1198,17 @@ async function resolveLocalModelMetadata(modelId, apiModel, modelsRootDir) {
     const size = await readDirectorySizeBytes(source.fullPath);
     if (size > bestSize) bestSize = size;
     if (size > 0) {
-      return { size, parameterSize: definition.parameterSize };
+      return {
+        size,
+        parameterSize: definition.parameterSize
+      };
     }
   }
   if (bestSize > 0) {
-    return { size: bestSize, parameterSize: definition.parameterSize };
+    return {
+      size: bestSize,
+      parameterSize: definition.parameterSize
+    };
   }
   const fallback = await resolvePublisherModelMetadata(modelId, apiModel, modelsRootDir);
   if (fallback.size > 0) {
@@ -995,7 +1217,10 @@ async function resolveLocalModelMetadata(modelId, apiModel, modelsRootDir) {
       parameterSize: definition.parameterSize ?? fallback.parameterSize
     };
   }
-  return { size: 0, parameterSize: definition.parameterSize ?? fallback.parameterSize };
+  return {
+    size: 0,
+    parameterSize: definition.parameterSize ?? fallback.parameterSize
+  };
 }
 function parseSizeBytes(model) {
   if (!model) return 0;
@@ -1033,6 +1258,21 @@ function inferParameterSizeFromModelId(modelId) {
   }
   return void 0;
 }
+/**
+ * Resolves the model format from the API model record.
+ * @param {object} [apiModel] - API model record; its `compatibility_type` is used.
+ * @param {*} _localMetadata - Accepted but not used.
+ * @param {*} _modelId - Accepted but not used.
+ * @returns {*} The result of `asNonEmptyString` on the compatibility type.
+ */
+function resolveModelFormat(apiModel, _localMetadata, _modelId) {
+  const compatibilityType = apiModel?.compatibility_type;
+  return asNonEmptyString(compatibilityType);
+}
+/**
+ * Combines API data and local metadata into one model list entry.
+ * @param {string} id - Model identifier, used as the entry name.
+ * @param {object} [apiModel] - Model record from the API, if available.
+ * @param {object} [localMetadata] - Locally resolved `size` / `parameterSize`.
+ * @returns {object} Entry with `name`, `size`, `parameterSize`,
+ *   `quantization`, `runtimeStatus`, `modelFormat`, and `family`. The
+ *   API-reported size is preferred when positive; `family` falls back from
+ *   `arch` to `type` to `publisher`.
+ */
+function buildModelEntry(id, apiModel, localMetadata) {
+  const reportedSize = parseSizeBytes(apiModel);
+  const size = reportedSize > 0 ? reportedSize : localMetadata?.size ?? 0;
+  const parameterSize = localMetadata?.parameterSize ?? inferParameterSizeFromModelId(id);
+  const quantization = asNonEmptyString(apiModel?.quantization);
+  const runtimeStatus = asNonEmptyString(apiModel?.state);
+  const modelFormat = resolveModelFormat(apiModel, localMetadata, id);
+  const family = asNonEmptyString(apiModel?.arch) ?? asNonEmptyString(apiModel?.type) ?? asNonEmptyString(apiModel?.publisher);
+  return { name: id, size, parameterSize, quantization, runtimeStatus, modelFormat, family };
+}
 function isLoadedState(state) {
   if (!state) return false;
   const normalized = state.trim().toLowerCase();
@@ -1040,6 +1280,128 @@ function isLoadedState(state) {
   if (normalized === "loaded" || normalized === "ready") return true;
   return normalized.includes("loaded");
 }
+/**
+ * Promise wrapper around `execFile` that captures text output.
+ * @param {string} cmd - Executable to run.
+ * @param {string[]} args - Arguments passed to the executable.
+ * @param {number} timeoutMs - Value passed as the `timeout` option.
+ * @returns {Promise<{stdout: string, stderr: string}>} Captured output.
+ *   On failure the promise rejects with the original error, to which the
+ *   captured `stdout` and `stderr` are attached.
+ */
+function execFileText(cmd, args, timeoutMs) {
+  const execOptions = {
+    timeout: timeoutMs,
+    maxBuffer: 1024 * 1024,
+    env: process.env
+  };
+  return new Promise((resolve, reject) => {
+    const onExit = (failure, stdout, stderr) => {
+      if (!failure) {
+        resolve({ stdout, stderr });
+        return;
+      }
+      // Keep whatever output was produced so callers can still inspect it.
+      failure.stdout = stdout;
+      failure.stderr = stderr;
+      reject(failure);
+    };
+    execFile2(cmd, args, execOptions, onExit);
+  });
+}
+/**
+ * Checks whether an error carries the ENOENT code.
+ * @param {unknown} err - Thrown value.
+ * @returns {boolean} True only for Error instances whose `code` is "ENOENT".
+ */
+function isCommandMissingError(err) {
+  if (!(err instanceof Error)) return false;
+  return "code" in err && err.code === "ENOENT";
+}
+/**
+ * Runs the `lms` CLI, trying several executable locations in order:
+ * the path from the `LM_STUDIO_CLI_PATH_ENV` environment variable (if set),
+ * the bare command `lms`, then `<LM Studio home>/bin/lms`. Duplicates are
+ * tried once.
+ * @param {string[]} args - Arguments for the CLI.
+ * @returns {Promise<{stdout: string, stderr: string}>} Output of the first
+ *   candidate that runs.
+ * @throws The first error that is not a missing-command error, or the last
+ *   missing-command error when no candidate could be started.
+ */
+async function runLmsCli(args) {
+  const envOverride = asNonEmptyString(process.env[LM_STUDIO_CLI_PATH_ENV]);
+  const bundledBinary = path.join(getLMStudioHomeDir(), "bin", "lms");
+  // A Set keeps first-seen order while dropping repeated paths.
+  const searchOrder = [...new Set([envOverride, "lms", bundledBinary].filter(Boolean))];
+  let mostRecentFailure;
+  for (const executable of searchOrder) {
+    try {
+      return await execFileText(executable, args, LM_STUDIO_CLI_TIMEOUT_MS);
+    } catch (err) {
+      // Only a missing executable justifies moving on to the next candidate.
+      if (!isCommandMissingError(err)) throw err;
+      mostRecentFailure = err;
+    }
+  }
+  throw mostRecentFailure ?? new Error("LM Studio CLI is not available.");
+}
+/**
+ * Normalizes an identifier for comparison.
+ * @param {string|null|undefined} value - Raw identifier.
+ * @returns {string} Trimmed, lower-cased text; "" for null or undefined.
+ */
+function normalizeCliToken(value) {
+  const text = value ?? "";
+  return text.trim().toLowerCase();
+}
+/**
+ * Tests whether a loaded-model entry refers to the given model.
+ * Comparison uses `normalizeCliToken` on both sides.
+ * @param {object} entry - Entry with optional `identifier`,
+ *   `indexedModelIdentifier`, `path`, and `modelKey` fields.
+ * @param {string} model - Model name to look for.
+ * @returns {boolean} True when any of the four fields equals the model;
+ *   false when the normalized model name is empty.
+ */
+function matchesLoadedModelCliEntry(entry, model) {
+  const wanted = normalizeCliToken(model);
+  if (wanted === "") return false;
+  const { identifier, indexedModelIdentifier, path: entryPath, modelKey } = entry;
+  for (const field of [identifier, indexedModelIdentifier, entryPath, modelKey]) {
+    if (normalizeCliToken(field) === wanted) return true;
+  }
+  return false;
+}
+/**
+ * Lists loaded models by running `lms ps --json`.
+ * @returns {Promise<Array>} The parsed JSON when it is an array, otherwise
+ *   an empty array. CLI failures and JSON parse errors propagate.
+ */
+async function listLoadedModelsFromCli() {
+  const result = await runLmsCli(["ps", "--json"]);
+  const loaded = JSON.parse(result.stdout);
+  if (!Array.isArray(loaded)) return [];
+  return loaded;
+}
+/**
+ * Parses an "Estimated Total Memory: <number> <unit>" line into bytes.
+ * Binary units (KiB..TiB) use powers of 1024; decimal units (KB..TB) use
+ * powers of 1000. Matching is case-insensitive.
+ * @param {string} output4 - Text to search.
+ * @returns {number|null} Rounded byte count, or null when no such line is
+ *   found or the amount is not a positive finite number.
+ */
+function parseEstimatedBytes(output4) {
+  const pattern = /Estimated Total Memory:\s*([0-9]+(?:\.[0-9]+)?)\s*(KiB|MiB|GiB|TiB|KB|MB|GB|TB)/i;
+  const found = output4.match(pattern);
+  if (found === null) return null;
+  const [, rawAmount = "", rawUnit = ""] = found;
+  const amount = Number.parseFloat(rawAmount);
+  if (!Number.isFinite(amount) || amount <= 0) return null;
+  const bytesPerUnit = new Map([
+    ["KIB", 1024],
+    ["MIB", 1024 ** 2],
+    ["GIB", 1024 ** 3],
+    ["TIB", 1024 ** 4],
+    ["KB", 1e3],
+    ["MB", 1e3 ** 2],
+    ["GB", 1e3 ** 3],
+    ["TB", 1e3 ** 4]
+  ]);
+  const factor = bytesPerUnit.get(rawUnit.toUpperCase());
+  return factor ? Math.round(amount * factor) : null;
+}
+/**
+ * Estimates the memory footprint of a currently loaded model via the CLI.
+ *
+ * Looks the model up in the CLI's loaded-model list, then runs
+ * `load --estimate-only -y [--context-length N] <key>` for each distinct
+ * key of that entry until one run yields a parsable estimate.
+ *
+ * @param {string} model - Model name to match against loaded entries.
+ * @returns {Promise<number|null>} Estimated bytes, or null when the model is
+ *   not found among loaded models, the listing fails, or no run produces an
+ *   estimate.
+ */
+async function estimateLoadedModelMemoryBytes(model) {
+  let loadedEntry;
+  try {
+    const loadedModels = await listLoadedModelsFromCli();
+    loadedEntry = loadedModels.find((entry) => matchesLoadedModelCliEntry(entry, model));
+  } catch {
+    // Best effort: any listing failure is treated as "model not loaded".
+    loadedEntry = void 0;
+  }
+  if (!loadedEntry) return null;
+  // Distinct, non-blank keys to try, in priority order.
+  const candidateModelKeys = [
+    loadedEntry?.path,
+    loadedEntry?.indexedModelIdentifier,
+    loadedEntry?.modelKey
+  ].filter(
+    (candidate, index, list) => Boolean(candidate?.trim()) && list.findIndex((item) => item === candidate) === index
+  );
+  for (const candidate of candidateModelKeys) {
+    const args = ["load", "--estimate-only", "-y"];
+    // Pass the entry's context length along when it is a positive finite number.
+    if (typeof loadedEntry?.contextLength === "number" && Number.isFinite(loadedEntry.contextLength) && loadedEntry.contextLength > 0) {
+      args.push("--context-length", String(Math.trunc(loadedEntry.contextLength)));
+    }
+    args.push(candidate);
+    try {
+      const { stdout, stderr } = await runLmsCli(args);
+      // The estimate line is searched for in both streams.
+      const estimated = parseEstimatedBytes(`${stdout}
+${stderr}`);
+      if (estimated !== null) return estimated;
+    } catch (err) {
+      // A failed run may still carry output on the error; try to parse that
+      // before moving on to the next candidate key.
+      const output4 = err instanceof Error ? `${String(err.stdout ?? "")}
+${String(err.stderr ?? "")}` : "";
+      const estimated = parseEstimatedBytes(output4);
+      if (estimated !== null) return estimated;
+    }
+  }
+  return null;
+}
 async function fetchApiModels() {
   try {
     const resp = await fetchWithTimeout(
@@ -1092,7 +1454,7 @@ async function getLMStudioVersion() {
   const localVersion = await resolveLocalLMStudioVersion();
   try {
     const resp = await fetchWithTimeout(
-      "/v1/models",
+      "/api/v1/models",
       { method: "GET", headers: getLMStudioHeaders() },
       5e3,
       "LM Studio version check"
@@ -1107,7 +1469,7 @@ async function getLMStudioVersion() {
 }
 async function listModels2() {
   const resp = await fetchWithTimeout(
-    "/v1/models",
+    "/api/v1/models",
     { method: "GET", headers: getLMStudioHeaders() },
     LM_STUDIO_INIT_TIMEOUT_MS,
     "LM Studio list models"
@@ -1125,25 +1487,25 @@ async function listModels2() {
     apiById.set(id, model);
   }
   const modelsRootDir = await resolveModelsRootDir();
-  const localMetadataById = /* @__PURE__ */ new Map();
-  for (const id of ids) {
-    const localMetadata = await resolveLocalModelMetadata(id, apiById.get(id), modelsRootDir);
-    localMetadataById.set(id, localMetadata);
-  }
-  return ids.map((id) => {
-    const apiModel = apiById.get(id);
-    const localMetadata = localMetadataById.get(id);
-    const apiSize = parseSizeBytes(apiModel);
-    return {
-      name: id,
-      size: apiSize > 0 ? apiSize : localMetadata?.size ?? 0,
-      parameterSize: localMetadata?.parameterSize ?? inferParameterSizeFromModelId(id),
-      quantization: asNonEmptyString(apiModel?.quantization),
-      runtimeStatus: asNonEmptyString(apiModel?.state),
-      modelFormat: asNonEmptyString(apiModel?.compatibility_type),
-      family: asNonEmptyString(apiModel?.arch) ?? asNonEmptyString(apiModel?.type) ?? asNonEmptyString(apiModel?.publisher)
-    };
-  });
+  const localMetadataEntries = await Promise.all(
+    ids.map(async (id) => {
+      const localMetadata = await resolveLocalModelMetadata(id, apiById.get(id), modelsRootDir);
+      return [id, localMetadata];
+    })
+  );
+  const localMetadataById = new Map(
+    localMetadataEntries
+  );
+  return ids.map((id) => buildModelEntry(id, apiById.get(id), localMetadataById.get(id)));
+}
+/**
+ * Resolves a single model entry by identifier.
+ * @param {string} modelId - Model identifier; surrounding whitespace is ignored.
+ * @returns {Promise<object|null>} null for a blank identifier; otherwise the
+ *   entry built from the matching API model (if any) and local metadata.
+ */
+async function resolveModel(modelId) {
+  const trimmedId = modelId.trim();
+  if (trimmedId === "") return null;
+  const catalog = await fetchApiModels();
+  const match = catalog?.find((entry) => asNonEmptyString(entry.id) === trimmedId);
+  const rootDir = await resolveModelsRootDir();
+  const metadata = await resolveLocalModelMetadata(trimmedId, match, rootDir);
+  return buildModelEntry(trimmedId, match, metadata);
 }
 async function listRunningModels2() {
   const apiModels = await fetchApiModels();
@@ -1164,39 +1526,54 @@ async function generate2(model, prompt, options) {
   activeAbortControllers.add(controller);
   try {
     const baseUrl = getLMStudioBaseUrl();
-    const url = new URL("/v1/chat/completions", baseUrl);
-    const resp = await fetch(url, {
+    const url = new URL("/api/v1/chat", baseUrl);
+    const doRequest = (includeSampling) => fetch(url, {
       method: "POST",
       headers: getLMStudioHeaders(),
-      body: JSON.stringify({
-        model,
-        messages: [{ role: "user", content: prompt }],
-        temperature: options?.temperature ?? 0,
-        max_tokens: options?.num_predict ?? 512,
-        stream: false,
-        ...buildThinkingConfig(options?.think)
-      }),
+      body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, includeSampling)),
       signal: controller.signal
     });
+    let resp = await doRequest(true);
     if (!resp.ok) {
       const body = await resp.text().catch(() => "");
-      throw new Error(`LM Studio generate failed (${resp.status} ${resp.statusText}) ${body}`.trim());
+      if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
+        resp = await doRequest(false);
+      } else {
+        throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
+      }
+    }
+    if (!resp.ok) {
+      const body = await resp.text().catch(() => "");
+      throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
     }
     const payload = await resp.json();
-    const choice = extractChoice2(payload);
-    const response = extractContent(choice);
-    const reasoning = extractReasoning(choice);
-    const usage = extractUsage(payload);
+    const nativeResponse = extractNativeResponse(payload);
+    const response = nativeResponse.response;
+    const reasoning = nativeResponse.reasoning;
+    assertThinkingModeRespected(model, options?.think, response, reasoning);
+    const stats = extractNativeStats(payload);
     const totalDuration = Math.max(0, Date.now() - start) * 1e6;
+    const outputTokens = getUsageTokenCount(stats?.total_output_tokens) || resolveCompletionTokenCount(void 0, response, reasoning);
+    const throughput = getNativeStatNumber(stats?.tokens_per_second);
+    const timeToFirstTokenSeconds = getNativeStatNumber(stats?.time_to_first_token_seconds);
+    const modelLoadTimeSeconds = getNativeStatNumber(stats?.model_load_time_seconds);
+    const evalCountEstimated = getUsageTokenCount(stats?.total_output_tokens) <= 0;
+    const evalDuration = throughput !== void 0 && throughput > 0 && outputTokens > 0 ? Math.max(1, Math.round(outputTokens / throughput * 1e9)) : totalDuration;
+    const promptEvalDuration = timeToFirstTokenSeconds !== void 0 ? Math.max(0, Math.round(timeToFirstTokenSeconds * 1e9)) : 0;
+    const loadDuration = Math.max(
+      0,
+      Math.round((modelLoadTimeSeconds ?? 0) * 1e9)
+    );
     return {
       response,
       ...reasoning ? { thinking: reasoning } : {},
       totalDuration,
-      loadDuration: 0,
-      promptEvalCount: usage?.prompt_tokens ?? 0,
-      promptEvalDuration: 0,
-      evalCount: usage?.completion_tokens ?? 0,
-      evalDuration: totalDuration
+      loadDuration,
+      promptEvalCount: getUsageTokenCount(stats?.input_tokens),
+      promptEvalDuration,
+      evalCount: outputTokens,
+      evalDuration,
+      ...evalCountEstimated ? { evalCountEstimated: true } : {}
     };
   } catch (err) {
     if (err instanceof Error && err.name === "AbortError") {
@@ -1211,10 +1588,10 @@ async function generateStream2(model, prompt, callbacks, options) {
   const start = Date.now();
   const controller = new AbortController();
   activeAbortControllers.add(controller);
-  const stallTimeoutMs = resolveStreamStallTimeoutMs(options?.stall_timeout_ms);
+  const stallTimeoutMs = resolveStreamStallTimeoutMs2(options?.stall_timeout_ms);
   let abortedByStallTimeout = false;
   const baseUrl = getLMStudioBaseUrl();
-  const url = new URL("/v1/chat/completions", baseUrl);
+  const url = new URL("/api/v1/chat", baseUrl);
   let stallTimer = null;
   const resetStallTimer = () => {
     if (stallTimeoutMs === void 0) return;
@@ -1226,23 +1603,24 @@ async function generateStream2(model, prompt, callbacks, options) {
   };
   try {
     resetStallTimer();
-    const resp = await fetch(url, {
+    const doRequest = (includeSampling) => fetch(url, {
       method: "POST",
       headers: getLMStudioHeaders(),
-      body: JSON.stringify({
-        model,
-        messages: [{ role: "user", content: prompt }],
-        temperature: options?.temperature ?? 0,
-        max_tokens: options?.num_predict ?? 512,
-        stream: true,
-        stream_options: { include_usage: true },
-        ...buildThinkingConfig(options?.think)
-      }),
+      body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, includeSampling)),
       signal: controller.signal
     });
+    let resp = await doRequest(true);
+    if (!resp.ok) {
+      const body = await resp.text().catch(() => "");
+      if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
+        resp = await doRequest(false);
+      } else {
+        throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
+      }
+    }
     if (!resp.ok) {
       const body = await resp.text().catch(() => "");
-      throw new Error(`LM Studio stream failed (${resp.status} ${resp.statusText}) ${body}`.trim());
+      throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
     }
     if (!resp.body) {
       throw new Error("LM Studio stream response body is empty");
@@ -1253,10 +1631,10 @@ async function generateStream2(model, prompt, callbacks, options) {
     let doneReceived = false;
     let fullResponse = "";
     let fullThinking = "";
-    let usage;
+    let stats;
     let firstChunkSeen = false;
-    let firstTokenTime = null;
-    let lastTokenTime = null;
+    let firstGeneratedTokenTime = null;
+    let lastGeneratedTokenTime = null;
     const processDataLine = (rawLine) => {
       const line = rawLine.trim();
       if (!line.startsWith("data:")) return;
@@ -1272,18 +1650,27 @@ async function generateStream2(model, prompt, callbacks, options) {
       } catch {
         return;
       }
-      const choice = extractChoice2(payload);
-      const content = extractContent(choice);
-      const reasoning = extractReasoning(choice);
-      const chunkUsage = extractUsage(payload);
-      if (chunkUsage) usage = chunkUsage;
+      const delta = extractNativeDelta(payload);
+      const content = delta.response;
+      const reasoning = delta.reasoning;
+      const chunkStats = extractNativeStats(payload);
+      if (chunkStats) stats = chunkStats;
+      const aggregate = extractNativeResponse(payload);
+      if (aggregate.response) {
+        fullResponse = aggregate.response;
+      }
+      if (aggregate.reasoning) {
+        fullThinking = aggregate.reasoning;
+      }
+      if (reasoning || content) {
+        const now = Date.now();
+        if (firstGeneratedTokenTime === null) firstGeneratedTokenTime = now;
+        lastGeneratedTokenTime = now;
+      }
       if (reasoning) {
         fullThinking += reasoning;
       }
       if (content) {
-        const now = Date.now();
-        if (firstTokenTime === null) firstTokenTime = now;
-        lastTokenTime = now;
         fullResponse += content;
         callbacks?.onToken?.(content);
       }
@@ -1303,6 +1690,14 @@ async function generateStream2(model, prompt, callbacks, options) {
         processDataLine(rawLine);
       }
     }
+    buffered += decoder.decode();
+    if (buffered.length > 0) {
+      const lines = buffered.split("\n");
+      buffered = lines.pop() ?? "";
+      for (const rawLine of lines) {
+        processDataLine(rawLine);
+      }
+    }
     if (buffered.trim().length > 0) {
       processDataLine(buffered);
     }
@@ -1311,17 +1706,27 @@ async function generateStream2(model, prompt, callbacks, options) {
       throw new Error("LM Studio stream ended without content");
     }
     const totalDuration = Math.max(0, Date.now() - start) * 1e6;
-    const evalDurationMs = firstTokenTime !== null && lastTokenTime !== null && lastTokenTime > firstTokenTime ? lastTokenTime - firstTokenTime : Date.now() - start;
+    const outputTokens = getUsageTokenCount(stats?.total_output_tokens) || resolveCompletionTokenCount(void 0, fullResponse, fullThinking);
+    const throughput = getNativeStatNumber(stats?.tokens_per_second);
+    const timeToFirstTokenSeconds = getNativeStatNumber(stats?.time_to_first_token_seconds);
+    const modelLoadTimeSeconds = getNativeStatNumber(stats?.model_load_time_seconds);
+    const evalCountEstimated = getUsageTokenCount(stats?.total_output_tokens) <= 0;
+    const evalDurationMs = throughput !== void 0 && throughput > 0 && outputTokens > 0 ? outputTokens / throughput * 1e3 : firstGeneratedTokenTime !== null && lastGeneratedTokenTime !== null && lastGeneratedTokenTime > firstGeneratedTokenTime ? lastGeneratedTokenTime - firstGeneratedTokenTime : Date.now() - start;
     const result = {
       response: fullResponse,
       ...fullThinking ? { thinking: fullThinking } : {},
       totalDuration,
-      loadDuration: 0,
-      promptEvalCount: usage?.prompt_tokens ?? 0,
-      promptEvalDuration: firstTokenTime !== null ? (firstTokenTime - start) * 1e6 : 0,
-      evalCount: usage?.completion_tokens ?? 0,
-      evalDuration: Math.max(1, evalDurationMs) * 1e6
+      loadDuration: Math.max(
+        0,
+        Math.round((modelLoadTimeSeconds ?? 0) * 1e9)
+      ),
+      promptEvalCount: getUsageTokenCount(stats?.input_tokens),
+      promptEvalDuration: timeToFirstTokenSeconds !== void 0 ? Math.max(0, Math.round(timeToFirstTokenSeconds * 1e9)) : firstGeneratedTokenTime !== null ? (firstGeneratedTokenTime - start) * 1e6 : 0,
+      evalCount: outputTokens,
+      evalDuration: Math.max(1, Math.round(evalDurationMs * 1e6)),
+      ...evalCountEstimated ? { evalCountEstimated: true } : {}
     };
+    assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
     callbacks?.onDone?.(result);
     return result;
   } catch (err) {
@@ -1487,6 +1892,19 @@ function getRuntimeName() {
 function getRuntimeModelFormat() {
   return activeRuntime.modelFormat ?? "gguf";
 }
+async function resolveRuntimeModel(model) {
+  if (activeRuntime.name === "lm-studio") {
+    return resolveModel(model);
+  }
+  const knownModels = await activeRuntime.listModels();
+  const matchedModel = knownModels.find((candidate) => candidate.name === model);
+  if (matchedModel) return matchedModel;
+  return {
+    name: model,
+    size: 0,
+    modelFormat: activeRuntime.modelFormat ?? "gguf"
+  };
+}
 // ../src/commands/bench.ts
 import { createHash as createHash3 } from "crypto";
@@ -1495,11 +1913,33 @@ import chalk8 from "chalk";
 // ../src/core/hardware.ts
 import si from "systeminformation";
 import os2 from "os";
-import { execFile as execFile2 } from "child_process";
+import { execFile as execFile3 } from "child_process";
 import { readFile } from "fs/promises";
+function normalizeWhitespace(value) {
+  return value.replace(/\s+/g, " ").trim();
+}
+function looksLikeGpuDescriptor(value) {
+  return /\b(radeon|graphics|geforce|rtx|gtx|arc|iris|uhd|quadro|tesla|adreno|mali|powervr)\b/i.test(value);
+}
+function splitCpuAndInferredGpu(cpuLabel) {
+  const normalized = normalizeWhitespace(cpuLabel);
+  const withGpuMatch = normalized.match(/\s+(?:w\/\s*|with\s+)(.+)$/i);
+  if (!withGpuMatch?.index) {
+    return { cpu: normalized, inferredGpu: null };
+  }
+  const inferredGpu = normalizeWhitespace(withGpuMatch[1] ?? "");
+  if (!looksLikeGpuDescriptor(inferredGpu)) {
+    return { cpu: normalized, inferredGpu: null };
+  }
+  const cpu = normalizeWhitespace(normalized.slice(0, withGpuMatch.index));
+  return {
+    cpu: cpu || normalized,
+    inferredGpu: inferredGpu || null
+  };
+}
 function execCommand(cmd, args, timeoutMs = 3e3) {
   return new Promise((resolve) => {
-    const child = execFile2(cmd, args, { timeout: timeoutMs }, (err, stdout) => {
+    const child = execFile3(cmd, args, { timeout: timeoutMs }, (err, stdout) => {
       if (err) return resolve("");
       resolve(stdout.trim());
     });
@@ -1633,11 +2073,14 @@ async function getHardwareInfo() {
   ]);
   const gpuController = graphics.controllers[0];
   const gpuNames = graphics.controllers.map((g) => g.model).filter(Boolean).join(", ");
+  const cpuLabelRaw = normalizeWhitespace(`${cpu.manufacturer} ${cpu.brand}`);
+  const { cpu: cpuLabel, inferredGpu } = splitCpuAndInferredGpu(cpuLabelRaw);
+  const defaultIntegratedGpu = process.platform === "darwin" ? "Integrated / Apple Silicon" : "Integrated / Unknown";
   const gpuCoresRaw = gpuController?.cores;
   const gpuCores = gpuCoresRaw ? parseInt(String(gpuCoresRaw), 10) : null;
   const memType = memLayout.length > 0 ? memLayout[0].type : null;
   return {
-    cpu: `${cpu.manufacturer} ${cpu.brand}`,
+    cpu: cpuLabel,
     cpuCores: cpu.cores,
     cpuPCores: cpu.performanceCores || null,
     cpuECores: cpu.efficiencyCores || null,
@@ -1647,7 +2090,7 @@ async function getHardwareInfo() {
     memoryType: memType || null,
     swapTotalGB: +(mem.swaptotal / 1024 / 1024 / 1024).toFixed(1),
     swapUsedGB: +(mem.swapused / 1024 / 1024 / 1024).toFixed(1),
-    gpu: gpuNames || "Integrated / Apple Silicon",
+    gpu: normalizeWhitespace(gpuNames) || inferredGpu || defaultIntegratedGpu,
     gpuCores: gpuCores && !isNaN(gpuCores) ? gpuCores : null,
     gpuVramMB: gpuController?.vram ?? null,
     os: `${osInfo.distro} ${osInfo.release}`,
@@ -1682,6 +2125,10 @@ import chalk from "chalk";
 // ../src/ui/terminal.ts
 var supportsUnicode = process.platform !== "win32" || Boolean(process.env.WT_SESSION) || Boolean(process.env.TERM_PROGRAM);
+var ANSI_RE = /\x1b\[[0-9;]*[A-Za-z]/g;
+function stripAnsi(value) {
+  return value.replace(ANSI_RE, "");
+}
 // ../src/ui/progress.ts
 var FUN_PHRASES = [
@@ -1816,6 +2263,33 @@ function errorMsg(text) {
   console.log(chalk.red(`  ${CROSS_MARK} ${text}`));
 }
+// ../src/benchmarks/profile.ts
+var BENCHMARK_PROFILE_VERSION = "v1";
+var BENCHMARK_PROFILE_SEED = 42;
+var BENCHMARK_PROFILE_TOP_P = 1;
+var BENCHMARK_PROFILE_TEMPERATURE = 0;
+function withBenchmarkProfile(opts = {}) {
+  return {
+    temperature: BENCHMARK_PROFILE_TEMPERATURE,
+    top_p: BENCHMARK_PROFILE_TOP_P,
+    seed: BENCHMARK_PROFILE_SEED,
+    ...opts
+  };
+}
+function buildBenchmarkProfileMetadata(thinkEnabled) {
+  return {
+    version: BENCHMARK_PROFILE_VERSION,
+    sampling: {
+      temperature: BENCHMARK_PROFILE_TEMPERATURE,
+      topP: BENCHMARK_PROFILE_TOP_P,
+      seed: BENCHMARK_PROFILE_SEED
+    },
+    thinkingMode: thinkEnabled ? "enabled" : "disabled",
+    contextWindowTokens: null,
+    contextPolicy: "runtime-default"
+  };
+}
 // ../src/benchmarks/performance.ts
 var WARMUP_PROMPT = "Say hello in one word.";
 var BENCH_PROMPTS = [
@@ -1855,11 +2329,15 @@ async function runPerformanceBench(model, options = {}) {
       optionalProbeWithAvailability(() => getSwapUsedGB(), 0),
       optionalProbe(() => detectBatteryPowered(), void 0)
     ]);
+    const runningModelsBeforeWarmup = await optionalProbe(() => listRunningModels3(), []);
+    const modelWasAlreadyLoaded = runningModelsBeforeWarmup.some((m) => m.name === model);
     const warmup = await withTimeout(
       generateStream3(model, WARMUP_PROMPT, void 0, {
-        num_predict: 32,
-        think: options.think,
-        stall_timeout_ms: options.streamStallTimeoutMs
+        ...withBenchmarkProfile({
+          num_predict: 32,
+          think: options.think,
+          stall_timeout_ms: options.streamStallTimeoutMs
+        })
       }),
       warmupTimeoutMs,
       "Model warmup",
@@ -1870,15 +2348,6 @@ async function runPerformanceBench(model, options = {}) {
     const loadTime = warmup.loadDuration / 1e6;
     const runningModels = await listRunningModels3();
     const thisModel = runningModels.find((m) => m.name === model);
-    let installedModelSizeBytes = 0;
-    try {
-      const availableModels = await listModels3();
-      const listedModel = availableModels.find((m) => m.name === model);
-      if (listedModel && Number.isFinite(listedModel.size) && listedModel.size > 0) {
-        installedModelSizeBytes = listedModel.size;
-      }
-    } catch {
-    }
     spinner.succeed("Model loaded");
     const tpsValues = [];
     const firstChunkValues = [];
@@ -1892,6 +2361,7 @@ async function runPerformanceBench(model, options = {}) {
     let thinkingDetected = false;
     let totalThinkingTokens = 0;
     const cpuLoadSamples = [];
+    let tokensPerSecondEstimated = false;
     for (let i = 0; i < BENCH_PROMPTS.length; i++) {
       spinner.start(`Running performance test ${i + 1}/${BENCH_PROMPTS.length}...`);
       let firstChunkTime = null;
@@ -1914,11 +2384,11 @@ async function runPerformanceBench(model, options = {}) {
                 }
               }
             },
-            {
+            withBenchmarkProfile({
               num_predict: 256,
               think: options.think,
               stall_timeout_ms: options.streamStallTimeoutMs
-            }
+            })
           ),
           promptTimeoutMs,
           "Performance benchmark",
@@ -1929,6 +2399,9 @@ async function runPerformanceBench(model, options = {}) {
         tpsValues.push(tps);
         totalEvalCount += result.evalCount;
         totalEvalDurationNs += result.evalDuration;
+        if (result.evalCountEstimated) {
+          tokensPerSecondEstimated = true;
+        }
         if (firstChunkTime !== null) {
           firstChunkValues.push(firstChunkTime);
         }
@@ -1974,10 +2447,18 @@ async function runPerformanceBench(model, options = {}) {
     ]);
     let memoryUsedGB;
     let memoryPercent;
-    const loadedModelSizeBytes = thisModel && thisModel.size > 0 ? thisModel.size : installedModelSizeBytes;
+    let memoryFootprintEstimated = false;
+    const runtimeReportsComparableLoadedSize = runtimeName !== "lm-studio";
+    const estimatedLoadedModelSizeBytes = runtimeName === "lm-studio" && modelWasAlreadyLoaded ? await optionalProbe(() => estimateLoadedModelMemoryBytes(model), null) : null;
+    const loadedModelSizeBytes = runtimeReportsComparableLoadedSize && thisModel && thisModel.size > 0 ? thisModel.size : 0;
+    const memoryFootprintAvailable = runtimeReportsComparableLoadedSize ? loadedModelSizeBytes > 0 || !modelWasAlreadyLoaded : (estimatedLoadedModelSizeBytes ?? 0) > 0 || !modelWasAlreadyLoaded;
     if (loadedModelSizeBytes > 0) {
       memoryUsedGB = loadedModelSizeBytes / 1024 ** 3;
       memoryPercent = memoryUsedGB / memAfter.totalGB * 100;
+    } else if ((estimatedLoadedModelSizeBytes ?? 0) > 0) {
+      memoryUsedGB = (estimatedLoadedModelSizeBytes ?? 0) / 1024 ** 3;
+      memoryPercent = memoryUsedGB / memAfter.totalGB * 100;
+      memoryFootprintEstimated = true;
     } else {
       memoryUsedGB = Math.max(0, memAfter.usedGB - memBefore.usedGB);
       memoryPercent = Math.max(0, memAfter.percent - memBefore.percent);
@@ -1999,6 +2480,7 @@ async function runPerformanceBench(model, options = {}) {
     return {
       metrics: {
         tokensPerSecond: totalEvalDurationNs > 0 ? totalEvalCount / (totalEvalDurationNs / 1e9) : avg(tpsValues),
+        ...tokensPerSecondEstimated ? { tokensPerSecondEstimated: true } : {},
         ...firstChunkMs !== void 0 ? { firstChunkMs } : {},
         ttft: ttft >= 0 ? ttft : 3e4,
         // Fallback: 30s if no TTFT measured
@@ -2009,6 +2491,8 @@ async function runPerformanceBench(model, options = {}) {
         completionTokens: totalCompletionTokens,
         memoryUsedGB: +memoryUsedGB.toFixed(1),
         memoryPercent: +memoryPercent.toFixed(1),
+        memoryFootprintAvailable,
+        ...memoryFootprintEstimated ? { memoryFootprintEstimated: true } : {},
         memoryHostUsedGB: memAfter.usedGB,
         memoryHostPercent: memAfter.percent,
         tpsStdDev: tpsValues.length >= 2 ? stddev(tpsValues) : void 0,
@@ -2401,7 +2885,7 @@ Answer:`;
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Reasoning question",
           abortOngoingRequests3
@@ -2730,7 +3214,7 @@ Answer:`;
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Math problem",
           abortOngoingRequests3
@@ -2773,7 +3257,7 @@ Answer:`;
 // ../src/benchmarks/coding.ts
 import vm2 from "vm";
-import { spawn } from "child_process";
+import { spawn as spawn2 } from "child_process";
 import { Worker } from "worker_threads";
 // ../src/datasets/coding.json
@@ -6446,7 +6930,7 @@ async function runTestsInSubprocess(code, task) {
   const total = task.tests.length;
   return new Promise((resolve) => {
     const wallTimeoutMs = computeIsolatedWallTimeoutMs(task);
-    const child = spawn(
+    const child = spawn2(
       process.execPath,
       [
         "--max-old-space-size=96",
@@ -6613,7 +7097,7 @@ Reply with ONLY the function code, no explanation.`;
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 2048, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 2048, think: opts?.think })),
           timeoutMs,
           "Coding task",
           abortOngoingRequests3
@@ -6968,7 +7452,7 @@ async function runInstructionFollowingBench(model, opts) {
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Instruction following task",
           abortOngoingRequests3
@@ -7354,7 +7838,7 @@ async function runStructuredOutputBench(model, opts) {
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Structured output task",
           abortOngoingRequests3
@@ -7613,7 +8097,7 @@ async function runMultilingualBench(model, opts) {
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Multilingual task",
           abortOngoingRequests3
@@ -7749,13 +8233,15 @@ function computePerformanceScore(perf, hardware) {
   const tuning = deriveHardwareFitTuning(hardware);
   const safeTokensPerSecond = sanitizeNonNegative(perf.tokensPerSecond, 0);
   const safeTtft = sanitizeNonNegative(perf.ttft, tuning.ttft.hardMaxMs * 2);
-  const effectiveMemPercent = sanitizeNonNegative(
-    perf.memoryHostPercent ?? perf.memoryPercent,
-    100
-  );
   const speed = Math.round(scoreSpeed(safeTokensPerSecond, tuning));
   const ttft = Math.round(scoreTTFT(safeTtft, tuning));
-  const memory = Math.round(scoreMemory(effectiveMemPercent));
+  const memory = perf.memoryFootprintAvailable === false ? Math.round((speed + ttft) / (50 + 20) * 30) : Math.round(
+    scoreMemory(
+      // Score memory from the model's measured footprint/delta rather than
+      // unrelated host RAM usage from other running workloads.
+      sanitizeNonNegative(perf.memoryPercent, 100)
+    )
+  );
   return {
     total: clamp(speed + ttft + memory, 0, 100),
     speed,
@@ -7864,11 +8350,9 @@ function computeFitness(perf, quality, hardware, benchEnv) {
   const safeTokensPerSecond = sanitizeNonNegative(perf.tokensPerSecond, 0);
   const safeTtft = sanitizeNonNegative(perf.ttft, tuning.ttft.hardMaxMs * 10);
   const safeLoadTime = sanitizeNonNegative(perf.loadTime, tuning.loadTimeHardMaxMs * 10);
-  const hostMemoryPercent = sanitizeNonNegative(
-    perf.memoryHostPercent ?? perf.memoryPercent,
-    100
-  );
-  const modelMemoryDeltaPercent = sanitizeNonNegative(perf.memoryPercent, 100);
+  const modelMemoryFootprintAvailable = perf.memoryFootprintAvailable !== false;
+  const modelMemoryDeltaPercent = modelMemoryFootprintAvailable ? sanitizeNonNegative(perf.memoryPercent, 100) : void 0;
+  const hostMemoryPercent = perf.memoryHostPercent !== void 0 && Number.isFinite(perf.memoryHostPercent) && perf.memoryHostPercent >= 0 ? perf.memoryHostPercent : void 0;
   const disqualifiers = [];
   if (safeTokensPerSecond < tuning.speed.hardMin) {
     disqualifiers.push(
@@ -7885,12 +8369,12 @@ function computeFitness(perf, quality, hardware, benchEnv) {
       `Model load time too high: ${Math.round(safeLoadTime)}ms (maximum: ${tuning.loadTimeHardMaxMs}ms for ${tuning.profile} profile)`
     );
   }
-  const hostCritical = hostMemoryPercent > 95;
-  const modelDeltaCritical = modelMemoryDeltaPercent > 90;
-  const modelDeltaSignificant = modelMemoryDeltaPercent >= 10;
-  if (modelDeltaCritical || hostCritical && modelDeltaSignificant) {
+  const hostCritical = hostMemoryPercent !== void 0 && hostMemoryPercent > 95;
+  const modelDeltaCritical = modelMemoryDeltaPercent !== void 0 && modelMemoryDeltaPercent > 90;
+  const modelDeltaSignificant = modelMemoryDeltaPercent !== void 0 && modelMemoryDeltaPercent >= 10;
+  if (modelDeltaCritical) {
     disqualifiers.push(
-      `Memory usage critical: host ${hostMemoryPercent.toFixed(0)}%, model delta +${modelMemoryDeltaPercent.toFixed(0)}%`
+      `Memory usage critical: model delta +${modelMemoryDeltaPercent.toFixed(0)}%`
     );
   }
   const verdictScore = globalScore ?? hardwareFitScore;
@@ -7925,9 +8409,28 @@ function computeFitness(perf, quality, hardware, benchEnv) {
       `Token speed is unstable (stddev ${perf.tpsStdDev.toFixed(1)} tok/s, mean ${safeTokensPerSecond.toFixed(1)} tok/s) \u2014 may indicate thermal throttling or memory pressure.`
     );
   }
-  if (hostCritical && !modelDeltaSignificant) {
+  if (perf.tokensPerSecondEstimated) {
+    warnings.push(
+      "Token throughput is estimated from LM Studio output because native token stats were unavailable. Compare tok/s across backends cautiously."
+    );
+  }
+  if (perf.memoryFootprintEstimated) {
+    warnings.push(
+      "Model memory footprint is estimated via LM Studio CLI rather than measured from a fresh load."
+    );
+  }
+  if (!modelMemoryFootprintAvailable) {
     warnings.push(
-      `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) but model delta is limited (+${modelMemoryDeltaPercent.toFixed(0)}%). Verdict may be influenced by other running workloads.`
+      "Model memory footprint was unavailable for this run, so RAM fit scoring was normalized from speed and TTFT only."
+    );
+  }
+  if (hostCritical && !modelMemoryFootprintAvailable) {
+    warnings.push(
+      `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) and model footprint was unavailable. Results may be influenced by other running workloads.`
+    );
+  } else if (hostCritical && modelMemoryDeltaPercent !== void 0 && !modelDeltaSignificant) {
+    warnings.push(
+      `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) but model delta is limited (+${modelMemoryDeltaPercent.toFixed(0)}%). Results may be influenced by other running workloads.`
     );
   }
   if (hardware?.powerMode === "low-power") {
@@ -8015,12 +8518,46 @@ function getLevel(score) {
   if (score >= 25) return "Weak";
   return "Poor";
 }
+function formatCpuCoresLabel(hw) {
+  if (hw.cpuPCores !== null && hw.cpuECores !== null) {
+    return `${hw.cpuCores} total (${hw.cpuPCores} performance + ${hw.cpuECores} efficiency)`;
+  }
+  if (hw.cpuPCores !== null && hw.cpuCores > hw.cpuPCores) {
+    return `${hw.cpuCores} threads (${hw.cpuPCores} cores)`;
+  }
+  if (hw.cpuPCores !== null) {
+    return `${hw.cpuCores} total (${hw.cpuPCores} performance)`;
+  }
+  if (hw.cpuECores !== null) {
+    return `${hw.cpuCores} total (${hw.cpuECores} efficiency)`;
+  }
+  return String(hw.cpuCores);
+}
+function summarizeCategoryIssues(name, details) {
+  let crashes = 0;
+  let timeouts = 0;
+  let errors = 0;
+  for (const detail of details) {
+    const actual = detail.actual ?? "";
+    if (/^TIMEOUT\b/i.test(actual)) {
+      timeouts++;
+      continue;
+    }
+    if (/^ERROR:/i.test(actual)) {
+      errors++;
+      if (/model has crashed|has crashed without additional information|model crashed/i.test(actual)) {
+        crashes++;
+      }
+    }
+  }
+  return { name, crashes, timeouts, errors };
+}
 function printHardwareTable(hw) {
   const table = new Table({
     head: [chalk3.bold("Hardware"), chalk3.bold("Value")],
     style: { head: [], border: [] }
   });
-  const coresDetail = hw.cpuPCores ? `${hw.cpuCores} (${hw.cpuPCores} performance + ${hw.cpuECores ?? 0} efficiency)` : String(hw.cpuCores);
+  const coresDetail = formatCpuCoresLabel(hw);
   const cpuLine = hw.cpuFreqGHz ? `${hw.cpu} @ ${hw.cpuFreqGHz} GHz` : hw.cpu;
   const ramLine = hw.memoryType ? `${hw.totalMemoryGB} GB ${hw.memoryType} (${hw.freeMemoryGB} GB free)` : `${hw.totalMemoryGB} GB (${hw.freeMemoryGB} GB free)`;
   const swapColor = hw.swapUsedGB > hw.swapTotalGB * 0.5 ? chalk3.yellow : chalk3.green;
@@ -8056,7 +8593,10 @@ function printPerformanceTable(perf, benchEnvironment) {
   const ttftColor = perf.ttft < 1e3 ? chalk3.green : perf.ttft < 3e3 ? chalk3.yellow : chalk3.red;
   const memColor = perf.memoryPercent < 50 ? chalk3.green : perf.memoryPercent < 80 ? chalk3.yellow : chalk3.red;
   table.push(
-    ["Tokens/sec", tpsColor(`${perf.tokensPerSecond.toFixed(1)} tok/s`)],
+    [
+      "Tokens/sec",
+      perf.tokensPerSecondEstimated ? chalk3.yellow(`${perf.tokensPerSecond.toFixed(1)} tok/s (estimated)`) : tpsColor(`${perf.tokensPerSecond.toFixed(1)} tok/s`)
+    ],
     [
       "First Chunk Latency",
       perf.firstChunkMs !== void 0 ? formatDuration(perf.firstChunkMs) : chalk3.dim("N/A (stream metric unavailable)")
@@ -8071,8 +8611,8 @@ function printPerformanceTable(perf, benchEnvironment) {
     ["Completion Tokens", String(perf.completionTokens)],
     [
       "Model Memory Footprint",
-      memColor(
-        `${perf.memoryUsedGB.toFixed(1)} GB (+${perf.memoryPercent.toFixed(0)}%)`
+      perf.memoryFootprintAvailable === false ? chalk3.dim("N/A (model already loaded; runtime metric unavailable)") : memColor(
+        `${perf.memoryUsedGB.toFixed(1)} GB (+${perf.memoryPercent.toFixed(0)}%)${perf.memoryFootprintEstimated ? " (estimated)" : ""}`
       )
     ],
     [
@@ -8133,6 +8673,18 @@ function printQualityTable(quality, timePenalties) {
     ]);
   }
   console.log(table.toString());
+  const issueSummaries = categories.map((cat) => summarizeCategoryIssues(cat.name, cat.result.details)).filter((summary) => summary.errors > 0 || summary.timeouts > 0);
+  if (issueSummaries.length > 0) {
+    console.log(chalk3.yellow("Execution issues detected during quality benchmark:"));
+    for (const summary of issueSummaries) {
+      const parts = [];
+      if (summary.crashes > 0) parts.push(`${summary.crashes} crash${summary.crashes > 1 ? "es" : ""}`);
+      const nonCrashErrors = summary.errors - summary.crashes;
+      if (nonCrashErrors > 0) parts.push(`${nonCrashErrors} error${nonCrashErrors > 1 ? "s" : ""}`);
+      if (summary.timeouts > 0) parts.push(`${summary.timeouts} timeout${summary.timeouts > 1 ? "s" : ""}`);
+      console.log(chalk3.yellow(`  \u2022 ${summary.name}: ${parts.join(", ")} (scored as incorrect)`));
+    }
+  }
 }
 function printSummaryTable(results) {
   const termWidth = process.stdout.columns || 80;
@@ -8146,7 +8698,7 @@ function printSummaryTable(results) {
     chalk3.bold("Model"),
     chalk3.bold("tok/s"),
     chalk3.bold("TTFT"),
-    chalk3.bold("Host RAM%"),
+    chalk3.bold("Model RAM%"),
     chalk3.bold("Profile"),
     chalk3.bold("HW Fit"),
     chalk3.bold("Quality"),
@@ -8159,17 +8711,23 @@ function printSummaryTable(results) {
     style: { head: [], border: [] },
     wordWrap: true
   });
+  const formatSummaryModelMemory = (result) => {
+    if (result.performance.memoryFootprintAvailable === false) return "N/A";
+    const value = `${result.performance.memoryPercent.toFixed(0)}%`;
+    return result.performance.memoryFootprintEstimated ? `${value}~` : value;
+  };
   for (const r of results) {
     const vColor = r.fitness.verdict === "EXCELLENT" ? chalk3.green.bold : r.fitness.verdict === "GOOD" ? chalk3.blue.bold : r.fitness.verdict === "MARGINAL" ? chalk3.yellow.bold : chalk3.red.bold;
     const flags = [];
     if (r.hardware.powerMode === "low-power") flags.push(chalk3.red("ECO"));
     if (r.modelInfo?.thinkingDetected) flags.push(chalk3.magenta("THINK"));
     const modelName = compact && r.model.length > 20 ? r.model.slice(0, 18) + ".." : r.model;
+    const throughputLabel = r.performance.tokensPerSecondEstimated ? `~${r.performance.tokensPerSecond.toFixed(1)}` : `${r.performance.tokensPerSecond.toFixed(1)}`;
     const row = [
       modelName,
-      `${r.performance.tokensPerSecond.toFixed(1)}`,
+      throughputLabel,
       formatDuration(r.performance.ttft),
-      r.performance.memoryHostPercent !== void 0 ? `${r.performance.memoryHostPercent.toFixed(0)}%` : "n/a",
+      formatSummaryModelMemory(r),
       r.fitness.tuning.profile,
       scoreColor(r.fitness.hardwareFitScore)(
         `${compactBar(r.fitness.hardwareFitScore)} ${r.fitness.hardwareFitScore}%`
@@ -8194,9 +8752,8 @@ function printSummaryTable(results) {
 // ../src/ui/verdict.ts
 import chalk4 from "chalk";
 var BOX_INNER = 60;
-var ANSI_RE = /\x1b\[[0-9;]*m/g;
 function visibleLength(str) {
-  return str.replace(ANSI_RE, "").length;
+  return stripAnsi(str).length;
 }
 function wrapText(text, maxWidth) {
   if (visibleLength(text) <= maxWidth) return [text];
@@ -8566,6 +9123,15 @@ function assertUploaderConfig(config) {
     );
   }
 }
+function resolveUploadedMemoryPercent(result) {
+  return result.performance.memoryFootprintAvailable === false ? null : result.performance.memoryPercent;
+}
+function resolveUploadedModelFormat(result) {
+  if (result.metadata.modelFormat?.trim()) return result.metadata.modelFormat;
+  const runtimeBackend = result.metadata.runtimeBackend ?? "ollama";
+  if (runtimeBackend === "ollama") return "gguf";
+  return "unknown";
+}
 async function uploadBenchResult(result, options = {}) {
   const config = resolveUploaderConfig();
   assertUploaderConfig(config);
@@ -8578,7 +9144,7 @@ async function uploadBenchResult(result, options = {}) {
     thinking_detected: result.modelInfo?.thinkingDetected ?? null,
     tokens_per_second: result.performance.tokensPerSecond,
     ttft_ms: result.performance.ttft,
-    memory_percent: result.performance.memoryHostPercent ?? result.performance.memoryPercent,
+    memory_percent: resolveUploadedMemoryPercent(result),
     thinking_tokens_estimate: result.performance.thinkingTokensEstimate ?? null,
     verdict: result.fitness.verdict,
     global_score: result.fitness.globalScore,
@@ -8595,7 +9161,7 @@ async function uploadBenchResult(result, options = {}) {
     benchmark_spec_version: result.metadata.benchmarkSpecVersion,
     runtime_version: result.metadata.runtimeVersion,
     runtime_backend: result.metadata.runtimeBackend ?? "ollama",
-    model_format: result.metadata.modelFormat ?? "gguf",
+    model_format: resolveUploadedModelFormat(result),
     raw_log_hash: result.metadata.rawLogHash,
     result
   };
@@ -8848,6 +9414,7 @@ async function promptSubmitterProfile(deps, defaults = {}) {
     }
     console.log(chalk6.yellow("Nickname must be between 2 and 40 characters."));
   }
+  console.log(chalk6.dim("Your email is never stored \u2014 only a SHA-256 hash is saved to match your leaderboard entries."));
   while (true) {
     const emailHint = defaults.email ? ` [${defaults.email}]` : "";
     const emailAnswer = await ask(`Email${emailHint} > `);
@@ -9008,7 +9575,7 @@ async function promptThinkingMode() {
 }
 // ../src/commands/bench.ts
-var BENCHMARK_SPEC_VERSION = "0.2.0";
+var BENCHMARK_SPEC_VERSION = "0.2.1";
 var PROMPT_PACK_VERSION = "0.1.0";
 async function benchCommand(options) {
   if (options.backend !== void 0) {
@@ -9101,6 +9668,11 @@ async function benchCommand(options) {
   if (!silent && thinkEnabled) {
     infoMsg("Thinking mode enabled \u2014 models that support it will use extended reasoning.");
   }
+  if (!silent) {
+    infoMsg(
+      `Benchmark profile ${BENCHMARK_PROFILE_VERSION}: temperature=0, top_p=1, seed=42, context=runtime default.`
+    );
+  }
   try {
     const results = [];
     const failedModels = [];
@@ -9130,7 +9702,7 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
           minSuccessfulPrompts: options.perfMinSuccessfulPrompts,
           failOnPromptError: options.perfStrict,
           think: thinkEnabled,
-          streamStallTimeoutMs: options.lmStudioStreamStallTimeoutMs
+          streamStallTimeoutMs: options.streamStallTimeoutMs
         });
         const perf = perfResult.metrics;
         const benchEnvironment = perfResult.benchEnvironment;
@@ -9168,13 +9740,22 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
           printVerdict(modelName, fitness);
         }
         const matchedModel = allModels.find((m) => m.name === modelName);
-        const modelInfo = matchedModel ? {
-          parameterSize: matchedModel.parameterSize,
-          quantization: matchedModel.quantization,
-          family: matchedModel.family,
-          // Persist the configured benchmark mode (not model auto-detection).
-          thinkingDetected: thinkEnabled
-        } : { thinkingDetected: thinkEnabled };
+        let resolvedModel = matchedModel;
+        if (matchedModel?.modelFormat === void 0) {
+          try {
+            resolvedModel = await resolveRuntimeModel(modelName) ?? matchedModel;
+          } catch {
+            resolvedModel = matchedModel;
+          }
+        }
+        const modelMetadataSource = resolvedModel ?? matchedModel;
+        const modelInfo = modelMetadataSource ? {
+          parameterSize: modelMetadataSource.parameterSize,
+          quantization: modelMetadataSource.quantization,
+          family: modelMetadataSource.family,
+          // Persist actual observed thinking behavior from the benchmark run.
+          thinkingDetected: perfResult.thinkingDetected
+        } : { thinkingDetected: perfResult.thinkingDetected };
         const partialResult = {
           model: modelName,
           modelInfo,
@@ -9189,7 +9770,8 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
             promptPackVersion: PROMPT_PACK_VERSION,
             runtimeVersion,
             runtimeBackend: getRuntimeName(),
-            modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat()
+            modelFormat: resolvedModel?.modelFormat ?? (getRuntimeName() === "ollama" ? getRuntimeModelFormat() : "unknown"),
+            benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
           }
         };
         const rawLogHash = createHash3("sha256").update(JSON.stringify(partialResult)).digest("hex");
@@ -9550,7 +10132,7 @@ async function handleShareResult(args) {
 // src/index.ts
 var server = new McpServer({
   name: "metrillm",
-  version: "0.1.0"
+  version: "0.2.1"
 });
 for (const def of toolDefinitions) {
   switch (def.name) {