metrillm-mcp 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -28,10 +28,22 @@ import { Ollama } from "ollama";
28
28
 
29
29
  // ../src/utils.ts
30
30
  import vm from "vm";
31
- import { execFile } from "child_process";
31
+ import { execFile, spawn } from "child_process";
32
32
  function openUrl(url) {
33
- const cmd = process.platform === "darwin" ? "open" : process.platform === "win32" ? "start" : "xdg-open";
34
- execFile(cmd, [url]);
33
+ if (process.platform === "win32") {
34
+ const child2 = spawn("cmd", ["/c", "start", "", url], {
35
+ windowsHide: true,
36
+ stdio: "ignore"
37
+ });
38
+ child2.on("error", () => {
39
+ });
40
+ child2.unref();
41
+ return;
42
+ }
43
+ const cmd = process.platform === "darwin" ? "open" : "xdg-open";
44
+ const child = execFile(cmd, [url]);
45
+ child.on("error", () => {
46
+ });
35
47
  }
36
48
  function avg(nums) {
37
49
  if (nums.length === 0) return 0;
@@ -515,7 +527,8 @@ function extractCodeBlock(text, preferredFunctionName) {
515
527
  var client = new Ollama();
516
528
  var DEFAULT_OLLAMA_HOST = "http://127.0.0.1:11434";
517
529
  var OLLAMA_INIT_TIMEOUT_MS = 12e4;
518
- var STREAM_STALL_TIMEOUT_MS = 3e4;
530
+ var DEFAULT_STREAM_STALL_TIMEOUT_MS = 3e4;
531
+ var SHARED_STREAM_STALL_TIMEOUT_ENV = "METRILLM_STREAM_STALL_TIMEOUT_MS";
519
532
  function getOllamaBaseUrl() {
520
533
  const configured = process.env.OLLAMA_HOST?.trim();
521
534
  if (!configured) return DEFAULT_OLLAMA_HOST;
@@ -575,6 +588,23 @@ function isUnsupportedSamplingOptionError(err) {
575
588
  if (!mentionsSampling) return false;
576
589
  return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
577
590
  }
591
+ function parseNonNegativeInt(value) {
592
+ if (!/^\d+$/.test(value)) return null;
593
+ const parsed = Number.parseInt(value, 10);
594
+ if (!Number.isSafeInteger(parsed) || parsed < 0) return null;
595
+ return parsed;
596
+ }
597
+ function resolveStreamStallTimeoutMs(override) {
598
+ if (override !== void 0) {
599
+ if (!Number.isFinite(override) || override < 0) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
600
+ return override === 0 ? void 0 : Math.trunc(override);
601
+ }
602
+ const configured = process.env[SHARED_STREAM_STALL_TIMEOUT_ENV]?.trim();
603
+ if (!configured) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
604
+ const parsed = parseNonNegativeInt(configured);
605
+ if (parsed === null) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
606
+ return parsed === 0 ? void 0 : parsed;
607
+ }
578
608
  function buildGenerateRequest(model, prompt, options, includeSampling) {
579
609
  return {
580
610
  model,
@@ -594,6 +624,8 @@ async function generate(model, prompt, options) {
594
624
  return generateStream(model, prompt, void 0, options);
595
625
  }
596
626
  async function generateStream(model, prompt, callbacks, options) {
627
+ const stallTimeoutMs = resolveStreamStallTimeoutMs(options?.stall_timeout_ms);
628
+ let abortedByStallTimeout = false;
597
629
  const initializeStream = (includeSampling) => withTimeout(
598
630
  client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
599
631
  OLLAMA_INIT_TIMEOUT_MS,
@@ -615,10 +647,12 @@ async function generateStream(model, prompt, callbacks, options) {
615
647
  let firstChunkSeen = false;
616
648
  let stallTimer = null;
617
649
  const resetStallTimer = () => {
650
+ if (stallTimeoutMs === void 0) return;
618
651
  if (stallTimer) clearTimeout(stallTimer);
619
652
  stallTimer = setTimeout(() => {
653
+ abortedByStallTimeout = true;
620
654
  client.abort();
621
- }, STREAM_STALL_TIMEOUT_MS);
655
+ }, stallTimeoutMs);
622
656
  };
623
657
  try {
624
658
  resetStallTimer();
@@ -653,6 +687,9 @@ async function generateStream(model, prompt, callbacks, options) {
653
687
  if (stallTimer) clearTimeout(stallTimer);
654
688
  }
655
689
  if (!result) {
690
+ if (abortedByStallTimeout && stallTimeoutMs !== void 0) {
691
+ throw new Error(`Ollama stream timed out after ${stallTimeoutMs}ms`);
692
+ }
656
693
  throw new Error("Stream ended without done signal");
657
694
  }
658
695
  callbacks?.onDone?.(result);
@@ -682,14 +719,18 @@ function abortOngoingRequests() {
682
719
  import os from "os";
683
720
  import path from "path";
684
721
  import { promises as fs } from "fs";
722
+ import { execFile as execFile2 } from "child_process";
685
723
  var DEFAULT_LM_STUDIO_BASE_URL = "http://127.0.0.1:1234";
686
724
  var LM_STUDIO_INIT_TIMEOUT_MS = 15e3;
687
725
  var LM_STUDIO_METADATA_TIMEOUT_MS = 2e3;
688
- var DEFAULT_STREAM_STALL_TIMEOUT_MS = 18e4;
726
+ var DEFAULT_STREAM_STALL_TIMEOUT_MS2 = 3e4;
727
+ var LM_STUDIO_CLI_TIMEOUT_MS = 8e3;
728
+ var SHARED_STREAM_STALL_TIMEOUT_ENV2 = "METRILLM_STREAM_STALL_TIMEOUT_MS";
689
729
  var DEFAULT_LM_STUDIO_HOME_DIR = path.join(os.homedir(), ".lmstudio");
690
730
  var DEFAULT_LM_STUDIO_MODELS_DIR = path.join(DEFAULT_LM_STUDIO_HOME_DIR, "models");
691
731
  var LM_STUDIO_HOME_DIR_ENV = "LM_STUDIO_HOME_DIR";
692
732
  var LM_STUDIO_MODELS_DIR_ENV = "LM_STUDIO_MODELS_DIR";
733
+ var LM_STUDIO_CLI_PATH_ENV = "LM_STUDIO_CLI_PATH";
693
734
  var defaultKeepAlive2;
694
735
  var activeAbortControllers = /* @__PURE__ */ new Set();
695
736
  var directorySizeCache = /* @__PURE__ */ new Map();
@@ -717,14 +758,9 @@ function assertThinkingModeRespected(model, think, response, reasoning) {
717
758
  );
718
759
  }
719
760
  }
720
- function buildThinkingConfig(think) {
721
- if (think === void 0) return {};
722
- const effort = think ? "high" : "low";
723
- return {
724
- include_reasoning: think,
725
- reasoning_effort: effort,
726
- reasoning: { effort }
727
- };
761
+ function buildNativeThinkingOption(think) {
762
+ if (think !== true) return void 0;
763
+ return "high";
728
764
  }
729
765
  function hasSamplingOverrides2(options) {
730
766
  return options?.top_p !== void 0 || options?.seed !== void 0;
@@ -768,38 +804,106 @@ function buildLMStudioRequestError(kind, model, status, statusText, body) {
768
804
  const suffix = backendMessage ? ` ${backendMessage}` : "";
769
805
  return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
770
806
  }
771
- function buildChatCompletionBody(model, prompt, options, stream, includeSampling) {
772
- const messages = options?.think === false ? [
773
- { role: "system", content: NON_THINKING_SYSTEM_PROMPT },
774
- { role: "user", content: prompt }
775
- ] : [{ role: "user", content: prompt }];
807
+ function buildNativeChatBody(model, prompt, options, stream, includeSampling) {
808
+ const reasoning = buildNativeThinkingOption(options?.think);
776
809
  return {
777
810
  model,
778
- messages,
811
+ input: prompt,
779
812
  temperature: options?.temperature ?? 0,
780
813
  ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
781
814
  ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
782
815
  max_tokens: options?.num_predict ?? 512,
783
816
  stream,
784
- ...stream ? { stream_options: { include_usage: true } } : {},
785
- ...buildThinkingConfig(options?.think)
817
+ ...reasoning !== void 0 ? { reasoning } : {},
818
+ ...options?.think === false ? { system_prompt: NON_THINKING_SYSTEM_PROMPT } : {}
786
819
  };
787
820
  }
788
- function parseNonNegativeInt(value) {
821
+ function getNativeStatNumber(value) {
822
+ if (typeof value !== "number" || !Number.isFinite(value) || value < 0) return void 0;
823
+ return value;
824
+ }
825
+ function flattenNativeText(value, depth = 0) {
826
+ if (depth > 3 || value == null) return "";
827
+ if (typeof value === "string") return value;
828
+ if (Array.isArray(value)) {
829
+ return value.map((item) => flattenNativeText(item, depth + 1)).join("");
830
+ }
831
+ if (typeof value === "object") {
832
+ const candidate = value;
833
+ return flattenNativeText(candidate.text, depth + 1) || flattenNativeText(candidate.content, depth + 1) || flattenNativeText(candidate.delta, depth + 1) || flattenNativeText(candidate.value, depth + 1);
834
+ }
835
+ return "";
836
+ }
837
+ function collectNativeOutput(output4) {
838
+ if (!Array.isArray(output4)) {
839
+ return { response: "", reasoning: "" };
840
+ }
841
+ let response = "";
842
+ let reasoning = "";
843
+ for (const item of output4) {
844
+ if (typeof item !== "object" || item === null) continue;
845
+ const nativeItem = item;
846
+ const text = flattenNativeText(nativeItem.text ?? nativeItem.content).trim();
847
+ if (!text) continue;
848
+ const type = asNonEmptyString(nativeItem.type)?.toLowerCase() ?? "";
849
+ if (type.includes("reason")) {
850
+ reasoning += text;
851
+ } else {
852
+ response += text;
853
+ }
854
+ }
855
+ return { response, reasoning };
856
+ }
857
+ function extractNativeStats(payload) {
858
+ if (typeof payload !== "object" || payload === null) return void 0;
859
+ const direct = payload.stats;
860
+ if (direct) return direct;
861
+ const result = payload.result?.stats;
862
+ return result;
863
+ }
864
+ function extractNativeResponse(payload) {
865
+ if (typeof payload !== "object" || payload === null) {
866
+ return { response: "", reasoning: "" };
867
+ }
868
+ const resultOutput = payload.result?.output;
869
+ const directOutput = payload.output;
870
+ const fromResult = collectNativeOutput(resultOutput);
871
+ if (fromResult.response || fromResult.reasoning) return fromResult;
872
+ return collectNativeOutput(directOutput);
873
+ }
874
+ function extractNativeDelta(payload) {
875
+ if (typeof payload !== "object" || payload === null) {
876
+ return { response: "", reasoning: "" };
877
+ }
878
+ const type = asNonEmptyString(payload.type)?.toLowerCase() ?? "";
879
+ const directText = flattenNativeText(payload.delta);
880
+ const fallbackText = directText || flattenNativeText(payload.content) || flattenNativeText(payload.text);
881
+ if (!fallbackText) {
882
+ return { response: "", reasoning: "" };
883
+ }
884
+ if (type.includes("reason")) {
885
+ return { response: "", reasoning: fallbackText };
886
+ }
887
+ if (type.includes("message") || type.includes("text") || type.includes("content")) {
888
+ return { response: fallbackText, reasoning: "" };
889
+ }
890
+ return { response: fallbackText, reasoning: "" };
891
+ }
892
+ function parseNonNegativeInt2(value) {
789
893
  if (!/^\d+$/.test(value)) return null;
790
894
  const parsed = Number.parseInt(value, 10);
791
895
  if (!Number.isSafeInteger(parsed) || parsed < 0) return null;
792
896
  return parsed;
793
897
  }
794
- function resolveStreamStallTimeoutMs(override) {
898
+ function resolveStreamStallTimeoutMs2(override) {
795
899
  if (override !== void 0) {
796
- if (!Number.isFinite(override) || override < 0) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
900
+ if (!Number.isFinite(override) || override < 0) return DEFAULT_STREAM_STALL_TIMEOUT_MS2;
797
901
  return override === 0 ? void 0 : Math.trunc(override);
798
902
  }
799
- const configured = process.env.LM_STUDIO_STREAM_STALL_TIMEOUT_MS?.trim();
800
- if (!configured) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
801
- const parsed = parseNonNegativeInt(configured);
802
- if (parsed === null) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
903
+ const configured = process.env[SHARED_STREAM_STALL_TIMEOUT_ENV2]?.trim();
904
+ if (!configured) return DEFAULT_STREAM_STALL_TIMEOUT_MS2;
905
+ const parsed = parseNonNegativeInt2(configured);
906
+ if (parsed === null) return DEFAULT_STREAM_STALL_TIMEOUT_MS2;
803
907
  return parsed === 0 ? void 0 : parsed;
804
908
  }
805
909
  function getLMStudioBaseUrl() {
@@ -822,25 +926,29 @@ function getLMStudioHeaders() {
822
926
  }
823
927
  return headers;
824
928
  }
825
- function extractUsage(payload) {
826
- if (typeof payload !== "object" || payload === null) return void 0;
827
- const usage = payload.usage;
828
- if (!usage) return void 0;
829
- return usage;
830
- }
831
- function extractChoice2(payload) {
832
- if (typeof payload !== "object" || payload === null) return void 0;
833
- const choices = payload.choices;
834
- if (!choices || choices.length === 0) return void 0;
835
- return choices[0];
836
- }
837
- function extractContent(choice) {
838
- const content = choice?.delta?.content ?? choice?.message?.content;
839
- return typeof content === "string" ? content : "";
929
+ function getUsageTokenCount(value) {
930
+ if (typeof value !== "number" || !Number.isFinite(value)) return 0;
931
+ if (value <= 0) return 0;
932
+ return Math.trunc(value);
933
+ }
934
+ function estimateCompletionTokensFallback(text) {
935
+ const normalized = text.trim();
936
+ if (!normalized) return 0;
937
+ const cjkMatches = normalized.match(/[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/gu);
938
+ const cjkCount = cjkMatches?.length ?? 0;
939
+ const withoutCjk = normalized.replace(
940
+ /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/gu,
941
+ ""
942
+ );
943
+ const nonCjkChars = withoutCjk.replace(/\s+/g, "").length;
944
+ const nonCjkHeuristic = Math.ceil(nonCjkChars / 4);
945
+ const whitespaceEstimate = estimateTokenCount(normalized);
946
+ return Math.max(1, Math.max(whitespaceEstimate, cjkCount + nonCjkHeuristic));
840
947
  }
841
- function extractReasoning(choice) {
842
- const reasoning = choice?.delta?.reasoning_content ?? choice?.delta?.reasoning ?? choice?.message?.reasoning_content ?? choice?.message?.reasoning;
843
- return typeof reasoning === "string" ? reasoning : "";
948
+ function resolveCompletionTokenCount(reportedTokenCount, response, reasoning) {
949
+ const reported = getUsageTokenCount(reportedTokenCount);
950
+ if (reported > 0) return reported;
951
+ return estimateCompletionTokensFallback(`${reasoning} ${response}`);
844
952
  }
845
953
  function asNonEmptyString(value) {
846
954
  if (typeof value !== "string") return void 0;
@@ -1090,11 +1198,17 @@ async function resolveLocalModelMetadata(modelId, apiModel, modelsRootDir) {
1090
1198
  const size = await readDirectorySizeBytes(source.fullPath);
1091
1199
  if (size > bestSize) bestSize = size;
1092
1200
  if (size > 0) {
1093
- return { size, parameterSize: definition.parameterSize };
1201
+ return {
1202
+ size,
1203
+ parameterSize: definition.parameterSize
1204
+ };
1094
1205
  }
1095
1206
  }
1096
1207
  if (bestSize > 0) {
1097
- return { size: bestSize, parameterSize: definition.parameterSize };
1208
+ return {
1209
+ size: bestSize,
1210
+ parameterSize: definition.parameterSize
1211
+ };
1098
1212
  }
1099
1213
  const fallback = await resolvePublisherModelMetadata(modelId, apiModel, modelsRootDir);
1100
1214
  if (fallback.size > 0) {
@@ -1103,7 +1217,10 @@ async function resolveLocalModelMetadata(modelId, apiModel, modelsRootDir) {
1103
1217
  parameterSize: definition.parameterSize ?? fallback.parameterSize
1104
1218
  };
1105
1219
  }
1106
- return { size: 0, parameterSize: definition.parameterSize ?? fallback.parameterSize };
1220
+ return {
1221
+ size: 0,
1222
+ parameterSize: definition.parameterSize ?? fallback.parameterSize
1223
+ };
1107
1224
  }
1108
1225
  function parseSizeBytes(model) {
1109
1226
  if (!model) return 0;
@@ -1141,6 +1258,21 @@ function inferParameterSizeFromModelId(modelId) {
1141
1258
  }
1142
1259
  return void 0;
1143
1260
  }
1261
+ function resolveModelFormat(apiModel, _localMetadata, _modelId) {
1262
+ return asNonEmptyString(apiModel?.compatibility_type);
1263
+ }
1264
+ function buildModelEntry(id, apiModel, localMetadata) {
1265
+ const apiSize = parseSizeBytes(apiModel);
1266
+ return {
1267
+ name: id,
1268
+ size: apiSize > 0 ? apiSize : localMetadata?.size ?? 0,
1269
+ parameterSize: localMetadata?.parameterSize ?? inferParameterSizeFromModelId(id),
1270
+ quantization: asNonEmptyString(apiModel?.quantization),
1271
+ runtimeStatus: asNonEmptyString(apiModel?.state),
1272
+ modelFormat: resolveModelFormat(apiModel, localMetadata, id),
1273
+ family: asNonEmptyString(apiModel?.arch) ?? asNonEmptyString(apiModel?.type) ?? asNonEmptyString(apiModel?.publisher)
1274
+ };
1275
+ }
1144
1276
  function isLoadedState(state) {
1145
1277
  if (!state) return false;
1146
1278
  const normalized = state.trim().toLowerCase();
@@ -1148,6 +1280,128 @@ function isLoadedState(state) {
1148
1280
  if (normalized === "loaded" || normalized === "ready") return true;
1149
1281
  return normalized.includes("loaded");
1150
1282
  }
1283
+ function execFileText(cmd, args, timeoutMs) {
1284
+ return new Promise((resolve, reject) => {
1285
+ execFile2(
1286
+ cmd,
1287
+ args,
1288
+ {
1289
+ timeout: timeoutMs,
1290
+ maxBuffer: 1024 * 1024,
1291
+ env: process.env
1292
+ },
1293
+ (err, stdout, stderr) => {
1294
+ if (err) {
1295
+ const error = err;
1296
+ error.stdout = stdout;
1297
+ error.stderr = stderr;
1298
+ reject(error);
1299
+ return;
1300
+ }
1301
+ resolve({ stdout, stderr });
1302
+ }
1303
+ );
1304
+ });
1305
+ }
1306
+ function isCommandMissingError(err) {
1307
+ return err instanceof Error && "code" in err && err.code === "ENOENT";
1308
+ }
1309
+ async function runLmsCli(args) {
1310
+ const configuredPath = asNonEmptyString(process.env[LM_STUDIO_CLI_PATH_ENV]);
1311
+ const fallbackPath = path.join(getLMStudioHomeDir(), "bin", "lms");
1312
+ const candidates = [
1313
+ configuredPath,
1314
+ "lms",
1315
+ fallbackPath
1316
+ ].filter(
1317
+ (candidate, index, list) => Boolean(candidate) && list.indexOf(candidate) === index
1318
+ );
1319
+ let lastError;
1320
+ for (const candidate of candidates) {
1321
+ try {
1322
+ return await execFileText(candidate, args, LM_STUDIO_CLI_TIMEOUT_MS);
1323
+ } catch (err) {
1324
+ lastError = err;
1325
+ if (isCommandMissingError(err)) continue;
1326
+ throw err;
1327
+ }
1328
+ }
1329
+ throw lastError ?? new Error("LM Studio CLI is not available.");
1330
+ }
1331
+ function normalizeCliToken(value) {
1332
+ return (value ?? "").trim().toLowerCase();
1333
+ }
1334
+ function matchesLoadedModelCliEntry(entry, model) {
1335
+ const target = normalizeCliToken(model);
1336
+ if (!target) return false;
1337
+ return [
1338
+ entry.identifier,
1339
+ entry.indexedModelIdentifier,
1340
+ entry.path,
1341
+ entry.modelKey
1342
+ ].some((candidate) => normalizeCliToken(candidate) === target);
1343
+ }
1344
+ async function listLoadedModelsFromCli() {
1345
+ const { stdout } = await runLmsCli(["ps", "--json"]);
1346
+ const parsed = JSON.parse(stdout);
1347
+ return Array.isArray(parsed) ? parsed : [];
1348
+ }
1349
+ function parseEstimatedBytes(output4) {
1350
+ const match = output4.match(/Estimated Total Memory:\s*([0-9]+(?:\.[0-9]+)?)\s*(KiB|MiB|GiB|TiB|KB|MB|GB|TB)/i);
1351
+ if (!match) return null;
1352
+ const value = Number.parseFloat(match[1] ?? "");
1353
+ const unit = (match[2] ?? "").toUpperCase();
1354
+ if (!Number.isFinite(value) || value <= 0) return null;
1355
+ const multipliers = {
1356
+ KIB: 1024,
1357
+ MIB: 1024 ** 2,
1358
+ GIB: 1024 ** 3,
1359
+ TIB: 1024 ** 4,
1360
+ KB: 1e3,
1361
+ MB: 1e3 ** 2,
1362
+ GB: 1e3 ** 3,
1363
+ TB: 1e3 ** 4
1364
+ };
1365
+ const multiplier = multipliers[unit];
1366
+ if (!multiplier) return null;
1367
+ return Math.round(value * multiplier);
1368
+ }
1369
+ async function estimateLoadedModelMemoryBytes(model) {
1370
+ let loadedEntry;
1371
+ try {
1372
+ const loadedModels = await listLoadedModelsFromCli();
1373
+ loadedEntry = loadedModels.find((entry) => matchesLoadedModelCliEntry(entry, model));
1374
+ } catch {
1375
+ loadedEntry = void 0;
1376
+ }
1377
+ if (!loadedEntry) return null;
1378
+ const candidateModelKeys = [
1379
+ loadedEntry?.path,
1380
+ loadedEntry?.indexedModelIdentifier,
1381
+ loadedEntry?.modelKey
1382
+ ].filter(
1383
+ (candidate, index, list) => Boolean(candidate?.trim()) && list.findIndex((item) => item === candidate) === index
1384
+ );
1385
+ for (const candidate of candidateModelKeys) {
1386
+ const args = ["load", "--estimate-only", "-y"];
1387
+ if (typeof loadedEntry?.contextLength === "number" && Number.isFinite(loadedEntry.contextLength) && loadedEntry.contextLength > 0) {
1388
+ args.push("--context-length", String(Math.trunc(loadedEntry.contextLength)));
1389
+ }
1390
+ args.push(candidate);
1391
+ try {
1392
+ const { stdout, stderr } = await runLmsCli(args);
1393
+ const estimated = parseEstimatedBytes(`${stdout}
1394
+ ${stderr}`);
1395
+ if (estimated !== null) return estimated;
1396
+ } catch (err) {
1397
+ const output4 = err instanceof Error ? `${String(err.stdout ?? "")}
1398
+ ${String(err.stderr ?? "")}` : "";
1399
+ const estimated = parseEstimatedBytes(output4);
1400
+ if (estimated !== null) return estimated;
1401
+ }
1402
+ }
1403
+ return null;
1404
+ }
1151
1405
  async function fetchApiModels() {
1152
1406
  try {
1153
1407
  const resp = await fetchWithTimeout(
@@ -1200,7 +1454,7 @@ async function getLMStudioVersion() {
1200
1454
  const localVersion = await resolveLocalLMStudioVersion();
1201
1455
  try {
1202
1456
  const resp = await fetchWithTimeout(
1203
- "/v1/models",
1457
+ "/api/v1/models",
1204
1458
  { method: "GET", headers: getLMStudioHeaders() },
1205
1459
  5e3,
1206
1460
  "LM Studio version check"
@@ -1215,7 +1469,7 @@ async function getLMStudioVersion() {
1215
1469
  }
1216
1470
  async function listModels2() {
1217
1471
  const resp = await fetchWithTimeout(
1218
- "/v1/models",
1472
+ "/api/v1/models",
1219
1473
  { method: "GET", headers: getLMStudioHeaders() },
1220
1474
  LM_STUDIO_INIT_TIMEOUT_MS,
1221
1475
  "LM Studio list models"
@@ -1233,25 +1487,25 @@ async function listModels2() {
1233
1487
  apiById.set(id, model);
1234
1488
  }
1235
1489
  const modelsRootDir = await resolveModelsRootDir();
1236
- const localMetadataById = /* @__PURE__ */ new Map();
1237
- for (const id of ids) {
1238
- const localMetadata = await resolveLocalModelMetadata(id, apiById.get(id), modelsRootDir);
1239
- localMetadataById.set(id, localMetadata);
1240
- }
1241
- return ids.map((id) => {
1242
- const apiModel = apiById.get(id);
1243
- const localMetadata = localMetadataById.get(id);
1244
- const apiSize = parseSizeBytes(apiModel);
1245
- return {
1246
- name: id,
1247
- size: apiSize > 0 ? apiSize : localMetadata?.size ?? 0,
1248
- parameterSize: localMetadata?.parameterSize ?? inferParameterSizeFromModelId(id),
1249
- quantization: asNonEmptyString(apiModel?.quantization),
1250
- runtimeStatus: asNonEmptyString(apiModel?.state),
1251
- modelFormat: asNonEmptyString(apiModel?.compatibility_type),
1252
- family: asNonEmptyString(apiModel?.arch) ?? asNonEmptyString(apiModel?.type) ?? asNonEmptyString(apiModel?.publisher)
1253
- };
1254
- });
1490
+ const localMetadataEntries = await Promise.all(
1491
+ ids.map(async (id) => {
1492
+ const localMetadata = await resolveLocalModelMetadata(id, apiById.get(id), modelsRootDir);
1493
+ return [id, localMetadata];
1494
+ })
1495
+ );
1496
+ const localMetadataById = new Map(
1497
+ localMetadataEntries
1498
+ );
1499
+ return ids.map((id) => buildModelEntry(id, apiById.get(id), localMetadataById.get(id)));
1500
+ }
1501
+ async function resolveModel(modelId) {
1502
+ const id = modelId.trim();
1503
+ if (!id) return null;
1504
+ const apiModels = await fetchApiModels();
1505
+ const apiModel = apiModels?.find((candidate) => asNonEmptyString(candidate.id) === id);
1506
+ const modelsRootDir = await resolveModelsRootDir();
1507
+ const localMetadata = await resolveLocalModelMetadata(id, apiModel, modelsRootDir);
1508
+ return buildModelEntry(id, apiModel, localMetadata);
1255
1509
  }
1256
1510
  async function listRunningModels2() {
1257
1511
  const apiModels = await fetchApiModels();
@@ -1272,11 +1526,11 @@ async function generate2(model, prompt, options) {
1272
1526
  activeAbortControllers.add(controller);
1273
1527
  try {
1274
1528
  const baseUrl = getLMStudioBaseUrl();
1275
- const url = new URL("/v1/chat/completions", baseUrl);
1529
+ const url = new URL("/api/v1/chat", baseUrl);
1276
1530
  const doRequest = (includeSampling) => fetch(url, {
1277
1531
  method: "POST",
1278
1532
  headers: getLMStudioHeaders(),
1279
- body: JSON.stringify(buildChatCompletionBody(model, prompt, options, false, includeSampling)),
1533
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, includeSampling)),
1280
1534
  signal: controller.signal
1281
1535
  });
1282
1536
  let resp = await doRequest(true);
@@ -1293,21 +1547,33 @@ async function generate2(model, prompt, options) {
1293
1547
  throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
1294
1548
  }
1295
1549
  const payload = await resp.json();
1296
- const choice = extractChoice2(payload);
1297
- const response = extractContent(choice);
1298
- const reasoning = extractReasoning(choice);
1550
+ const nativeResponse = extractNativeResponse(payload);
1551
+ const response = nativeResponse.response;
1552
+ const reasoning = nativeResponse.reasoning;
1299
1553
  assertThinkingModeRespected(model, options?.think, response, reasoning);
1300
- const usage = extractUsage(payload);
1554
+ const stats = extractNativeStats(payload);
1301
1555
  const totalDuration = Math.max(0, Date.now() - start) * 1e6;
1556
+ const outputTokens = getUsageTokenCount(stats?.total_output_tokens) || resolveCompletionTokenCount(void 0, response, reasoning);
1557
+ const throughput = getNativeStatNumber(stats?.tokens_per_second);
1558
+ const timeToFirstTokenSeconds = getNativeStatNumber(stats?.time_to_first_token_seconds);
1559
+ const modelLoadTimeSeconds = getNativeStatNumber(stats?.model_load_time_seconds);
1560
+ const evalCountEstimated = getUsageTokenCount(stats?.total_output_tokens) <= 0;
1561
+ const evalDuration = throughput !== void 0 && throughput > 0 && outputTokens > 0 ? Math.max(1, Math.round(outputTokens / throughput * 1e9)) : totalDuration;
1562
+ const promptEvalDuration = timeToFirstTokenSeconds !== void 0 ? Math.max(0, Math.round(timeToFirstTokenSeconds * 1e9)) : 0;
1563
+ const loadDuration = Math.max(
1564
+ 0,
1565
+ Math.round((modelLoadTimeSeconds ?? 0) * 1e9)
1566
+ );
1302
1567
  return {
1303
1568
  response,
1304
1569
  ...reasoning ? { thinking: reasoning } : {},
1305
1570
  totalDuration,
1306
- loadDuration: 0,
1307
- promptEvalCount: usage?.prompt_tokens ?? 0,
1308
- promptEvalDuration: 0,
1309
- evalCount: usage?.completion_tokens ?? 0,
1310
- evalDuration: totalDuration
1571
+ loadDuration,
1572
+ promptEvalCount: getUsageTokenCount(stats?.input_tokens),
1573
+ promptEvalDuration,
1574
+ evalCount: outputTokens,
1575
+ evalDuration,
1576
+ ...evalCountEstimated ? { evalCountEstimated: true } : {}
1311
1577
  };
1312
1578
  } catch (err) {
1313
1579
  if (err instanceof Error && err.name === "AbortError") {
@@ -1322,10 +1588,10 @@ async function generateStream2(model, prompt, callbacks, options) {
1322
1588
  const start = Date.now();
1323
1589
  const controller = new AbortController();
1324
1590
  activeAbortControllers.add(controller);
1325
- const stallTimeoutMs = resolveStreamStallTimeoutMs(options?.stall_timeout_ms);
1591
+ const stallTimeoutMs = resolveStreamStallTimeoutMs2(options?.stall_timeout_ms);
1326
1592
  let abortedByStallTimeout = false;
1327
1593
  const baseUrl = getLMStudioBaseUrl();
1328
- const url = new URL("/v1/chat/completions", baseUrl);
1594
+ const url = new URL("/api/v1/chat", baseUrl);
1329
1595
  let stallTimer = null;
1330
1596
  const resetStallTimer = () => {
1331
1597
  if (stallTimeoutMs === void 0) return;
@@ -1340,7 +1606,7 @@ async function generateStream2(model, prompt, callbacks, options) {
1340
1606
  const doRequest = (includeSampling) => fetch(url, {
1341
1607
  method: "POST",
1342
1608
  headers: getLMStudioHeaders(),
1343
- body: JSON.stringify(buildChatCompletionBody(model, prompt, options, true, includeSampling)),
1609
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, includeSampling)),
1344
1610
  signal: controller.signal
1345
1611
  });
1346
1612
  let resp = await doRequest(true);
@@ -1365,10 +1631,10 @@ async function generateStream2(model, prompt, callbacks, options) {
1365
1631
  let doneReceived = false;
1366
1632
  let fullResponse = "";
1367
1633
  let fullThinking = "";
1368
- let usage;
1634
+ let stats;
1369
1635
  let firstChunkSeen = false;
1370
- let firstTokenTime = null;
1371
- let lastTokenTime = null;
1636
+ let firstGeneratedTokenTime = null;
1637
+ let lastGeneratedTokenTime = null;
1372
1638
  const processDataLine = (rawLine) => {
1373
1639
  const line = rawLine.trim();
1374
1640
  if (!line.startsWith("data:")) return;
@@ -1384,18 +1650,27 @@ async function generateStream2(model, prompt, callbacks, options) {
1384
1650
  } catch {
1385
1651
  return;
1386
1652
  }
1387
- const choice = extractChoice2(payload);
1388
- const content = extractContent(choice);
1389
- const reasoning = extractReasoning(choice);
1390
- const chunkUsage = extractUsage(payload);
1391
- if (chunkUsage) usage = chunkUsage;
1653
+ const delta = extractNativeDelta(payload);
1654
+ const content = delta.response;
1655
+ const reasoning = delta.reasoning;
1656
+ const chunkStats = extractNativeStats(payload);
1657
+ if (chunkStats) stats = chunkStats;
1658
+ const aggregate = extractNativeResponse(payload);
1659
+ if (aggregate.response) {
1660
+ fullResponse = aggregate.response;
1661
+ }
1662
+ if (aggregate.reasoning) {
1663
+ fullThinking = aggregate.reasoning;
1664
+ }
1665
+ if (reasoning || content) {
1666
+ const now = Date.now();
1667
+ if (firstGeneratedTokenTime === null) firstGeneratedTokenTime = now;
1668
+ lastGeneratedTokenTime = now;
1669
+ }
1392
1670
  if (reasoning) {
1393
1671
  fullThinking += reasoning;
1394
1672
  }
1395
1673
  if (content) {
1396
- const now = Date.now();
1397
- if (firstTokenTime === null) firstTokenTime = now;
1398
- lastTokenTime = now;
1399
1674
  fullResponse += content;
1400
1675
  callbacks?.onToken?.(content);
1401
1676
  }
@@ -1415,6 +1690,14 @@ async function generateStream2(model, prompt, callbacks, options) {
1415
1690
  processDataLine(rawLine);
1416
1691
  }
1417
1692
  }
1693
+ buffered += decoder.decode();
1694
+ if (buffered.length > 0) {
1695
+ const lines = buffered.split("\n");
1696
+ buffered = lines.pop() ?? "";
1697
+ for (const rawLine of lines) {
1698
+ processDataLine(rawLine);
1699
+ }
1700
+ }
1418
1701
  if (buffered.trim().length > 0) {
1419
1702
  processDataLine(buffered);
1420
1703
  }
@@ -1423,16 +1706,25 @@ async function generateStream2(model, prompt, callbacks, options) {
1423
1706
  throw new Error("LM Studio stream ended without content");
1424
1707
  }
1425
1708
  const totalDuration = Math.max(0, Date.now() - start) * 1e6;
1426
- const evalDurationMs = firstTokenTime !== null && lastTokenTime !== null && lastTokenTime > firstTokenTime ? lastTokenTime - firstTokenTime : Date.now() - start;
1709
+ const outputTokens = getUsageTokenCount(stats?.total_output_tokens) || resolveCompletionTokenCount(void 0, fullResponse, fullThinking);
1710
+ const throughput = getNativeStatNumber(stats?.tokens_per_second);
1711
+ const timeToFirstTokenSeconds = getNativeStatNumber(stats?.time_to_first_token_seconds);
1712
+ const modelLoadTimeSeconds = getNativeStatNumber(stats?.model_load_time_seconds);
1713
+ const evalCountEstimated = getUsageTokenCount(stats?.total_output_tokens) <= 0;
1714
+ const evalDurationMs = throughput !== void 0 && throughput > 0 && outputTokens > 0 ? outputTokens / throughput * 1e3 : firstGeneratedTokenTime !== null && lastGeneratedTokenTime !== null && lastGeneratedTokenTime > firstGeneratedTokenTime ? lastGeneratedTokenTime - firstGeneratedTokenTime : Date.now() - start;
1427
1715
  const result = {
1428
1716
  response: fullResponse,
1429
1717
  ...fullThinking ? { thinking: fullThinking } : {},
1430
1718
  totalDuration,
1431
- loadDuration: 0,
1432
- promptEvalCount: usage?.prompt_tokens ?? 0,
1433
- promptEvalDuration: firstTokenTime !== null ? (firstTokenTime - start) * 1e6 : 0,
1434
- evalCount: usage?.completion_tokens ?? 0,
1435
- evalDuration: Math.max(1, evalDurationMs) * 1e6
1719
+ loadDuration: Math.max(
1720
+ 0,
1721
+ Math.round((modelLoadTimeSeconds ?? 0) * 1e9)
1722
+ ),
1723
+ promptEvalCount: getUsageTokenCount(stats?.input_tokens),
1724
+ promptEvalDuration: timeToFirstTokenSeconds !== void 0 ? Math.max(0, Math.round(timeToFirstTokenSeconds * 1e9)) : firstGeneratedTokenTime !== null ? (firstGeneratedTokenTime - start) * 1e6 : 0,
1725
+ evalCount: outputTokens,
1726
+ evalDuration: Math.max(1, Math.round(evalDurationMs * 1e6)),
1727
+ ...evalCountEstimated ? { evalCountEstimated: true } : {}
1436
1728
  };
1437
1729
  assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
1438
1730
  callbacks?.onDone?.(result);
@@ -1600,6 +1892,19 @@ function getRuntimeName() {
1600
1892
  function getRuntimeModelFormat() {
1601
1893
  return activeRuntime.modelFormat ?? "gguf";
1602
1894
  }
1895
+ async function resolveRuntimeModel(model) {
1896
+ if (activeRuntime.name === "lm-studio") {
1897
+ return resolveModel(model);
1898
+ }
1899
+ const knownModels = await activeRuntime.listModels();
1900
+ const matchedModel = knownModels.find((candidate) => candidate.name === model);
1901
+ if (matchedModel) return matchedModel;
1902
+ return {
1903
+ name: model,
1904
+ size: 0,
1905
+ modelFormat: activeRuntime.modelFormat ?? "gguf"
1906
+ };
1907
+ }
1603
1908
 
1604
1909
  // ../src/commands/bench.ts
1605
1910
  import { createHash as createHash3 } from "crypto";
@@ -1608,11 +1913,33 @@ import chalk8 from "chalk";
1608
1913
  // ../src/core/hardware.ts
1609
1914
  import si from "systeminformation";
1610
1915
  import os2 from "os";
1611
- import { execFile as execFile2 } from "child_process";
1916
+ import { execFile as execFile3 } from "child_process";
1612
1917
  import { readFile } from "fs/promises";
1918
+ function normalizeWhitespace(value) {
1919
+ return value.replace(/\s+/g, " ").trim();
1920
+ }
1921
+ function looksLikeGpuDescriptor(value) {
1922
+ return /\b(radeon|graphics|geforce|rtx|gtx|arc|iris|uhd|quadro|tesla|adreno|mali|powervr)\b/i.test(value);
1923
+ }
1924
+ function splitCpuAndInferredGpu(cpuLabel) {
1925
+ const normalized = normalizeWhitespace(cpuLabel);
1926
+ const withGpuMatch = normalized.match(/\s+(?:w\/\s*|with\s+)(.+)$/i);
1927
+ if (!withGpuMatch?.index) {
1928
+ return { cpu: normalized, inferredGpu: null };
1929
+ }
1930
+ const inferredGpu = normalizeWhitespace(withGpuMatch[1] ?? "");
1931
+ if (!looksLikeGpuDescriptor(inferredGpu)) {
1932
+ return { cpu: normalized, inferredGpu: null };
1933
+ }
1934
+ const cpu = normalizeWhitespace(normalized.slice(0, withGpuMatch.index));
1935
+ return {
1936
+ cpu: cpu || normalized,
1937
+ inferredGpu: inferredGpu || null
1938
+ };
1939
+ }
1613
1940
  function execCommand(cmd, args, timeoutMs = 3e3) {
1614
1941
  return new Promise((resolve) => {
1615
- const child = execFile2(cmd, args, { timeout: timeoutMs }, (err, stdout) => {
1942
+ const child = execFile3(cmd, args, { timeout: timeoutMs }, (err, stdout) => {
1616
1943
  if (err) return resolve("");
1617
1944
  resolve(stdout.trim());
1618
1945
  });
@@ -1746,11 +2073,14 @@ async function getHardwareInfo() {
1746
2073
  ]);
1747
2074
  const gpuController = graphics.controllers[0];
1748
2075
  const gpuNames = graphics.controllers.map((g) => g.model).filter(Boolean).join(", ");
2076
+ const cpuLabelRaw = normalizeWhitespace(`${cpu.manufacturer} ${cpu.brand}`);
2077
+ const { cpu: cpuLabel, inferredGpu } = splitCpuAndInferredGpu(cpuLabelRaw);
2078
+ const defaultIntegratedGpu = process.platform === "darwin" ? "Integrated / Apple Silicon" : "Integrated / Unknown";
1749
2079
  const gpuCoresRaw = gpuController?.cores;
1750
2080
  const gpuCores = gpuCoresRaw ? parseInt(String(gpuCoresRaw), 10) : null;
1751
2081
  const memType = memLayout.length > 0 ? memLayout[0].type : null;
1752
2082
  return {
1753
- cpu: `${cpu.manufacturer} ${cpu.brand}`,
2083
+ cpu: cpuLabel,
1754
2084
  cpuCores: cpu.cores,
1755
2085
  cpuPCores: cpu.performanceCores || null,
1756
2086
  cpuECores: cpu.efficiencyCores || null,
@@ -1760,7 +2090,7 @@ async function getHardwareInfo() {
1760
2090
  memoryType: memType || null,
1761
2091
  swapTotalGB: +(mem.swaptotal / 1024 / 1024 / 1024).toFixed(1),
1762
2092
  swapUsedGB: +(mem.swapused / 1024 / 1024 / 1024).toFixed(1),
1763
- gpu: gpuNames || "Integrated / Apple Silicon",
2093
+ gpu: normalizeWhitespace(gpuNames) || inferredGpu || defaultIntegratedGpu,
1764
2094
  gpuCores: gpuCores && !isNaN(gpuCores) ? gpuCores : null,
1765
2095
  gpuVramMB: gpuController?.vram ?? null,
1766
2096
  os: `${osInfo.distro} ${osInfo.release}`,
@@ -1795,6 +2125,10 @@ import chalk from "chalk";
1795
2125
 
1796
2126
  // ../src/ui/terminal.ts
1797
2127
  var supportsUnicode = process.platform !== "win32" || Boolean(process.env.WT_SESSION) || Boolean(process.env.TERM_PROGRAM);
2128
+ var ANSI_RE = /\x1b\[[0-9;]*[A-Za-z]/g;
2129
+ function stripAnsi(value) {
2130
+ return value.replace(ANSI_RE, "");
2131
+ }
1798
2132
 
1799
2133
  // ../src/ui/progress.ts
1800
2134
  var FUN_PHRASES = [
@@ -1995,6 +2329,8 @@ async function runPerformanceBench(model, options = {}) {
1995
2329
  optionalProbeWithAvailability(() => getSwapUsedGB(), 0),
1996
2330
  optionalProbe(() => detectBatteryPowered(), void 0)
1997
2331
  ]);
2332
+ const runningModelsBeforeWarmup = await optionalProbe(() => listRunningModels3(), []);
2333
+ const modelWasAlreadyLoaded = runningModelsBeforeWarmup.some((m) => m.name === model);
1998
2334
  const warmup = await withTimeout(
1999
2335
  generateStream3(model, WARMUP_PROMPT, void 0, {
2000
2336
  ...withBenchmarkProfile({
@@ -2012,15 +2348,6 @@ async function runPerformanceBench(model, options = {}) {
2012
2348
  const loadTime = warmup.loadDuration / 1e6;
2013
2349
  const runningModels = await listRunningModels3();
2014
2350
  const thisModel = runningModels.find((m) => m.name === model);
2015
- let installedModelSizeBytes = 0;
2016
- try {
2017
- const availableModels = await listModels3();
2018
- const listedModel = availableModels.find((m) => m.name === model);
2019
- if (listedModel && Number.isFinite(listedModel.size) && listedModel.size > 0) {
2020
- installedModelSizeBytes = listedModel.size;
2021
- }
2022
- } catch {
2023
- }
2024
2351
  spinner.succeed("Model loaded");
2025
2352
  const tpsValues = [];
2026
2353
  const firstChunkValues = [];
@@ -2034,6 +2361,7 @@ async function runPerformanceBench(model, options = {}) {
2034
2361
  let thinkingDetected = false;
2035
2362
  let totalThinkingTokens = 0;
2036
2363
  const cpuLoadSamples = [];
2364
+ let tokensPerSecondEstimated = false;
2037
2365
  for (let i = 0; i < BENCH_PROMPTS.length; i++) {
2038
2366
  spinner.start(`Running performance test ${i + 1}/${BENCH_PROMPTS.length}...`);
2039
2367
  let firstChunkTime = null;
@@ -2071,6 +2399,9 @@ async function runPerformanceBench(model, options = {}) {
2071
2399
  tpsValues.push(tps);
2072
2400
  totalEvalCount += result.evalCount;
2073
2401
  totalEvalDurationNs += result.evalDuration;
2402
+ if (result.evalCountEstimated) {
2403
+ tokensPerSecondEstimated = true;
2404
+ }
2074
2405
  if (firstChunkTime !== null) {
2075
2406
  firstChunkValues.push(firstChunkTime);
2076
2407
  }
@@ -2116,10 +2447,18 @@ async function runPerformanceBench(model, options = {}) {
2116
2447
  ]);
2117
2448
  let memoryUsedGB;
2118
2449
  let memoryPercent;
2119
- const loadedModelSizeBytes = thisModel && thisModel.size > 0 ? thisModel.size : installedModelSizeBytes;
2450
+ let memoryFootprintEstimated = false;
2451
+ const runtimeReportsComparableLoadedSize = runtimeName !== "lm-studio";
2452
+ const estimatedLoadedModelSizeBytes = runtimeName === "lm-studio" && modelWasAlreadyLoaded ? await optionalProbe(() => estimateLoadedModelMemoryBytes(model), null) : null;
2453
+ const loadedModelSizeBytes = runtimeReportsComparableLoadedSize && thisModel && thisModel.size > 0 ? thisModel.size : 0;
2454
+ const memoryFootprintAvailable = runtimeReportsComparableLoadedSize ? loadedModelSizeBytes > 0 || !modelWasAlreadyLoaded : (estimatedLoadedModelSizeBytes ?? 0) > 0 || !modelWasAlreadyLoaded;
2120
2455
  if (loadedModelSizeBytes > 0) {
2121
2456
  memoryUsedGB = loadedModelSizeBytes / 1024 ** 3;
2122
2457
  memoryPercent = memoryUsedGB / memAfter.totalGB * 100;
2458
+ } else if ((estimatedLoadedModelSizeBytes ?? 0) > 0) {
2459
+ memoryUsedGB = (estimatedLoadedModelSizeBytes ?? 0) / 1024 ** 3;
2460
+ memoryPercent = memoryUsedGB / memAfter.totalGB * 100;
2461
+ memoryFootprintEstimated = true;
2123
2462
  } else {
2124
2463
  memoryUsedGB = Math.max(0, memAfter.usedGB - memBefore.usedGB);
2125
2464
  memoryPercent = Math.max(0, memAfter.percent - memBefore.percent);
@@ -2141,6 +2480,7 @@ async function runPerformanceBench(model, options = {}) {
2141
2480
  return {
2142
2481
  metrics: {
2143
2482
  tokensPerSecond: totalEvalDurationNs > 0 ? totalEvalCount / (totalEvalDurationNs / 1e9) : avg(tpsValues),
2483
+ ...tokensPerSecondEstimated ? { tokensPerSecondEstimated: true } : {},
2144
2484
  ...firstChunkMs !== void 0 ? { firstChunkMs } : {},
2145
2485
  ttft: ttft >= 0 ? ttft : 3e4,
2146
2486
  // Fallback: 30s if no TTFT measured
@@ -2151,6 +2491,8 @@ async function runPerformanceBench(model, options = {}) {
2151
2491
  completionTokens: totalCompletionTokens,
2152
2492
  memoryUsedGB: +memoryUsedGB.toFixed(1),
2153
2493
  memoryPercent: +memoryPercent.toFixed(1),
2494
+ memoryFootprintAvailable,
2495
+ ...memoryFootprintEstimated ? { memoryFootprintEstimated: true } : {},
2154
2496
  memoryHostUsedGB: memAfter.usedGB,
2155
2497
  memoryHostPercent: memAfter.percent,
2156
2498
  tpsStdDev: tpsValues.length >= 2 ? stddev(tpsValues) : void 0,
@@ -2915,7 +3257,7 @@ Answer:`;
2915
3257
 
2916
3258
  // ../src/benchmarks/coding.ts
2917
3259
  import vm2 from "vm";
2918
- import { spawn } from "child_process";
3260
+ import { spawn as spawn2 } from "child_process";
2919
3261
  import { Worker } from "worker_threads";
2920
3262
 
2921
3263
  // ../src/datasets/coding.json
@@ -6588,7 +6930,7 @@ async function runTestsInSubprocess(code, task) {
6588
6930
  const total = task.tests.length;
6589
6931
  return new Promise((resolve) => {
6590
6932
  const wallTimeoutMs = computeIsolatedWallTimeoutMs(task);
6591
- const child = spawn(
6933
+ const child = spawn2(
6592
6934
  process.execPath,
6593
6935
  [
6594
6936
  "--max-old-space-size=96",
@@ -7891,13 +8233,15 @@ function computePerformanceScore(perf, hardware) {
7891
8233
  const tuning = deriveHardwareFitTuning(hardware);
7892
8234
  const safeTokensPerSecond = sanitizeNonNegative(perf.tokensPerSecond, 0);
7893
8235
  const safeTtft = sanitizeNonNegative(perf.ttft, tuning.ttft.hardMaxMs * 2);
7894
- const effectiveMemPercent = sanitizeNonNegative(
7895
- perf.memoryHostPercent ?? perf.memoryPercent,
7896
- 100
7897
- );
7898
8236
  const speed = Math.round(scoreSpeed(safeTokensPerSecond, tuning));
7899
8237
  const ttft = Math.round(scoreTTFT(safeTtft, tuning));
7900
- const memory = Math.round(scoreMemory(effectiveMemPercent));
8238
+ const memory = perf.memoryFootprintAvailable === false ? Math.round((speed + ttft) / (50 + 20) * 30) : Math.round(
8239
+ scoreMemory(
8240
+ // Score memory from the model's measured footprint/delta rather than
8241
+ // unrelated host RAM usage from other running workloads.
8242
+ sanitizeNonNegative(perf.memoryPercent, 100)
8243
+ )
8244
+ );
7901
8245
  return {
7902
8246
  total: clamp(speed + ttft + memory, 0, 100),
7903
8247
  speed,
@@ -8006,11 +8350,9 @@ function computeFitness(perf, quality, hardware, benchEnv) {
8006
8350
  const safeTokensPerSecond = sanitizeNonNegative(perf.tokensPerSecond, 0);
8007
8351
  const safeTtft = sanitizeNonNegative(perf.ttft, tuning.ttft.hardMaxMs * 10);
8008
8352
  const safeLoadTime = sanitizeNonNegative(perf.loadTime, tuning.loadTimeHardMaxMs * 10);
8009
- const hostMemoryPercent = sanitizeNonNegative(
8010
- perf.memoryHostPercent ?? perf.memoryPercent,
8011
- 100
8012
- );
8013
- const modelMemoryDeltaPercent = sanitizeNonNegative(perf.memoryPercent, 100);
8353
+ const modelMemoryFootprintAvailable = perf.memoryFootprintAvailable !== false;
8354
+ const modelMemoryDeltaPercent = modelMemoryFootprintAvailable ? sanitizeNonNegative(perf.memoryPercent, 100) : void 0;
8355
+ const hostMemoryPercent = perf.memoryHostPercent !== void 0 && Number.isFinite(perf.memoryHostPercent) && perf.memoryHostPercent >= 0 ? perf.memoryHostPercent : void 0;
8014
8356
  const disqualifiers = [];
8015
8357
  if (safeTokensPerSecond < tuning.speed.hardMin) {
8016
8358
  disqualifiers.push(
@@ -8027,12 +8369,12 @@ function computeFitness(perf, quality, hardware, benchEnv) {
8027
8369
  `Model load time too high: ${Math.round(safeLoadTime)}ms (maximum: ${tuning.loadTimeHardMaxMs}ms for ${tuning.profile} profile)`
8028
8370
  );
8029
8371
  }
8030
- const hostCritical = hostMemoryPercent > 95;
8031
- const modelDeltaCritical = modelMemoryDeltaPercent > 90;
8032
- const modelDeltaSignificant = modelMemoryDeltaPercent >= 10;
8033
- if (modelDeltaCritical || hostCritical && modelDeltaSignificant) {
8372
+ const hostCritical = hostMemoryPercent !== void 0 && hostMemoryPercent > 95;
8373
+ const modelDeltaCritical = modelMemoryDeltaPercent !== void 0 && modelMemoryDeltaPercent > 90;
8374
+ const modelDeltaSignificant = modelMemoryDeltaPercent !== void 0 && modelMemoryDeltaPercent >= 10;
8375
+ if (modelDeltaCritical) {
8034
8376
  disqualifiers.push(
8035
- `Memory usage critical: host ${hostMemoryPercent.toFixed(0)}%, model delta +${modelMemoryDeltaPercent.toFixed(0)}%`
8377
+ `Memory usage critical: model delta +${modelMemoryDeltaPercent.toFixed(0)}%`
8036
8378
  );
8037
8379
  }
8038
8380
  const verdictScore = globalScore ?? hardwareFitScore;
@@ -8067,9 +8409,28 @@ function computeFitness(perf, quality, hardware, benchEnv) {
8067
8409
  `Token speed is unstable (stddev ${perf.tpsStdDev.toFixed(1)} tok/s, mean ${safeTokensPerSecond.toFixed(1)} tok/s) \u2014 may indicate thermal throttling or memory pressure.`
8068
8410
  );
8069
8411
  }
8070
- if (hostCritical && !modelDeltaSignificant) {
8412
+ if (perf.tokensPerSecondEstimated) {
8413
+ warnings.push(
8414
+ "Token throughput is estimated from LM Studio output because native token stats were unavailable. Compare tok/s across backends cautiously."
8415
+ );
8416
+ }
8417
+ if (perf.memoryFootprintEstimated) {
8071
8418
  warnings.push(
8072
- `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) but model delta is limited (+${modelMemoryDeltaPercent.toFixed(0)}%). Verdict may be influenced by other running workloads.`
8419
+ "Model memory footprint is estimated via LM Studio CLI rather than measured from a fresh load."
8420
+ );
8421
+ }
8422
+ if (!modelMemoryFootprintAvailable) {
8423
+ warnings.push(
8424
+ "Model memory footprint was unavailable for this run, so RAM fit scoring was normalized from speed and TTFT only."
8425
+ );
8426
+ }
8427
+ if (hostCritical && !modelMemoryFootprintAvailable) {
8428
+ warnings.push(
8429
+ `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) and model footprint was unavailable. Results may be influenced by other running workloads.`
8430
+ );
8431
+ } else if (hostCritical && modelMemoryDeltaPercent !== void 0 && !modelDeltaSignificant) {
8432
+ warnings.push(
8433
+ `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) but model delta is limited (+${modelMemoryDeltaPercent.toFixed(0)}%). Results may be influenced by other running workloads.`
8073
8434
  );
8074
8435
  }
8075
8436
  if (hardware?.powerMode === "low-power") {
@@ -8157,6 +8518,21 @@ function getLevel(score) {
8157
8518
  if (score >= 25) return "Weak";
8158
8519
  return "Poor";
8159
8520
  }
8521
+ function formatCpuCoresLabel(hw) {
8522
+ if (hw.cpuPCores !== null && hw.cpuECores !== null) {
8523
+ return `${hw.cpuCores} total (${hw.cpuPCores} performance + ${hw.cpuECores} efficiency)`;
8524
+ }
8525
+ if (hw.cpuPCores !== null && hw.cpuCores > hw.cpuPCores) {
8526
+ return `${hw.cpuCores} threads (${hw.cpuPCores} cores)`;
8527
+ }
8528
+ if (hw.cpuPCores !== null) {
8529
+ return `${hw.cpuCores} total (${hw.cpuPCores} performance)`;
8530
+ }
8531
+ if (hw.cpuECores !== null) {
8532
+ return `${hw.cpuCores} total (${hw.cpuECores} efficiency)`;
8533
+ }
8534
+ return String(hw.cpuCores);
8535
+ }
8160
8536
  function summarizeCategoryIssues(name, details) {
8161
8537
  let crashes = 0;
8162
8538
  let timeouts = 0;
@@ -8181,7 +8557,7 @@ function printHardwareTable(hw) {
8181
8557
  head: [chalk3.bold("Hardware"), chalk3.bold("Value")],
8182
8558
  style: { head: [], border: [] }
8183
8559
  });
8184
- const coresDetail = hw.cpuPCores ? `${hw.cpuCores} (${hw.cpuPCores} performance + ${hw.cpuECores ?? 0} efficiency)` : String(hw.cpuCores);
8560
+ const coresDetail = formatCpuCoresLabel(hw);
8185
8561
  const cpuLine = hw.cpuFreqGHz ? `${hw.cpu} @ ${hw.cpuFreqGHz} GHz` : hw.cpu;
8186
8562
  const ramLine = hw.memoryType ? `${hw.totalMemoryGB} GB ${hw.memoryType} (${hw.freeMemoryGB} GB free)` : `${hw.totalMemoryGB} GB (${hw.freeMemoryGB} GB free)`;
8187
8563
  const swapColor = hw.swapUsedGB > hw.swapTotalGB * 0.5 ? chalk3.yellow : chalk3.green;
@@ -8217,7 +8593,10 @@ function printPerformanceTable(perf, benchEnvironment) {
8217
8593
  const ttftColor = perf.ttft < 1e3 ? chalk3.green : perf.ttft < 3e3 ? chalk3.yellow : chalk3.red;
8218
8594
  const memColor = perf.memoryPercent < 50 ? chalk3.green : perf.memoryPercent < 80 ? chalk3.yellow : chalk3.red;
8219
8595
  table.push(
8220
- ["Tokens/sec", tpsColor(`${perf.tokensPerSecond.toFixed(1)} tok/s`)],
8596
+ [
8597
+ "Tokens/sec",
8598
+ perf.tokensPerSecondEstimated ? chalk3.yellow(`${perf.tokensPerSecond.toFixed(1)} tok/s (estimated)`) : tpsColor(`${perf.tokensPerSecond.toFixed(1)} tok/s`)
8599
+ ],
8221
8600
  [
8222
8601
  "First Chunk Latency",
8223
8602
  perf.firstChunkMs !== void 0 ? formatDuration(perf.firstChunkMs) : chalk3.dim("N/A (stream metric unavailable)")
@@ -8232,8 +8611,8 @@ function printPerformanceTable(perf, benchEnvironment) {
8232
8611
  ["Completion Tokens", String(perf.completionTokens)],
8233
8612
  [
8234
8613
  "Model Memory Footprint",
8235
- memColor(
8236
- `${perf.memoryUsedGB.toFixed(1)} GB (+${perf.memoryPercent.toFixed(0)}%)`
8614
+ perf.memoryFootprintAvailable === false ? chalk3.dim("N/A (model already loaded; runtime metric unavailable)") : memColor(
8615
+ `${perf.memoryUsedGB.toFixed(1)} GB (+${perf.memoryPercent.toFixed(0)}%)${perf.memoryFootprintEstimated ? " (estimated)" : ""}`
8237
8616
  )
8238
8617
  ],
8239
8618
  [
@@ -8319,7 +8698,7 @@ function printSummaryTable(results) {
8319
8698
  chalk3.bold("Model"),
8320
8699
  chalk3.bold("tok/s"),
8321
8700
  chalk3.bold("TTFT"),
8322
- chalk3.bold("Host RAM%"),
8701
+ chalk3.bold("Model RAM%"),
8323
8702
  chalk3.bold("Profile"),
8324
8703
  chalk3.bold("HW Fit"),
8325
8704
  chalk3.bold("Quality"),
@@ -8332,17 +8711,23 @@ function printSummaryTable(results) {
8332
8711
  style: { head: [], border: [] },
8333
8712
  wordWrap: true
8334
8713
  });
8714
+ const formatSummaryModelMemory = (result) => {
8715
+ if (result.performance.memoryFootprintAvailable === false) return "N/A";
8716
+ const value = `${result.performance.memoryPercent.toFixed(0)}%`;
8717
+ return result.performance.memoryFootprintEstimated ? `${value}~` : value;
8718
+ };
8335
8719
  for (const r of results) {
8336
8720
  const vColor = r.fitness.verdict === "EXCELLENT" ? chalk3.green.bold : r.fitness.verdict === "GOOD" ? chalk3.blue.bold : r.fitness.verdict === "MARGINAL" ? chalk3.yellow.bold : chalk3.red.bold;
8337
8721
  const flags = [];
8338
8722
  if (r.hardware.powerMode === "low-power") flags.push(chalk3.red("ECO"));
8339
8723
  if (r.modelInfo?.thinkingDetected) flags.push(chalk3.magenta("THINK"));
8340
8724
  const modelName = compact && r.model.length > 20 ? r.model.slice(0, 18) + ".." : r.model;
8725
+ const throughputLabel = r.performance.tokensPerSecondEstimated ? `~${r.performance.tokensPerSecond.toFixed(1)}` : `${r.performance.tokensPerSecond.toFixed(1)}`;
8341
8726
  const row = [
8342
8727
  modelName,
8343
- `${r.performance.tokensPerSecond.toFixed(1)}`,
8728
+ throughputLabel,
8344
8729
  formatDuration(r.performance.ttft),
8345
- r.performance.memoryHostPercent !== void 0 ? `${r.performance.memoryHostPercent.toFixed(0)}%` : "n/a",
8730
+ formatSummaryModelMemory(r),
8346
8731
  r.fitness.tuning.profile,
8347
8732
  scoreColor(r.fitness.hardwareFitScore)(
8348
8733
  `${compactBar(r.fitness.hardwareFitScore)} ${r.fitness.hardwareFitScore}%`
@@ -8367,9 +8752,8 @@ function printSummaryTable(results) {
8367
8752
  // ../src/ui/verdict.ts
8368
8753
  import chalk4 from "chalk";
8369
8754
  var BOX_INNER = 60;
8370
- var ANSI_RE = /\x1b\[[0-9;]*m/g;
8371
8755
  function visibleLength(str) {
8372
- return str.replace(ANSI_RE, "").length;
8756
+ return stripAnsi(str).length;
8373
8757
  }
8374
8758
  function wrapText(text, maxWidth) {
8375
8759
  if (visibleLength(text) <= maxWidth) return [text];
@@ -8739,6 +9123,15 @@ function assertUploaderConfig(config) {
8739
9123
  );
8740
9124
  }
8741
9125
  }
9126
+ function resolveUploadedMemoryPercent(result) {
9127
+ return result.performance.memoryFootprintAvailable === false ? null : result.performance.memoryPercent;
9128
+ }
9129
+ function resolveUploadedModelFormat(result) {
9130
+ if (result.metadata.modelFormat?.trim()) return result.metadata.modelFormat;
9131
+ const runtimeBackend = result.metadata.runtimeBackend ?? "ollama";
9132
+ if (runtimeBackend === "ollama") return "gguf";
9133
+ return "unknown";
9134
+ }
8742
9135
  async function uploadBenchResult(result, options = {}) {
8743
9136
  const config = resolveUploaderConfig();
8744
9137
  assertUploaderConfig(config);
@@ -8751,7 +9144,7 @@ async function uploadBenchResult(result, options = {}) {
8751
9144
  thinking_detected: result.modelInfo?.thinkingDetected ?? null,
8752
9145
  tokens_per_second: result.performance.tokensPerSecond,
8753
9146
  ttft_ms: result.performance.ttft,
8754
- memory_percent: result.performance.memoryHostPercent ?? result.performance.memoryPercent,
9147
+ memory_percent: resolveUploadedMemoryPercent(result),
8755
9148
  thinking_tokens_estimate: result.performance.thinkingTokensEstimate ?? null,
8756
9149
  verdict: result.fitness.verdict,
8757
9150
  global_score: result.fitness.globalScore,
@@ -8768,7 +9161,7 @@ async function uploadBenchResult(result, options = {}) {
8768
9161
  benchmark_spec_version: result.metadata.benchmarkSpecVersion,
8769
9162
  runtime_version: result.metadata.runtimeVersion,
8770
9163
  runtime_backend: result.metadata.runtimeBackend ?? "ollama",
8771
- model_format: result.metadata.modelFormat ?? "gguf",
9164
+ model_format: resolveUploadedModelFormat(result),
8772
9165
  raw_log_hash: result.metadata.rawLogHash,
8773
9166
  result
8774
9167
  };
@@ -9021,6 +9414,7 @@ async function promptSubmitterProfile(deps, defaults = {}) {
9021
9414
  }
9022
9415
  console.log(chalk6.yellow("Nickname must be between 2 and 40 characters."));
9023
9416
  }
9417
+ console.log(chalk6.dim("Your email is never stored \u2014 only a SHA-256 hash is saved to match your leaderboard entries."));
9024
9418
  while (true) {
9025
9419
  const emailHint = defaults.email ? ` [${defaults.email}]` : "";
9026
9420
  const emailAnswer = await ask(`Email${emailHint} > `);
@@ -9180,6 +9574,20 @@ async function promptThinkingMode() {
9180
9574
  });
9181
9575
  }
9182
9576
 
9577
+ // ../src/core/runtime-unavailable.ts
9578
+ function getRuntimeUnavailableHelp(runtimeName, setupHints) {
9579
+ const runtimeDisplayName = getRuntimeDisplayName(runtimeName);
9580
+ const lines = [
9581
+ `MetriLLM is currently set to use ${runtimeDisplayName}.`,
9582
+ `Either start ${runtimeDisplayName}, or switch to another backend in Settings.`
9583
+ ];
9584
+ for (const hint of setupHints) {
9585
+ lines.push(` \u2022 ${hint}`);
9586
+ }
9587
+ lines.push(" \u2022 To change backend: Main Menu -> Settings -> Runtime backend");
9588
+ return lines;
9589
+ }
9590
+
9183
9591
  // ../src/commands/bench.ts
9184
9592
  var BENCHMARK_SPEC_VERSION = "0.2.1";
9185
9593
  var PROMPT_PACK_VERSION = "0.1.0";
@@ -9246,9 +9654,8 @@ async function benchCommand(options) {
9246
9654
  } catch (err) {
9247
9655
  if (!silent) {
9248
9656
  spinnerModels.fail(`Cannot connect to ${runtimeDisplayName}`);
9249
- errorMsg(`Make sure ${runtimeDisplayName} is installed and running.`);
9250
- for (const hint of runtimeSetupHints) {
9251
- errorMsg(` \u2022 ${hint}`);
9657
+ for (const line of getRuntimeUnavailableHelp(runtimeName, runtimeSetupHints)) {
9658
+ errorMsg(line);
9252
9659
  }
9253
9660
  if (err instanceof Error) errorMsg(err.message);
9254
9661
  }
@@ -9308,7 +9715,7 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
9308
9715
  minSuccessfulPrompts: options.perfMinSuccessfulPrompts,
9309
9716
  failOnPromptError: options.perfStrict,
9310
9717
  think: thinkEnabled,
9311
- streamStallTimeoutMs: options.lmStudioStreamStallTimeoutMs
9718
+ streamStallTimeoutMs: options.streamStallTimeoutMs
9312
9719
  });
9313
9720
  const perf = perfResult.metrics;
9314
9721
  const benchEnvironment = perfResult.benchEnvironment;
@@ -9346,13 +9753,22 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
9346
9753
  printVerdict(modelName, fitness);
9347
9754
  }
9348
9755
  const matchedModel = allModels.find((m) => m.name === modelName);
9349
- const modelInfo = matchedModel ? {
9350
- parameterSize: matchedModel.parameterSize,
9351
- quantization: matchedModel.quantization,
9352
- family: matchedModel.family,
9353
- // Persist the configured benchmark mode (not model auto-detection).
9354
- thinkingDetected: thinkEnabled
9355
- } : { thinkingDetected: thinkEnabled };
9756
+ let resolvedModel = matchedModel;
9757
+ if (matchedModel?.modelFormat === void 0) {
9758
+ try {
9759
+ resolvedModel = await resolveRuntimeModel(modelName) ?? matchedModel;
9760
+ } catch {
9761
+ resolvedModel = matchedModel;
9762
+ }
9763
+ }
9764
+ const modelMetadataSource = resolvedModel ?? matchedModel;
9765
+ const modelInfo = modelMetadataSource ? {
9766
+ parameterSize: modelMetadataSource.parameterSize,
9767
+ quantization: modelMetadataSource.quantization,
9768
+ family: modelMetadataSource.family,
9769
+ // Persist actual observed thinking behavior from the benchmark run.
9770
+ thinkingDetected: perfResult.thinkingDetected
9771
+ } : { thinkingDetected: perfResult.thinkingDetected };
9356
9772
  const partialResult = {
9357
9773
  model: modelName,
9358
9774
  modelInfo,
@@ -9367,7 +9783,7 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
9367
9783
  promptPackVersion: PROMPT_PACK_VERSION,
9368
9784
  runtimeVersion,
9369
9785
  runtimeBackend: getRuntimeName(),
9370
- modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat(),
9786
+ modelFormat: resolvedModel?.modelFormat ?? (getRuntimeName() === "ollama" ? getRuntimeModelFormat() : "unknown"),
9371
9787
  benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
9372
9788
  }
9373
9789
  };