metrillm-mcp 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -28,10 +28,22 @@ import { Ollama } from "ollama";
28
28
 
29
29
  // ../src/utils.ts
30
30
  import vm from "vm";
31
- import { execFile } from "child_process";
31
+ import { execFile, spawn } from "child_process";
32
32
/**
 * Opens `url` with the operating system's default handler (best effort).
 *
 * Launch failures are intentionally ignored: the "error" listeners exist only so
 * that a missing launcher binary does not surface as an unhandled "error" event.
 *
 * @param {string} url - Address to hand to the platform launcher.
 * @returns {void}
 */
function openUrl(url) {
  if (process.platform === "win32") {
    // Do not route the URL through `cmd /c start`: cmd.exe re-parses its command
    // line, so characters such as `&`, `|` or `^` inside a URL would truncate the
    // address or be executed as additional commands. rundll32 passes the URL to
    // the registered protocol handler without any shell parsing.
    const launcher = spawn("rundll32", ["url.dll,FileProtocolHandler", url], {
      windowsHide: true,
      stdio: "ignore"
    });
    launcher.on("error", () => {
      // Best effort: opening a browser is optional.
    });
    // Let the parent exit without waiting for the launcher.
    launcher.unref();
    return;
  }
  const cmd = process.platform === "darwin" ? "open" : "xdg-open";
  const child = execFile(cmd, [url]);
  child.on("error", () => {
    // Best effort: opening a browser is optional.
  });
}
36
48
  function avg(nums) {
37
49
  if (nums.length === 0) return 0;
@@ -110,7 +122,8 @@ function stripThinkTags(text) {
110
122
  }
111
123
/**
 * Tells whether a model reply carries reasoning ("thinking") content.
 *
 * True when the separate thinking field holds non-whitespace text, when the
 * response contains a `<think`/`<thinking` tag opener, or when the response
 * starts with a "Thinking Process:" / "Thought Process:" heading.
 *
 * @param {string} response - Visible response text.
 * @param {string} [thinkingField] - Reasoning text reported separately, if any.
 * @returns {boolean}
 */
function hasThinkingContent(response, thinkingField) {
  const fieldHasText = Boolean(thinkingField) && thinkingField.trim().length > 0;
  if (fieldHasText) return true;
  const inlineMarkers = [
    /<think(?:ing)?[\s>]/i,
    /^\s*(?:thinking|thought)\s+process\s*:/i
  ];
  return inlineMarkers.some((marker) => marker.test(response));
}
115
128
  function estimateTokenCount(text) {
116
129
  if (!text) return 0;
@@ -514,7 +527,8 @@ function extractCodeBlock(text, preferredFunctionName) {
514
527
  var client = new Ollama();
515
528
  var DEFAULT_OLLAMA_HOST = "http://127.0.0.1:11434";
516
529
  var OLLAMA_INIT_TIMEOUT_MS = 12e4;
517
- var STREAM_STALL_TIMEOUT_MS = 3e4;
530
+ var DEFAULT_STREAM_STALL_TIMEOUT_MS = 3e4;
531
+ var SHARED_STREAM_STALL_TIMEOUT_ENV = "METRILLM_STREAM_STALL_TIMEOUT_MS";
518
532
  function getOllamaBaseUrl() {
519
533
  const configured = process.env.OLLAMA_HOST?.trim();
520
534
  if (!configured) return DEFAULT_OLLAMA_HOST;
@@ -564,35 +578,81 @@ var defaultKeepAlive;
564
578
  function setDefaultKeepAlive(keepAlive) {
565
579
  defaultKeepAlive = keepAlive;
566
580
  }
581
/**
 * Reports whether the caller set `top_p` or `seed` (any value other than
 * `undefined` counts, including `null` and `0`).
 *
 * @param {object} [options] - Generation options; may be missing.
 * @returns {boolean}
 */
function hasSamplingOverrides(options) {
  if (options == null) return false;
  const { top_p: topP, seed } = options;
  return topP !== undefined || seed !== undefined;
}
584
/**
 * Heuristically decides whether an error says the backend rejected a sampling
 * option: the lower-cased message must name `seed`, `top_p` or `topp` as a whole
 * word AND contain one of the rejection keywords.
 *
 * @param {unknown} err - Error instance or any value (stringified when not an Error).
 * @returns {boolean}
 */
function isUnsupportedSamplingOptionError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  const namesSamplingOption = /\b(seed|top_p|topp)\b/.test(text);
  const soundsLikeRejection = /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(text);
  return namesSamplingOption && soundsLikeRejection;
}
591
/**
 * Parses a string made only of decimal digits into a safe integer.
 *
 * @param {string} value - Text to parse.
 * @returns {number|null} The integer, or `null` when the text is not all digits
 *   or the number is not a safe integer.
 */
function parseNonNegativeInt(value) {
  const isDigitsOnly = /^\d+$/.test(value);
  if (!isDigitsOnly) return null;
  const number = Number.parseInt(value, 10);
  const representable = Number.isSafeInteger(number) && !(number < 0);
  return representable ? number : null;
}
597
/**
 * Picks the stream stall timeout in milliseconds.
 *
 * An explicit `override` wins over the environment variable named by
 * SHARED_STREAM_STALL_TIMEOUT_ENV. In both sources a value of exactly 0 yields
 * `undefined`, and an unusable value falls back to
 * DEFAULT_STREAM_STALL_TIMEOUT_MS.
 *
 * @param {number} [override] - Per-call timeout; truncated to an integer.
 * @returns {number|undefined}
 */
function resolveStreamStallTimeoutMs(override) {
  const hasOverride = override !== undefined;
  if (hasOverride) {
    const usable = Number.isFinite(override) && override >= 0;
    if (!usable) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
    if (override === 0) return undefined;
    return Math.trunc(override);
  }

  const envText = process.env[SHARED_STREAM_STALL_TIMEOUT_ENV]?.trim();
  if (!envText) return DEFAULT_STREAM_STALL_TIMEOUT_MS;

  const fromEnv = parseNonNegativeInt(envText);
  if (fromEnv === null) return DEFAULT_STREAM_STALL_TIMEOUT_MS;
  if (fromEnv === 0) return undefined;
  return fromEnv;
}
608
/**
 * Assembles the streaming request object passed to `client.generate`.
 *
 * `top_p` and `seed` are forwarded only when `includeSampling` is truthy and the
 * caller set them; `think` is forwarded only when defined. Temperature defaults
 * to 0, `num_predict` to 512 and `keep_alive` to the module-level default.
 *
 * @param {string} model
 * @param {string} prompt
 * @param {object} [options]
 * @param {boolean} includeSampling
 * @returns {object}
 */
function buildGenerateRequest(model, prompt, options, includeSampling) {
  const samplerOptions = { temperature: options?.temperature ?? 0 };
  if (includeSampling) {
    if (options?.top_p !== undefined) samplerOptions.top_p = options.top_p;
    if (options?.seed !== undefined) samplerOptions.seed = options.seed;
  }
  samplerOptions.num_predict = options?.num_predict ?? 512;

  const request = {
    model,
    prompt,
    stream: true,
    keep_alive: options?.keep_alive ?? defaultKeepAlive
  };
  if (options?.think !== undefined) request.think = options.think;
  request.options = samplerOptions;
  return request;
}
567
623
  async function generate(model, prompt, options) {
568
624
  return generateStream(model, prompt, void 0, options);
569
625
  }
570
626
  async function generateStream(model, prompt, callbacks, options) {
571
- const stream = await withTimeout(
572
- client.generate({
573
- model,
574
- prompt,
575
- stream: true,
576
- keep_alive: options?.keep_alive ?? defaultKeepAlive,
577
- ...options?.think !== void 0 ? { think: options.think } : {},
578
- options: {
579
- temperature: options?.temperature ?? 0,
580
- num_predict: options?.num_predict ?? 512
581
- }
582
- }),
627
+ const stallTimeoutMs = resolveStreamStallTimeoutMs(options?.stall_timeout_ms);
628
+ let abortedByStallTimeout = false;
629
+ const initializeStream = (includeSampling) => withTimeout(
630
+ client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
583
631
  OLLAMA_INIT_TIMEOUT_MS,
584
632
  "Ollama generate initialization"
585
633
  );
634
+ let stream;
635
+ try {
636
+ stream = await initializeStream(true);
637
+ } catch (err) {
638
+ if (hasSamplingOverrides(options) && isUnsupportedSamplingOptionError(err)) {
639
+ stream = await initializeStream(false);
640
+ } else {
641
+ throw err;
642
+ }
643
+ }
586
644
  let fullResponse = "";
587
645
  let fullThinking = "";
588
646
  let result = null;
589
647
  let firstChunkSeen = false;
590
648
  let stallTimer = null;
591
649
  const resetStallTimer = () => {
650
+ if (stallTimeoutMs === void 0) return;
592
651
  if (stallTimer) clearTimeout(stallTimer);
593
652
  stallTimer = setTimeout(() => {
653
+ abortedByStallTimeout = true;
594
654
  client.abort();
595
- }, STREAM_STALL_TIMEOUT_MS);
655
+ }, stallTimeoutMs);
596
656
  };
597
657
  try {
598
658
  resetStallTimer();
@@ -627,6 +687,9 @@ async function generateStream(model, prompt, callbacks, options) {
627
687
  if (stallTimer) clearTimeout(stallTimer);
628
688
  }
629
689
  if (!result) {
690
+ if (abortedByStallTimeout && stallTimeoutMs !== void 0) {
691
+ throw new Error(`Ollama stream timed out after ${stallTimeoutMs}ms`);
692
+ }
630
693
  throw new Error("Stream ended without done signal");
631
694
  }
632
695
  callbacks?.onDone?.(result);
@@ -656,42 +719,191 @@ function abortOngoingRequests() {
656
719
  import os from "os";
657
720
  import path from "path";
658
721
  import { promises as fs } from "fs";
722
+ import { execFile as execFile2 } from "child_process";
659
723
  var DEFAULT_LM_STUDIO_BASE_URL = "http://127.0.0.1:1234";
660
724
  var LM_STUDIO_INIT_TIMEOUT_MS = 15e3;
661
725
  var LM_STUDIO_METADATA_TIMEOUT_MS = 2e3;
662
- var DEFAULT_STREAM_STALL_TIMEOUT_MS = 18e4;
726
+ var DEFAULT_STREAM_STALL_TIMEOUT_MS2 = 3e4;
727
+ var LM_STUDIO_CLI_TIMEOUT_MS = 8e3;
728
+ var SHARED_STREAM_STALL_TIMEOUT_ENV2 = "METRILLM_STREAM_STALL_TIMEOUT_MS";
663
729
  var DEFAULT_LM_STUDIO_HOME_DIR = path.join(os.homedir(), ".lmstudio");
664
730
  var DEFAULT_LM_STUDIO_MODELS_DIR = path.join(DEFAULT_LM_STUDIO_HOME_DIR, "models");
665
731
  var LM_STUDIO_HOME_DIR_ENV = "LM_STUDIO_HOME_DIR";
666
732
  var LM_STUDIO_MODELS_DIR_ENV = "LM_STUDIO_MODELS_DIR";
733
+ var LM_STUDIO_CLI_PATH_ENV = "LM_STUDIO_CLI_PATH";
667
734
  var defaultKeepAlive2;
668
735
  var activeAbortControllers = /* @__PURE__ */ new Set();
669
736
  var directorySizeCache = /* @__PURE__ */ new Map();
670
737
  var modelDefinitionCache = /* @__PURE__ */ new Map();
671
- function buildThinkingConfig(think) {
672
- if (think === void 0) return {};
673
- const effort = think ? "high" : "low";
738
// Instruction text telling a model to answer without emitting reasoning.
// The four sentences are separated by single spaces.
var NON_THINKING_SYSTEM_PROMPT =
  "You are in non-thinking mode for benchmark reproducibility." +
  " " +
  "Return only the final answer." +
  " " +
  "Do not output internal reasoning, chain-of-thought, or scratchpad." +
  " " +
  "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process.";
744
/**
 * Detects reasoning leaked as plain text: a response that starts with a
 * "Thinking Process:" / "Thought Process:" heading, or that contains a
 * bracketed [THINK] / [/THINK] / [THINKING] / [/THINKING] marker anywhere.
 *
 * @param {string} response
 * @returns {boolean}
 */
function hasThinkingLeakText(response) {
  const opensWithProcessHeading = /^\s*(?:thinking|thought)\s+process\s*:/i;
  if (opensWithProcessHeading.test(response)) return true;
  const bracketedThinkMarker = /\[(?:\/)?THINK(?:ING)?\]/i;
  return bracketedThinkMarker.test(response);
}
747
/**
 * Guards non-thinking runs: when `think` is exactly `false`, throws if the model
 * still produced reasoning (non-blank `reasoning`, a `<think`/`<thinking` tag in
 * the response, or leaked thinking text). Any other `think` value is a no-op.
 *
 * @param {string} model - Model name, quoted in the error message.
 * @param {boolean|undefined} think - Requested thinking mode.
 * @param {string} response - Visible response text.
 * @param {string} reasoning - Reasoning text reported by the backend.
 * @throws {Error} When thinking content is present despite `think === false`.
 */
function assertThinkingModeRespected(model, think, response, reasoning) {
  const nonThinkingRequested = think === false;
  if (!nonThinkingRequested) return;

  const leaked =
    reasoning.trim().length > 0 ||
    /<think(?:ing)?[\s>]/i.test(response) ||
    hasThinkingLeakText(response);
  if (!leaked) return;

  const guidance = [
    `LM Studio model "${model}" still emitted thinking content while non-thinking mode is requested.`,
    "In LM Studio, add this at the top of the model chat template: {%- set enable_thinking = false %}.",
    "If this model does not expose a Prompt/Chat Template editor in LM Studio (e.g. some GPT-OSS builds), non-thinking mode cannot be enforced from the API.",
    "Use --thinking for this model, or benchmark a model/runtime that supports explicit non-thinking control.",
    "Then eject/reload the model and run the benchmark again."
  ];
  throw new Error(guidance.join(" "));
}
761
/**
 * Maps the `think` flag to the request's `reasoning` value: "high" only when
 * `think` is exactly `true`; otherwise `undefined` (field omitted by callers).
 *
 * @param {boolean|undefined} think
 * @returns {"high"|undefined}
 */
function buildNativeThinkingOption(think) {
  return think === true ? "high" : undefined;
}
765
/**
 * True when `options` defines `top_p` or `seed` with anything but `undefined`.
 *
 * @param {object} [options]
 * @returns {boolean}
 */
function hasSamplingOverrides2(options) {
  const samplingKeys = ["top_p", "seed"];
  return samplingKeys.some((key) => options?.[key] !== undefined);
}
768
/**
 * Decides whether an HTTP error looks like a rejected sampling option: the
 * status must be 400 or 422 and the lower-cased body must both name `seed`,
 * `top_p` or `topp` as a whole word and contain a rejection keyword.
 *
 * @param {number} status - HTTP status code.
 * @param {string} text - Response body text.
 * @returns {boolean}
 */
function isUnsupportedSamplingMessage(status, text) {
  const isValidationStatus = status === 400 || status === 422;
  if (!isValidationStatus) return false;
  const haystack = text.toLowerCase();
  return (
    /\b(seed|top_p|topp)\b/.test(haystack) &&
    /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(haystack)
  );
}
775
/**
 * Pulls a human-readable message out of an error response body.
 *
 * When the body is JSON with a non-blank string at `error.message`, that string
 * (trimmed) is returned; in every other case the trimmed body itself is returned.
 *
 * @param {string} body - Raw response body.
 * @returns {string}
 */
function extractLMStudioErrorMessage(body) {
  const raw = body.trim();
  if (raw === "") return "";

  let candidate;
  try {
    candidate = JSON.parse(raw).error?.message;
  } catch {
    // Not JSON (or JSON `null`): fall back to the raw text.
    return raw;
  }

  if (typeof candidate === "string") {
    const cleaned = candidate.trim();
    if (cleaned.length > 0) return cleaned;
  }
  return raw;
}
788
/**
 * Recognizes a model-load failure caused by resource guardrails: the message
 * (case-insensitively) must contain "failed to load model" plus at least one
 * of the resource-related phrases.
 *
 * @param {string} message
 * @returns {boolean}
 */
function isModelLoadGuardrailError(message) {
  const text = message.toLowerCase();
  if (!text.includes("failed to load model")) return false;
  const resourceHints = [
    "insufficient system resources",
    "overload your system",
    "loading guardrails"
  ];
  return resourceHints.some((hint) => text.includes(hint));
}
793
/**
 * Builds the Error for a failed LM Studio HTTP request.
 *
 * A guardrail-related load failure gets a dedicated message with remediation
 * hints; anything else becomes "LM Studio <kind> failed (<status> <statusText>)"
 * followed by the backend message when there is one.
 *
 * @param {string} kind - Operation label used in the generic message.
 * @param {string} model - Model name used in the guardrail message.
 * @param {number} status - HTTP status code.
 * @param {string} statusText - HTTP status text.
 * @param {string} body - Raw response body.
 * @returns {Error}
 */
function buildLMStudioRequestError(kind, model, status, statusText, body) {
  const backendMessage = extractLMStudioErrorMessage(body);

  if (!isModelLoadGuardrailError(backendMessage)) {
    const detail = backendMessage ? ` ${backendMessage}` : "";
    const text = `LM Studio ${kind} failed (${status} ${statusText})${detail}`;
    return new Error(text.trim());
  }

  const parts = [
    `LM Studio could not load model "${model}" due to insufficient system resources (model loading guardrails).`,
    "In LM Studio: unload other models, reduce loaded context length, or relax model loading guardrails in Settings.",
    `Backend error: ${backendMessage}`
  ];
  return new Error(parts.join(" "));
}
807
/**
 * Builds the JSON body for a chat request.
 *
 * `top_p` and `seed` are included only when `includeSampling` is truthy and the
 * caller set them. `reasoning` is included only when a thinking option is
 * produced for `options.think`; the non-thinking system prompt is attached only
 * when `options.think` is exactly `false`.
 *
 * @param {string} model
 * @param {string} prompt - Sent as the `input` field.
 * @param {object} [options]
 * @param {boolean} stream
 * @param {boolean} includeSampling
 * @returns {object}
 */
function buildNativeChatBody(model, prompt, options, stream, includeSampling) {
  const reasoning = buildNativeThinkingOption(options?.think);

  const body = {
    model,
    input: prompt,
    temperature: options?.temperature ?? 0
  };
  if (includeSampling) {
    if (options?.top_p !== undefined) body.top_p = options.top_p;
    if (options?.seed !== undefined) body.seed = options.seed;
  }
  body.max_tokens = options?.num_predict ?? 512;
  body.stream = stream;
  if (reasoning !== undefined) body.reasoning = reasoning;
  if (options?.think === false) body.system_prompt = NON_THINKING_SYSTEM_PROMPT;
  return body;
}
680
- function parseNonNegativeInt(value) {
821
/**
 * Returns `value` when it is a finite, non-negative number; otherwise `undefined`.
 *
 * @param {unknown} value
 * @returns {number|undefined}
 */
function getNativeStatNumber(value) {
  const usable = typeof value === "number" && Number.isFinite(value) && value >= 0;
  return usable ? value : undefined;
}
825
/**
 * Reduces a possibly nested text value to a plain string.
 *
 * Strings are returned as-is, arrays are flattened element by element and
 * concatenated, and objects yield the first non-empty result among their
 * `text`, `content`, `delta` and `value` properties (in that order). Anything
 * else, nullish input, or recursion deeper than three levels yields "".
 *
 * @param {unknown} value
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {string}
 */
function flattenNativeText(value, depth = 0) {
  if (depth > 3 || value == null) return "";

  switch (typeof value) {
    case "string":
      return value;
    case "object":
      break;
    default:
      return "";
  }

  const nextDepth = depth + 1;
  if (Array.isArray(value)) {
    let joined = "";
    for (const item of value) {
      joined += flattenNativeText(item, nextDepth);
    }
    return joined;
  }

  for (const key of ["text", "content", "delta", "value"]) {
    const found = flattenNativeText(value[key], nextDepth);
    if (found) return found;
  }
  return "";
}
837
/**
 * Splits an `output` array into response text and reasoning text.
 *
 * Each object item contributes its flattened, trimmed `text` (or `content` when
 * `text` is nullish). Items whose lower-cased `type` contains "reason" go to
 * `reasoning`; all others go to `response`. Pieces are concatenated without a
 * separator. Non-array input yields two empty strings.
 *
 * @param {unknown} output4
 * @returns {{response: string, reasoning: string}}
 */
function collectNativeOutput(output4) {
  const collected = { response: "", reasoning: "" };
  if (!Array.isArray(output4)) return collected;

  for (const entry of output4) {
    if (entry === null || typeof entry !== "object") continue;
    const body = flattenNativeText(entry.text ?? entry.content).trim();
    if (!body) continue;
    const kind = asNonEmptyString(entry.type)?.toLowerCase() ?? "";
    const bucket = kind.includes("reason") ? "reasoning" : "response";
    collected[bucket] += body;
  }
  return collected;
}
857
/**
 * Finds the stats object of a response payload: a truthy top-level `stats`
 * wins, otherwise `result.stats` is used.
 *
 * @param {unknown} payload
 * @returns {object|undefined} `undefined` when the payload is not an object.
 */
function extractNativeStats(payload) {
  if (payload === null || typeof payload !== "object") return undefined;
  const topLevel = payload.stats;
  return topLevel || payload.result?.stats;
}
864
/**
 * Extracts response and reasoning text from a complete (non-streamed) payload.
 * `result.output` is preferred; the top-level `output` is used only when the
 * nested one yields no text at all.
 *
 * @param {unknown} payload
 * @returns {{response: string, reasoning: string}}
 */
function extractNativeResponse(payload) {
  const isObject = typeof payload === "object" && payload !== null;
  if (!isObject) {
    return { response: "", reasoning: "" };
  }
  const nested = collectNativeOutput(payload.result?.output);
  const nestedHasText = Boolean(nested.response || nested.reasoning);
  return nestedHasText ? nested : collectNativeOutput(payload.output);
}
874
/**
 * Extracts the text carried by one streamed event.
 *
 * The text is the first non-empty value among the flattened `delta`, `content`
 * and `text` properties. It is reported as `reasoning` when the lower-cased
 * event `type` contains "reason", and as `response` for every other type.
 *
 * The earlier separate check for "message"/"text"/"content" types returned the
 * same value as the fall-through, so it has been folded into the default case.
 *
 * @param {unknown} payload - Parsed stream event.
 * @returns {{response: string, reasoning: string}}
 */
function extractNativeDelta(payload) {
  if (typeof payload !== "object" || payload === null) {
    return { response: "", reasoning: "" };
  }
  const type = asNonEmptyString(payload.type)?.toLowerCase() ?? "";
  const text =
    flattenNativeText(payload.delta) ||
    flattenNativeText(payload.content) ||
    flattenNativeText(payload.text);
  if (!text) {
    return { response: "", reasoning: "" };
  }
  return type.includes("reason")
    ? { response: "", reasoning: text }
    : { response: text, reasoning: "" };
}
892
/**
 * Converts an all-digits string into a safe integer, or returns `null` when the
 * text contains anything else or the number is not a safe integer.
 *
 * @param {string} value
 * @returns {number|null}
 */
function parseNonNegativeInt2(value) {
  if (/^\d+$/.test(value)) {
    const candidate = Number.parseInt(value, 10);
    if (Number.isSafeInteger(candidate) && candidate >= 0) return candidate;
  }
  return null;
}
686
- function resolveStreamStallTimeoutMs(override) {
898
/**
 * Resolves the stream stall timeout (milliseconds).
 *
 * Precedence: explicit `override`, then the environment variable named by
 * SHARED_STREAM_STALL_TIMEOUT_ENV2, then DEFAULT_STREAM_STALL_TIMEOUT_MS2.
 * A value of exactly 0 from either source yields `undefined`; an unusable value
 * yields the default.
 *
 * @param {number} [override] - Per-call timeout; truncated to an integer.
 * @returns {number|undefined}
 */
function resolveStreamStallTimeoutMs2(override) {
  if (override === undefined) {
    const envText = process.env[SHARED_STREAM_STALL_TIMEOUT_ENV2]?.trim();
    const fromEnv = envText ? parseNonNegativeInt2(envText) : null;
    if (fromEnv === null) return DEFAULT_STREAM_STALL_TIMEOUT_MS2;
    return fromEnv === 0 ? undefined : fromEnv;
  }

  const usable = Number.isFinite(override) && override >= 0;
  if (!usable) return DEFAULT_STREAM_STALL_TIMEOUT_MS2;
  if (override === 0) return undefined;
  return Math.trunc(override);
}
697
909
  function getLMStudioBaseUrl() {
@@ -714,25 +926,29 @@ function getLMStudioHeaders() {
714
926
  }
715
927
  return headers;
716
928
  }
717
- function extractUsage(payload) {
718
- if (typeof payload !== "object" || payload === null) return void 0;
719
- const usage = payload.usage;
720
- if (!usage) return void 0;
721
- return usage;
722
- }
723
- function extractChoice2(payload) {
724
- if (typeof payload !== "object" || payload === null) return void 0;
725
- const choices = payload.choices;
726
- if (!choices || choices.length === 0) return void 0;
727
- return choices[0];
728
- }
729
- function extractContent(choice) {
730
- const content = choice?.delta?.content ?? choice?.message?.content;
731
- return typeof content === "string" ? content : "";
929
/**
 * Normalizes a reported token count: positive finite numbers are truncated to
 * an integer, everything else becomes 0.
 *
 * @param {unknown} value
 * @returns {number}
 */
function getUsageTokenCount(value) {
  const isCountable = typeof value === "number" && Number.isFinite(value) && value > 0;
  return isCountable ? Math.trunc(value) : 0;
}
934
/**
 * Estimates a token count for text when the backend reported none.
 *
 * Takes the larger of (a) `estimateTokenCount` on the trimmed text and (b) one
 * token per Han/Hiragana/Katakana/Hangul character plus one token per four
 * remaining non-whitespace characters. Non-blank text always counts as at least
 * one token; blank text counts as zero.
 *
 * @param {string} text
 * @returns {number}
 */
function estimateCompletionTokensFallback(text) {
  const trimmedText = text.trim();
  if (trimmedText === "") return 0;

  const cjkChar = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/gu;
  const cjkChars = trimmedText.match(cjkChar)?.length ?? 0;
  const remainingChars = trimmedText.replace(cjkChar, "").replace(/\s+/g, "").length;
  const charBasedEstimate = cjkChars + Math.ceil(remainingChars / 4);

  return Math.max(1, estimateTokenCount(trimmedText), charBasedEstimate);
}
733
- function extractReasoning(choice) {
734
- const reasoning = choice?.delta?.reasoning_content ?? choice?.delta?.reasoning ?? choice?.message?.reasoning_content ?? choice?.message?.reasoning;
735
- return typeof reasoning === "string" ? reasoning : "";
948
/**
 * Chooses the completion token count: the backend-reported count when it
 * normalizes to a positive number, otherwise an estimate computed from the
 * reasoning and response text joined by a single space.
 *
 * @param {unknown} reportedTokenCount
 * @param {string} response
 * @param {string} reasoning
 * @returns {number}
 */
function resolveCompletionTokenCount(reportedTokenCount, response, reasoning) {
  const backendCount = getUsageTokenCount(reportedTokenCount);
  return backendCount > 0
    ? backendCount
    : estimateCompletionTokensFallback(`${reasoning} ${response}`);
}
737
953
  function asNonEmptyString(value) {
738
954
  if (typeof value !== "string") return void 0;
@@ -761,7 +977,7 @@ async function pathIsDirectory(targetPath) {
761
977
  try {
762
978
  const stat = await fs.stat(targetPath);
763
979
  return stat.isDirectory();
764
- } catch {
980
+ } catch (_err) {
765
981
  return false;
766
982
  }
767
983
  }
@@ -982,11 +1198,17 @@ async function resolveLocalModelMetadata(modelId, apiModel, modelsRootDir) {
982
1198
  const size = await readDirectorySizeBytes(source.fullPath);
983
1199
  if (size > bestSize) bestSize = size;
984
1200
  if (size > 0) {
985
- return { size, parameterSize: definition.parameterSize };
1201
+ return {
1202
+ size,
1203
+ parameterSize: definition.parameterSize
1204
+ };
986
1205
  }
987
1206
  }
988
1207
  if (bestSize > 0) {
989
- return { size: bestSize, parameterSize: definition.parameterSize };
1208
+ return {
1209
+ size: bestSize,
1210
+ parameterSize: definition.parameterSize
1211
+ };
990
1212
  }
991
1213
  const fallback = await resolvePublisherModelMetadata(modelId, apiModel, modelsRootDir);
992
1214
  if (fallback.size > 0) {
@@ -995,7 +1217,10 @@ async function resolveLocalModelMetadata(modelId, apiModel, modelsRootDir) {
995
1217
  parameterSize: definition.parameterSize ?? fallback.parameterSize
996
1218
  };
997
1219
  }
998
- return { size: 0, parameterSize: definition.parameterSize ?? fallback.parameterSize };
1220
+ return {
1221
+ size: 0,
1222
+ parameterSize: definition.parameterSize ?? fallback.parameterSize
1223
+ };
999
1224
  }
1000
1225
  function parseSizeBytes(model) {
1001
1226
  if (!model) return 0;
@@ -1033,6 +1258,21 @@ function inferParameterSizeFromModelId(modelId) {
1033
1258
  }
1034
1259
  return void 0;
1035
1260
  }
1261
/**
 * Resolves a model's format from the API record's `compatibility_type`.
 * The second and third parameters are accepted but not used.
 *
 * @param {object} [apiModel]
 * @param {object} [_localMetadata] - Unused.
 * @param {string} [_modelId] - Unused.
 * @returns {string|undefined}
 */
function resolveModelFormat(apiModel, _localMetadata, _modelId) {
  const reportedFormat = apiModel?.compatibility_type;
  return asNonEmptyString(reportedFormat);
}
1264
/**
 * Combines API data and local metadata into one model listing entry.
 *
 * Size prefers a positive API-reported size, then local metadata, then 0.
 * Parameter size prefers local metadata, then a value inferred from the id.
 * Family is the first non-empty of `arch`, `type`, `publisher`.
 *
 * @param {string} id - Model id; becomes `name`.
 * @param {object} [apiModel] - Record from the models API, if any.
 * @param {object} [localMetadata] - Locally resolved metadata, if any.
 * @returns {object}
 */
function buildModelEntry(id, apiModel, localMetadata) {
  const reportedSize = parseSizeBytes(apiModel);
  const size = reportedSize > 0 ? reportedSize : (localMetadata?.size ?? 0);
  const parameterSize = localMetadata?.parameterSize ?? inferParameterSizeFromModelId(id);
  const quantization = asNonEmptyString(apiModel?.quantization);
  const runtimeStatus = asNonEmptyString(apiModel?.state);
  const modelFormat = resolveModelFormat(apiModel, localMetadata, id);
  const family =
    asNonEmptyString(apiModel?.arch) ??
    asNonEmptyString(apiModel?.type) ??
    asNonEmptyString(apiModel?.publisher);

  return {
    name: id,
    size,
    parameterSize,
    quantization,
    runtimeStatus,
    modelFormat,
    family
  };
}
1036
1276
  function isLoadedState(state) {
1037
1277
  if (!state) return false;
1038
1278
  const normalized = state.trim().toLowerCase();
@@ -1040,6 +1280,128 @@ function isLoadedState(state) {
1040
1280
  if (normalized === "loaded" || normalized === "ready") return true;
1041
1281
  return normalized.includes("loaded");
1042
1282
  }
1283
/**
 * Promise wrapper around `execFile` that captures both output streams.
 *
 * Runs with the current environment, a 1 MiB output buffer and the given
 * timeout. On failure the rejection error carries the captured `stdout` and
 * `stderr` as properties.
 *
 * @param {string} cmd - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {number} timeoutMs - Timeout in milliseconds.
 * @returns {Promise<{stdout: string, stderr: string}>}
 */
function execFileText(cmd, args, timeoutMs) {
  const execOptions = {
    timeout: timeoutMs,
    maxBuffer: 1024 * 1024,
    env: process.env
  };
  return new Promise((resolve, reject) => {
    const onExit = (err, stdout, stderr) => {
      if (!err) {
        resolve({ stdout, stderr });
        return;
      }
      // Attach whatever was captured so callers can still inspect the output.
      err.stdout = stdout;
      err.stderr = stderr;
      reject(err);
    };
    execFile2(cmd, args, execOptions, onExit);
  });
}
1306
/**
 * True when `err` is an Error whose `code` is "ENOENT".
 *
 * @param {unknown} err
 * @returns {boolean}
 */
function isCommandMissingError(err) {
  if (!(err instanceof Error)) return false;
  if (!("code" in err)) return false;
  return err.code === "ENOENT";
}
1309
/**
 * Runs the `lms` command-line tool with `args`.
 *
 * Tries, in order and without repeats: the path from the environment variable
 * named by LM_STUDIO_CLI_PATH_ENV, the bare command `lms`, and
 * `<LM Studio home>/bin/lms`. A candidate that is missing (ENOENT) is skipped;
 * any other failure is rethrown immediately.
 *
 * @param {string[]} args
 * @returns {Promise<{stdout: string, stderr: string}>}
 * @throws The last ENOENT error when no candidate could be started.
 */
async function runLmsCli(args) {
  const explicitPath = asNonEmptyString(process.env[LM_STUDIO_CLI_PATH_ENV]);
  const bundledPath = path.join(getLMStudioHomeDir(), "bin", "lms");
  // A Set keeps first-seen order while dropping duplicate candidates.
  const executables = [...new Set([explicitPath, "lms", bundledPath].filter(Boolean))];

  let failure;
  for (const executable of executables) {
    try {
      return await execFileText(executable, args, LM_STUDIO_CLI_TIMEOUT_MS);
    } catch (err) {
      if (!isCommandMissingError(err)) throw err;
      failure = err;
    }
  }
  throw failure ?? new Error("LM Studio CLI is not available.");
}
1331
/**
 * Normalizes a CLI identifier for comparison: nullish becomes "", otherwise the
 * value is trimmed and lower-cased.
 *
 * @param {string|null|undefined} value
 * @returns {string}
 */
function normalizeCliToken(value) {
  const text = value == null ? "" : value;
  return text.trim().toLowerCase();
}
1334
/**
 * Checks whether a loaded-model CLI entry refers to `model`, comparing the
 * normalized model name against the entry's `identifier`,
 * `indexedModelIdentifier`, `path` and `modelKey`.
 *
 * @param {object} entry - One entry from the loaded-models listing.
 * @param {string} model - Model name to look for.
 * @returns {boolean} `false` when the model name normalizes to an empty string.
 */
function matchesLoadedModelCliEntry(entry, model) {
  const wanted = normalizeCliToken(model);
  if (!wanted) return false;
  for (const field of ["identifier", "indexedModelIdentifier", "path", "modelKey"]) {
    if (normalizeCliToken(entry[field]) === wanted) return true;
  }
  return false;
}
1344
/**
 * Lists loaded models by running `lms ps --json` and parsing its stdout.
 *
 * @returns {Promise<Array>} The parsed array, or an empty array when the JSON
 *   is not an array.
 * @throws When the CLI call fails or stdout is not valid JSON.
 */
async function listLoadedModelsFromCli() {
  const cliResult = await runLmsCli(["ps", "--json"]);
  const decoded = JSON.parse(cliResult.stdout);
  if (!Array.isArray(decoded)) return [];
  return decoded;
}
1349
/**
 * Parses an "Estimated Total Memory: <number> <unit>" line into bytes.
 *
 * Binary units (KiB..TiB) use powers of 1024 and decimal units (KB..TB) use
 * powers of 1000; matching is case-insensitive.
 *
 * @param {string} output4 - Text to search.
 * @returns {number|null} Rounded byte count, or `null` when no positive
 *   estimate is found.
 */
function parseEstimatedBytes(output4) {
  const found = output4.match(/Estimated Total Memory:\s*([0-9]+(?:\.[0-9]+)?)\s*(KiB|MiB|GiB|TiB|KB|MB|GB|TB)/i);
  if (!found) return null;

  const amount = Number.parseFloat(found[1] ?? "");
  if (!Number.isFinite(amount) || amount <= 0) return null;

  const bytesPerUnit = new Map([
    ["KIB", 1024],
    ["MIB", 1024 ** 2],
    ["GIB", 1024 ** 3],
    ["TIB", 1024 ** 4],
    ["KB", 1e3],
    ["MB", 1e3 ** 2],
    ["GB", 1e3 ** 3],
    ["TB", 1e3 ** 4]
  ]);
  const factor = bytesPerUnit.get((found[2] ?? "").toUpperCase());
  if (!factor) return null;
  return Math.round(amount * factor);
}
1369
/**
 * Estimates the memory footprint of a currently loaded model via the CLI.
 *
 * Looks the model up in the loaded-models listing, then runs
 * `lms load --estimate-only -y [--context-length N] <key>` for each distinct
 * key of the entry (`path`, `indexedModelIdentifier`, `modelKey`) until one run
 * prints a parsable estimate. Output attached to a failed run is parsed too.
 *
 * @param {string} model - Model name to look up.
 * @returns {Promise<number|null>} Estimated bytes, or `null` when the model is
 *   not listed, the listing fails, or no run yields an estimate.
 */
async function estimateLoadedModelMemoryBytes(model) {
  let loadedEntry;
  try {
    const loadedModels = await listLoadedModelsFromCli();
    loadedEntry = loadedModels.find((entry) => matchesLoadedModelCliEntry(entry, model));
  } catch {
    // Listing is best effort; treat any failure as "not loaded".
    loadedEntry = undefined;
  }
  if (!loadedEntry) return null;

  const modelKeys = [
    loadedEntry.path,
    loadedEntry.indexedModelIdentifier,
    loadedEntry.modelKey
  ].filter((key, position, all) => Boolean(key?.trim()) && all.indexOf(key) === position);

  // The context length is the same for every attempt, so build it once.
  const contextLength = loadedEntry.contextLength;
  const hasContextLength =
    typeof contextLength === "number" && Number.isFinite(contextLength) && contextLength > 0;
  const contextArgs = hasContextLength
    ? ["--context-length", String(Math.trunc(contextLength))]
    : [];

  for (const modelKey of modelKeys) {
    const cliArgs = ["load", "--estimate-only", "-y", ...contextArgs, modelKey];
    let combinedOutput;
    try {
      const { stdout, stderr } = await runLmsCli(cliArgs);
      combinedOutput = `${stdout}\n${stderr}`;
    } catch (err) {
      combinedOutput =
        err instanceof Error ? `${String(err.stdout ?? "")}\n${String(err.stderr ?? "")}` : "";
    }
    const estimated = parseEstimatedBytes(combinedOutput);
    if (estimated !== null) return estimated;
  }
  return null;
}
1043
1405
  async function fetchApiModels() {
1044
1406
  try {
1045
1407
  const resp = await fetchWithTimeout(
@@ -1092,7 +1454,7 @@ async function getLMStudioVersion() {
1092
1454
  const localVersion = await resolveLocalLMStudioVersion();
1093
1455
  try {
1094
1456
  const resp = await fetchWithTimeout(
1095
- "/v1/models",
1457
+ "/api/v1/models",
1096
1458
  { method: "GET", headers: getLMStudioHeaders() },
1097
1459
  5e3,
1098
1460
  "LM Studio version check"
@@ -1107,7 +1469,7 @@ async function getLMStudioVersion() {
1107
1469
  }
1108
1470
  async function listModels2() {
1109
1471
  const resp = await fetchWithTimeout(
1110
- "/v1/models",
1472
+ "/api/v1/models",
1111
1473
  { method: "GET", headers: getLMStudioHeaders() },
1112
1474
  LM_STUDIO_INIT_TIMEOUT_MS,
1113
1475
  "LM Studio list models"
@@ -1125,25 +1487,25 @@ async function listModels2() {
1125
1487
  apiById.set(id, model);
1126
1488
  }
1127
1489
  const modelsRootDir = await resolveModelsRootDir();
1128
- const localMetadataById = /* @__PURE__ */ new Map();
1129
- for (const id of ids) {
1130
- const localMetadata = await resolveLocalModelMetadata(id, apiById.get(id), modelsRootDir);
1131
- localMetadataById.set(id, localMetadata);
1132
- }
1133
- return ids.map((id) => {
1134
- const apiModel = apiById.get(id);
1135
- const localMetadata = localMetadataById.get(id);
1136
- const apiSize = parseSizeBytes(apiModel);
1137
- return {
1138
- name: id,
1139
- size: apiSize > 0 ? apiSize : localMetadata?.size ?? 0,
1140
- parameterSize: localMetadata?.parameterSize ?? inferParameterSizeFromModelId(id),
1141
- quantization: asNonEmptyString(apiModel?.quantization),
1142
- runtimeStatus: asNonEmptyString(apiModel?.state),
1143
- modelFormat: asNonEmptyString(apiModel?.compatibility_type),
1144
- family: asNonEmptyString(apiModel?.arch) ?? asNonEmptyString(apiModel?.type) ?? asNonEmptyString(apiModel?.publisher)
1145
- };
1146
- });
1490
+ const localMetadataEntries = await Promise.all(
1491
+ ids.map(async (id) => {
1492
+ const localMetadata = await resolveLocalModelMetadata(id, apiById.get(id), modelsRootDir);
1493
+ return [id, localMetadata];
1494
+ })
1495
+ );
1496
+ const localMetadataById = new Map(
1497
+ localMetadataEntries
1498
+ );
1499
+ return ids.map((id) => buildModelEntry(id, apiById.get(id), localMetadataById.get(id)));
1500
+ }
1501
/**
 * Resolves the listing entry for a single model id.
 *
 * The API model list and the local models root directory do not depend on each
 * other, so both lookups are started together instead of one after the other.
 *
 * @param {string} modelId - Model id; surrounding whitespace is ignored.
 * @returns {Promise<object|null>} The model entry, or `null` for a blank id.
 */
async function resolveModel(modelId) {
  const id = modelId.trim();
  if (!id) return null;
  const [apiModels, modelsRootDir] = await Promise.all([
    fetchApiModels(),
    resolveModelsRootDir()
  ]);
  const apiModel = apiModels?.find((candidate) => asNonEmptyString(candidate.id) === id);
  const localMetadata = await resolveLocalModelMetadata(id, apiModel, modelsRootDir);
  return buildModelEntry(id, apiModel, localMetadata);
}
1148
1510
  async function listRunningModels2() {
1149
1511
  const apiModels = await fetchApiModels();
@@ -1164,39 +1526,54 @@ async function generate2(model, prompt, options) {
1164
1526
  activeAbortControllers.add(controller);
1165
1527
  try {
1166
1528
  const baseUrl = getLMStudioBaseUrl();
1167
- const url = new URL("/v1/chat/completions", baseUrl);
1168
- const resp = await fetch(url, {
1529
+ const url = new URL("/api/v1/chat", baseUrl);
1530
+ const doRequest = (includeSampling) => fetch(url, {
1169
1531
  method: "POST",
1170
1532
  headers: getLMStudioHeaders(),
1171
- body: JSON.stringify({
1172
- model,
1173
- messages: [{ role: "user", content: prompt }],
1174
- temperature: options?.temperature ?? 0,
1175
- max_tokens: options?.num_predict ?? 512,
1176
- stream: false,
1177
- ...buildThinkingConfig(options?.think)
1178
- }),
1533
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, includeSampling)),
1179
1534
  signal: controller.signal
1180
1535
  });
1536
+ let resp = await doRequest(true);
1181
1537
  if (!resp.ok) {
1182
1538
  const body = await resp.text().catch(() => "");
1183
- throw new Error(`LM Studio generate failed (${resp.status} ${resp.statusText}) ${body}`.trim());
1539
+ if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
1540
+ resp = await doRequest(false);
1541
+ } else {
1542
+ throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
1543
+ }
1544
+ }
1545
+ if (!resp.ok) {
1546
+ const body = await resp.text().catch(() => "");
1547
+ throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
1184
1548
  }
1185
1549
  const payload = await resp.json();
1186
- const choice = extractChoice2(payload);
1187
- const response = extractContent(choice);
1188
- const reasoning = extractReasoning(choice);
1189
- const usage = extractUsage(payload);
1550
+ const nativeResponse = extractNativeResponse(payload);
1551
+ const response = nativeResponse.response;
1552
+ const reasoning = nativeResponse.reasoning;
1553
+ assertThinkingModeRespected(model, options?.think, response, reasoning);
1554
+ const stats = extractNativeStats(payload);
1190
1555
  const totalDuration = Math.max(0, Date.now() - start) * 1e6;
1556
+ const outputTokens = getUsageTokenCount(stats?.total_output_tokens) || resolveCompletionTokenCount(void 0, response, reasoning);
1557
+ const throughput = getNativeStatNumber(stats?.tokens_per_second);
1558
+ const timeToFirstTokenSeconds = getNativeStatNumber(stats?.time_to_first_token_seconds);
1559
+ const modelLoadTimeSeconds = getNativeStatNumber(stats?.model_load_time_seconds);
1560
+ const evalCountEstimated = getUsageTokenCount(stats?.total_output_tokens) <= 0;
1561
+ const evalDuration = throughput !== void 0 && throughput > 0 && outputTokens > 0 ? Math.max(1, Math.round(outputTokens / throughput * 1e9)) : totalDuration;
1562
+ const promptEvalDuration = timeToFirstTokenSeconds !== void 0 ? Math.max(0, Math.round(timeToFirstTokenSeconds * 1e9)) : 0;
1563
+ const loadDuration = Math.max(
1564
+ 0,
1565
+ Math.round((modelLoadTimeSeconds ?? 0) * 1e9)
1566
+ );
1191
1567
  return {
1192
1568
  response,
1193
1569
  ...reasoning ? { thinking: reasoning } : {},
1194
1570
  totalDuration,
1195
- loadDuration: 0,
1196
- promptEvalCount: usage?.prompt_tokens ?? 0,
1197
- promptEvalDuration: 0,
1198
- evalCount: usage?.completion_tokens ?? 0,
1199
- evalDuration: totalDuration
1571
+ loadDuration,
1572
+ promptEvalCount: getUsageTokenCount(stats?.input_tokens),
1573
+ promptEvalDuration,
1574
+ evalCount: outputTokens,
1575
+ evalDuration,
1576
+ ...evalCountEstimated ? { evalCountEstimated: true } : {}
1200
1577
  };
1201
1578
  } catch (err) {
1202
1579
  if (err instanceof Error && err.name === "AbortError") {
@@ -1211,10 +1588,10 @@ async function generateStream2(model, prompt, callbacks, options) {
1211
1588
  const start = Date.now();
1212
1589
  const controller = new AbortController();
1213
1590
  activeAbortControllers.add(controller);
1214
- const stallTimeoutMs = resolveStreamStallTimeoutMs(options?.stall_timeout_ms);
1591
+ const stallTimeoutMs = resolveStreamStallTimeoutMs2(options?.stall_timeout_ms);
1215
1592
  let abortedByStallTimeout = false;
1216
1593
  const baseUrl = getLMStudioBaseUrl();
1217
- const url = new URL("/v1/chat/completions", baseUrl);
1594
+ const url = new URL("/api/v1/chat", baseUrl);
1218
1595
  let stallTimer = null;
1219
1596
  const resetStallTimer = () => {
1220
1597
  if (stallTimeoutMs === void 0) return;
@@ -1226,23 +1603,24 @@ async function generateStream2(model, prompt, callbacks, options) {
1226
1603
  };
1227
1604
  try {
1228
1605
  resetStallTimer();
1229
- const resp = await fetch(url, {
1606
+ const doRequest = (includeSampling) => fetch(url, {
1230
1607
  method: "POST",
1231
1608
  headers: getLMStudioHeaders(),
1232
- body: JSON.stringify({
1233
- model,
1234
- messages: [{ role: "user", content: prompt }],
1235
- temperature: options?.temperature ?? 0,
1236
- max_tokens: options?.num_predict ?? 512,
1237
- stream: true,
1238
- stream_options: { include_usage: true },
1239
- ...buildThinkingConfig(options?.think)
1240
- }),
1609
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, includeSampling)),
1241
1610
  signal: controller.signal
1242
1611
  });
1612
+ let resp = await doRequest(true);
1613
+ if (!resp.ok) {
1614
+ const body = await resp.text().catch(() => "");
1615
+ if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
1616
+ resp = await doRequest(false);
1617
+ } else {
1618
+ throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
1619
+ }
1620
+ }
1243
1621
  if (!resp.ok) {
1244
1622
  const body = await resp.text().catch(() => "");
1245
- throw new Error(`LM Studio stream failed (${resp.status} ${resp.statusText}) ${body}`.trim());
1623
+ throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
1246
1624
  }
1247
1625
  if (!resp.body) {
1248
1626
  throw new Error("LM Studio stream response body is empty");
@@ -1253,10 +1631,10 @@ async function generateStream2(model, prompt, callbacks, options) {
1253
1631
  let doneReceived = false;
1254
1632
  let fullResponse = "";
1255
1633
  let fullThinking = "";
1256
- let usage;
1634
+ let stats;
1257
1635
  let firstChunkSeen = false;
1258
- let firstTokenTime = null;
1259
- let lastTokenTime = null;
1636
+ let firstGeneratedTokenTime = null;
1637
+ let lastGeneratedTokenTime = null;
1260
1638
  const processDataLine = (rawLine) => {
1261
1639
  const line = rawLine.trim();
1262
1640
  if (!line.startsWith("data:")) return;
@@ -1272,18 +1650,27 @@ async function generateStream2(model, prompt, callbacks, options) {
1272
1650
  } catch {
1273
1651
  return;
1274
1652
  }
1275
- const choice = extractChoice2(payload);
1276
- const content = extractContent(choice);
1277
- const reasoning = extractReasoning(choice);
1278
- const chunkUsage = extractUsage(payload);
1279
- if (chunkUsage) usage = chunkUsage;
1653
+ const delta = extractNativeDelta(payload);
1654
+ const content = delta.response;
1655
+ const reasoning = delta.reasoning;
1656
+ const chunkStats = extractNativeStats(payload);
1657
+ if (chunkStats) stats = chunkStats;
1658
+ const aggregate = extractNativeResponse(payload);
1659
+ if (aggregate.response) {
1660
+ fullResponse = aggregate.response;
1661
+ }
1662
+ if (aggregate.reasoning) {
1663
+ fullThinking = aggregate.reasoning;
1664
+ }
1665
+ if (reasoning || content) {
1666
+ const now = Date.now();
1667
+ if (firstGeneratedTokenTime === null) firstGeneratedTokenTime = now;
1668
+ lastGeneratedTokenTime = now;
1669
+ }
1280
1670
  if (reasoning) {
1281
1671
  fullThinking += reasoning;
1282
1672
  }
1283
1673
  if (content) {
1284
- const now = Date.now();
1285
- if (firstTokenTime === null) firstTokenTime = now;
1286
- lastTokenTime = now;
1287
1674
  fullResponse += content;
1288
1675
  callbacks?.onToken?.(content);
1289
1676
  }
@@ -1303,6 +1690,14 @@ async function generateStream2(model, prompt, callbacks, options) {
1303
1690
  processDataLine(rawLine);
1304
1691
  }
1305
1692
  }
1693
+ buffered += decoder.decode();
1694
+ if (buffered.length > 0) {
1695
+ const lines = buffered.split("\n");
1696
+ buffered = lines.pop() ?? "";
1697
+ for (const rawLine of lines) {
1698
+ processDataLine(rawLine);
1699
+ }
1700
+ }
1306
1701
  if (buffered.trim().length > 0) {
1307
1702
  processDataLine(buffered);
1308
1703
  }
@@ -1311,17 +1706,27 @@ async function generateStream2(model, prompt, callbacks, options) {
1311
1706
  throw new Error("LM Studio stream ended without content");
1312
1707
  }
1313
1708
  const totalDuration = Math.max(0, Date.now() - start) * 1e6;
1314
- const evalDurationMs = firstTokenTime !== null && lastTokenTime !== null && lastTokenTime > firstTokenTime ? lastTokenTime - firstTokenTime : Date.now() - start;
1709
+ const outputTokens = getUsageTokenCount(stats?.total_output_tokens) || resolveCompletionTokenCount(void 0, fullResponse, fullThinking);
1710
+ const throughput = getNativeStatNumber(stats?.tokens_per_second);
1711
+ const timeToFirstTokenSeconds = getNativeStatNumber(stats?.time_to_first_token_seconds);
1712
+ const modelLoadTimeSeconds = getNativeStatNumber(stats?.model_load_time_seconds);
1713
+ const evalCountEstimated = getUsageTokenCount(stats?.total_output_tokens) <= 0;
1714
+ const evalDurationMs = throughput !== void 0 && throughput > 0 && outputTokens > 0 ? outputTokens / throughput * 1e3 : firstGeneratedTokenTime !== null && lastGeneratedTokenTime !== null && lastGeneratedTokenTime > firstGeneratedTokenTime ? lastGeneratedTokenTime - firstGeneratedTokenTime : Date.now() - start;
1315
1715
  const result = {
1316
1716
  response: fullResponse,
1317
1717
  ...fullThinking ? { thinking: fullThinking } : {},
1318
1718
  totalDuration,
1319
- loadDuration: 0,
1320
- promptEvalCount: usage?.prompt_tokens ?? 0,
1321
- promptEvalDuration: firstTokenTime !== null ? (firstTokenTime - start) * 1e6 : 0,
1322
- evalCount: usage?.completion_tokens ?? 0,
1323
- evalDuration: Math.max(1, evalDurationMs) * 1e6
1719
+ loadDuration: Math.max(
1720
+ 0,
1721
+ Math.round((modelLoadTimeSeconds ?? 0) * 1e9)
1722
+ ),
1723
+ promptEvalCount: getUsageTokenCount(stats?.input_tokens),
1724
+ promptEvalDuration: timeToFirstTokenSeconds !== void 0 ? Math.max(0, Math.round(timeToFirstTokenSeconds * 1e9)) : firstGeneratedTokenTime !== null ? (firstGeneratedTokenTime - start) * 1e6 : 0,
1725
+ evalCount: outputTokens,
1726
+ evalDuration: Math.max(1, Math.round(evalDurationMs * 1e6)),
1727
+ ...evalCountEstimated ? { evalCountEstimated: true } : {}
1324
1728
  };
1729
+ assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
1325
1730
  callbacks?.onDone?.(result);
1326
1731
  return result;
1327
1732
  } catch (err) {
@@ -1487,6 +1892,19 @@ function getRuntimeName() {
/**
 * Report the model format declared by the currently active runtime.
 *
 * @returns {*} `activeRuntime.modelFormat`, or "gguf" when that property is
 *   null or undefined.
 */
function getRuntimeModelFormat() {
  return activeRuntime.modelFormat ?? "gguf";
}
/**
 * Resolve a model name to a model descriptor for the active runtime.
 *
 * - When the active runtime is "lm-studio", the lookup is delegated to
 *   `resolveModel(model)` and its result is returned as-is.
 * - Otherwise the name is compared (exact `===` match on `name`) against the
 *   entries returned by `activeRuntime.listModels()`.
 * - When nothing matches, a placeholder descriptor is returned so callers
 *   always receive an object: the requested name, a size of 0, and the
 *   runtime's model format (defaulting to "gguf").
 *
 * @param {string} model - Model name to resolve.
 * @returns {Promise<object>} The matched descriptor or the placeholder.
 *   Rejections from `listModels()` / `resolveModel()` are not caught here.
 */
async function resolveRuntimeModel(model) {
  if (activeRuntime.name === "lm-studio") {
    return resolveModel(model);
  }
  const knownModels = await activeRuntime.listModels();
  const matchedModel = knownModels.find((candidate) => candidate.name === model);
  if (matchedModel) return matchedModel;
  // Unknown to the runtime: hand back a minimal stand-in instead of failing.
  return {
    name: model,
    size: 0,
    modelFormat: activeRuntime.modelFormat ?? "gguf"
  };
}
1490
1908
 
1491
1909
  // ../src/commands/bench.ts
1492
1910
  import { createHash as createHash3 } from "crypto";
@@ -1495,11 +1913,33 @@ import chalk8 from "chalk";
1495
1913
  // ../src/core/hardware.ts
1496
1914
  import si from "systeminformation";
1497
1915
  import os2 from "os";
1498
- import { execFile as execFile2 } from "child_process";
1916
+ import { execFile as execFile3 } from "child_process";
1499
1917
  import { readFile } from "fs/promises";
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * @param {*} value - Text to normalize. `null`/`undefined` are treated as an
 *   empty string; other non-string values are converted with `String()`.
 * @returns {string} The normalized text.
 */
function normalizeWhitespace(value) {
  return String(value ?? "").replace(/\s+/g, " ").trim();
}
/**
 * Heuristically decide whether a label names a GPU.
 *
 * The check is a case-insensitive, whole-word search for one of a fixed set
 * of GPU product/family keywords (radeon, graphics, geforce, rtx, gtx, arc,
 * iris, uhd, quadro, tesla, adreno, mali, powervr).
 *
 * @param {string} value - Label to inspect.
 * @returns {boolean} True when at least one keyword appears as a whole word.
 */
function looksLikeGpuDescriptor(value) {
  return /\b(radeon|graphics|geforce|rtx|gtx|arc|iris|uhd|quadro|tesla|adreno|mali|powervr)\b/i.test(value);
}
/**
 * Split a CPU label of the form "<cpu> with <gpu>" / "<cpu> w/ <gpu>" into
 * its CPU part and an inferred integrated-GPU name.
 *
 * The label is whitespace-normalized first. The suffix after the first
 * " with " / " w/" separator (case-insensitive) is only accepted as a GPU
 * name when `looksLikeGpuDescriptor` recognizes it; otherwise the whole
 * normalized label is returned as the CPU and no GPU is inferred.
 *
 * @param {string} cpuLabel - Raw CPU label.
 * @returns {{ cpu: string, inferredGpu: (string|null) }} The CPU name and the
 *   inferred GPU name, or `null` when none could be inferred.
 */
function splitCpuAndInferredGpu(cpuLabel) {
  const normalized = normalizeWhitespace(cpuLabel);
  const withGpuMatch = normalized.match(/\s+(?:w\/\s*|with\s+)(.+)$/i);
  // No separator found. (A falsy index also covers "no match"; the pattern
  // starts with \s+, which cannot match at position 0 of a trimmed string.)
  if (!withGpuMatch?.index) {
    return { cpu: normalized, inferredGpu: null };
  }
  const inferredGpu = normalizeWhitespace(withGpuMatch[1] ?? "");
  if (!looksLikeGpuDescriptor(inferredGpu)) {
    // The suffix is not GPU-like (e.g. some other "with ..." wording): keep
    // the label intact.
    return { cpu: normalized, inferredGpu: null };
  }
  const cpu = normalizeWhitespace(normalized.slice(0, withGpuMatch.index));
  return {
    cpu: cpu || normalized,
    inferredGpu: inferredGpu || null
  };
}
1500
1940
  function execCommand(cmd, args, timeoutMs = 3e3) {
1501
1941
  return new Promise((resolve) => {
1502
- const child = execFile2(cmd, args, { timeout: timeoutMs }, (err, stdout) => {
1942
+ const child = execFile3(cmd, args, { timeout: timeoutMs }, (err, stdout) => {
1503
1943
  if (err) return resolve("");
1504
1944
  resolve(stdout.trim());
1505
1945
  });
@@ -1633,11 +2073,14 @@ async function getHardwareInfo() {
1633
2073
  ]);
1634
2074
  const gpuController = graphics.controllers[0];
1635
2075
  const gpuNames = graphics.controllers.map((g) => g.model).filter(Boolean).join(", ");
2076
+ const cpuLabelRaw = normalizeWhitespace(`${cpu.manufacturer} ${cpu.brand}`);
2077
+ const { cpu: cpuLabel, inferredGpu } = splitCpuAndInferredGpu(cpuLabelRaw);
2078
+ const defaultIntegratedGpu = process.platform === "darwin" ? "Integrated / Apple Silicon" : "Integrated / Unknown";
1636
2079
  const gpuCoresRaw = gpuController?.cores;
1637
2080
  const gpuCores = gpuCoresRaw ? parseInt(String(gpuCoresRaw), 10) : null;
1638
2081
  const memType = memLayout.length > 0 ? memLayout[0].type : null;
1639
2082
  return {
1640
- cpu: `${cpu.manufacturer} ${cpu.brand}`,
2083
+ cpu: cpuLabel,
1641
2084
  cpuCores: cpu.cores,
1642
2085
  cpuPCores: cpu.performanceCores || null,
1643
2086
  cpuECores: cpu.efficiencyCores || null,
@@ -1647,7 +2090,7 @@ async function getHardwareInfo() {
1647
2090
  memoryType: memType || null,
1648
2091
  swapTotalGB: +(mem.swaptotal / 1024 / 1024 / 1024).toFixed(1),
1649
2092
  swapUsedGB: +(mem.swapused / 1024 / 1024 / 1024).toFixed(1),
1650
- gpu: gpuNames || "Integrated / Apple Silicon",
2093
+ gpu: normalizeWhitespace(gpuNames) || inferredGpu || defaultIntegratedGpu,
1651
2094
  gpuCores: gpuCores && !isNaN(gpuCores) ? gpuCores : null,
1652
2095
  gpuVramMB: gpuController?.vram ?? null,
1653
2096
  os: `${osInfo.distro} ${osInfo.release}`,
@@ -1682,6 +2125,10 @@ import chalk from "chalk";
1682
2125
 
1683
2126
  // ../src/ui/terminal.ts
1684
2127
  var supportsUnicode = process.platform !== "win32" || Boolean(process.env.WT_SESSION) || Boolean(process.env.TERM_PROGRAM);
// Matches ANSI CSI escape sequences: ESC "[" followed by any parameter bytes
// (0x30-0x3F, which includes digits, ";" and private markers such as "?"),
// any intermediate bytes (0x20-0x2F) and one final byte (0x40-0x7E).
// This covers colour codes (ESC[31m) as well as cursor/erase controls such as
// ESC[?25l and ESC[2K.
var ANSI_RE = /\x1b\[[0-?]*[ -\/]*[@-~]/g;

/**
 * Remove ANSI CSI escape sequences from a string.
 *
 * @param {string} value - Text that may contain terminal escape sequences.
 * @returns {string} The text with every sequence matched by `ANSI_RE` removed.
 */
function stripAnsi(value) {
  return value.replace(ANSI_RE, "");
}
1685
2132
 
1686
2133
  // ../src/ui/progress.ts
1687
2134
  var FUN_PHRASES = [
@@ -1816,6 +2263,33 @@ function errorMsg(text) {
1816
2263
  console.log(chalk.red(` ${CROSS_MARK} ${text}`));
1817
2264
  }
1818
2265
 
// ../src/benchmarks/profile.ts
// Pinned sampling settings shared by every benchmark request.
var BENCHMARK_PROFILE_VERSION = "v1";
var BENCHMARK_PROFILE_SEED = 42;
var BENCHMARK_PROFILE_TOP_P = 1;
var BENCHMARK_PROFILE_TEMPERATURE = 0;

/**
 * Merge caller options on top of the benchmark sampling profile.
 *
 * The result always carries `temperature`, `top_p` and `seed`. Caller-supplied
 * values win, except that an explicit `undefined` for one of those three keys
 * does not erase the profile default (so forwarding an unset option cannot
 * silently drop the pinned sampling settings). All other keys are copied
 * through unchanged, including keys whose value is `undefined`.
 *
 * @param {object} [opts={}] - Generation options to layer over the profile.
 * @returns {object} A new options object; `opts` is not mutated.
 */
function withBenchmarkProfile(opts = {}) {
  const profileDefaults = {
    temperature: BENCHMARK_PROFILE_TEMPERATURE,
    top_p: BENCHMARK_PROFILE_TOP_P,
    seed: BENCHMARK_PROFILE_SEED
  };
  const merged = { ...profileDefaults, ...opts };
  for (const [key, fallback] of Object.entries(profileDefaults)) {
    if (merged[key] === void 0) merged[key] = fallback;
  }
  return merged;
}

/**
 * Describe the benchmark profile for inclusion in result metadata.
 *
 * @param {boolean} thinkEnabled - Whether thinking mode was requested; any
 *   truthy value is reported as "enabled", anything else as "disabled".
 * @returns {object} Profile version, the pinned sampling values, the thinking
 *   mode label, and the context fields (`contextWindowTokens` is always null
 *   and `contextPolicy` is always "runtime-default").
 */
function buildBenchmarkProfileMetadata(thinkEnabled) {
  return {
    version: BENCHMARK_PROFILE_VERSION,
    sampling: {
      temperature: BENCHMARK_PROFILE_TEMPERATURE,
      topP: BENCHMARK_PROFILE_TOP_P,
      seed: BENCHMARK_PROFILE_SEED
    },
    thinkingMode: thinkEnabled ? "enabled" : "disabled",
    contextWindowTokens: null,
    contextPolicy: "runtime-default"
  };
}
2292
+
1819
2293
  // ../src/benchmarks/performance.ts
1820
2294
  var WARMUP_PROMPT = "Say hello in one word.";
1821
2295
  var BENCH_PROMPTS = [
@@ -1855,11 +2329,15 @@ async function runPerformanceBench(model, options = {}) {
1855
2329
  optionalProbeWithAvailability(() => getSwapUsedGB(), 0),
1856
2330
  optionalProbe(() => detectBatteryPowered(), void 0)
1857
2331
  ]);
2332
+ const runningModelsBeforeWarmup = await optionalProbe(() => listRunningModels3(), []);
2333
+ const modelWasAlreadyLoaded = runningModelsBeforeWarmup.some((m) => m.name === model);
1858
2334
  const warmup = await withTimeout(
1859
2335
  generateStream3(model, WARMUP_PROMPT, void 0, {
1860
- num_predict: 32,
1861
- think: options.think,
1862
- stall_timeout_ms: options.streamStallTimeoutMs
2336
+ ...withBenchmarkProfile({
2337
+ num_predict: 32,
2338
+ think: options.think,
2339
+ stall_timeout_ms: options.streamStallTimeoutMs
2340
+ })
1863
2341
  }),
1864
2342
  warmupTimeoutMs,
1865
2343
  "Model warmup",
@@ -1870,15 +2348,6 @@ async function runPerformanceBench(model, options = {}) {
1870
2348
  const loadTime = warmup.loadDuration / 1e6;
1871
2349
  const runningModels = await listRunningModels3();
1872
2350
  const thisModel = runningModels.find((m) => m.name === model);
1873
- let installedModelSizeBytes = 0;
1874
- try {
1875
- const availableModels = await listModels3();
1876
- const listedModel = availableModels.find((m) => m.name === model);
1877
- if (listedModel && Number.isFinite(listedModel.size) && listedModel.size > 0) {
1878
- installedModelSizeBytes = listedModel.size;
1879
- }
1880
- } catch {
1881
- }
1882
2351
  spinner.succeed("Model loaded");
1883
2352
  const tpsValues = [];
1884
2353
  const firstChunkValues = [];
@@ -1892,6 +2361,7 @@ async function runPerformanceBench(model, options = {}) {
1892
2361
  let thinkingDetected = false;
1893
2362
  let totalThinkingTokens = 0;
1894
2363
  const cpuLoadSamples = [];
2364
+ let tokensPerSecondEstimated = false;
1895
2365
  for (let i = 0; i < BENCH_PROMPTS.length; i++) {
1896
2366
  spinner.start(`Running performance test ${i + 1}/${BENCH_PROMPTS.length}...`);
1897
2367
  let firstChunkTime = null;
@@ -1914,11 +2384,11 @@ async function runPerformanceBench(model, options = {}) {
1914
2384
  }
1915
2385
  }
1916
2386
  },
1917
- {
2387
+ withBenchmarkProfile({
1918
2388
  num_predict: 256,
1919
2389
  think: options.think,
1920
2390
  stall_timeout_ms: options.streamStallTimeoutMs
1921
- }
2391
+ })
1922
2392
  ),
1923
2393
  promptTimeoutMs,
1924
2394
  "Performance benchmark",
@@ -1929,6 +2399,9 @@ async function runPerformanceBench(model, options = {}) {
1929
2399
  tpsValues.push(tps);
1930
2400
  totalEvalCount += result.evalCount;
1931
2401
  totalEvalDurationNs += result.evalDuration;
2402
+ if (result.evalCountEstimated) {
2403
+ tokensPerSecondEstimated = true;
2404
+ }
1932
2405
  if (firstChunkTime !== null) {
1933
2406
  firstChunkValues.push(firstChunkTime);
1934
2407
  }
@@ -1974,10 +2447,18 @@ async function runPerformanceBench(model, options = {}) {
1974
2447
  ]);
1975
2448
  let memoryUsedGB;
1976
2449
  let memoryPercent;
1977
- const loadedModelSizeBytes = thisModel && thisModel.size > 0 ? thisModel.size : installedModelSizeBytes;
2450
+ let memoryFootprintEstimated = false;
2451
+ const runtimeReportsComparableLoadedSize = runtimeName !== "lm-studio";
2452
+ const estimatedLoadedModelSizeBytes = runtimeName === "lm-studio" && modelWasAlreadyLoaded ? await optionalProbe(() => estimateLoadedModelMemoryBytes(model), null) : null;
2453
+ const loadedModelSizeBytes = runtimeReportsComparableLoadedSize && thisModel && thisModel.size > 0 ? thisModel.size : 0;
2454
+ const memoryFootprintAvailable = runtimeReportsComparableLoadedSize ? loadedModelSizeBytes > 0 || !modelWasAlreadyLoaded : (estimatedLoadedModelSizeBytes ?? 0) > 0 || !modelWasAlreadyLoaded;
1978
2455
  if (loadedModelSizeBytes > 0) {
1979
2456
  memoryUsedGB = loadedModelSizeBytes / 1024 ** 3;
1980
2457
  memoryPercent = memoryUsedGB / memAfter.totalGB * 100;
2458
+ } else if ((estimatedLoadedModelSizeBytes ?? 0) > 0) {
2459
+ memoryUsedGB = (estimatedLoadedModelSizeBytes ?? 0) / 1024 ** 3;
2460
+ memoryPercent = memoryUsedGB / memAfter.totalGB * 100;
2461
+ memoryFootprintEstimated = true;
1981
2462
  } else {
1982
2463
  memoryUsedGB = Math.max(0, memAfter.usedGB - memBefore.usedGB);
1983
2464
  memoryPercent = Math.max(0, memAfter.percent - memBefore.percent);
@@ -1999,6 +2480,7 @@ async function runPerformanceBench(model, options = {}) {
1999
2480
  return {
2000
2481
  metrics: {
2001
2482
  tokensPerSecond: totalEvalDurationNs > 0 ? totalEvalCount / (totalEvalDurationNs / 1e9) : avg(tpsValues),
2483
+ ...tokensPerSecondEstimated ? { tokensPerSecondEstimated: true } : {},
2002
2484
  ...firstChunkMs !== void 0 ? { firstChunkMs } : {},
2003
2485
  ttft: ttft >= 0 ? ttft : 3e4,
2004
2486
  // Fallback: 30s if no TTFT measured
@@ -2009,6 +2491,8 @@ async function runPerformanceBench(model, options = {}) {
2009
2491
  completionTokens: totalCompletionTokens,
2010
2492
  memoryUsedGB: +memoryUsedGB.toFixed(1),
2011
2493
  memoryPercent: +memoryPercent.toFixed(1),
2494
+ memoryFootprintAvailable,
2495
+ ...memoryFootprintEstimated ? { memoryFootprintEstimated: true } : {},
2012
2496
  memoryHostUsedGB: memAfter.usedGB,
2013
2497
  memoryHostPercent: memAfter.percent,
2014
2498
  tpsStdDev: tpsValues.length >= 2 ? stddev(tpsValues) : void 0,
@@ -2401,7 +2885,7 @@ Answer:`;
2401
2885
  const startTime = Date.now();
2402
2886
  try {
2403
2887
  const result = await withTimeout(
2404
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
2888
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
2405
2889
  timeoutMs,
2406
2890
  "Reasoning question",
2407
2891
  abortOngoingRequests3
@@ -2730,7 +3214,7 @@ Answer:`;
2730
3214
  const startTime = Date.now();
2731
3215
  try {
2732
3216
  const result = await withTimeout(
2733
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
3217
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
2734
3218
  timeoutMs,
2735
3219
  "Math problem",
2736
3220
  abortOngoingRequests3
@@ -2773,7 +3257,7 @@ Answer:`;
2773
3257
 
2774
3258
  // ../src/benchmarks/coding.ts
2775
3259
  import vm2 from "vm";
2776
- import { spawn } from "child_process";
3260
+ import { spawn as spawn2 } from "child_process";
2777
3261
  import { Worker } from "worker_threads";
2778
3262
 
2779
3263
  // ../src/datasets/coding.json
@@ -6446,7 +6930,7 @@ async function runTestsInSubprocess(code, task) {
6446
6930
  const total = task.tests.length;
6447
6931
  return new Promise((resolve) => {
6448
6932
  const wallTimeoutMs = computeIsolatedWallTimeoutMs(task);
6449
- const child = spawn(
6933
+ const child = spawn2(
6450
6934
  process.execPath,
6451
6935
  [
6452
6936
  "--max-old-space-size=96",
@@ -6613,7 +7097,7 @@ Reply with ONLY the function code, no explanation.`;
6613
7097
  const startTime = Date.now();
6614
7098
  try {
6615
7099
  const result = await withTimeout(
6616
- generate3(model, prompt, { temperature: 0, num_predict: 2048, think: opts?.think }),
7100
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 2048, think: opts?.think })),
6617
7101
  timeoutMs,
6618
7102
  "Coding task",
6619
7103
  abortOngoingRequests3
@@ -6968,7 +7452,7 @@ async function runInstructionFollowingBench(model, opts) {
6968
7452
  const startTime = Date.now();
6969
7453
  try {
6970
7454
  const result = await withTimeout(
6971
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
7455
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
6972
7456
  timeoutMs,
6973
7457
  "Instruction following task",
6974
7458
  abortOngoingRequests3
@@ -7354,7 +7838,7 @@ async function runStructuredOutputBench(model, opts) {
7354
7838
  const startTime = Date.now();
7355
7839
  try {
7356
7840
  const result = await withTimeout(
7357
- generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
7841
+ generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
7358
7842
  timeoutMs,
7359
7843
  "Structured output task",
7360
7844
  abortOngoingRequests3
@@ -7613,7 +8097,7 @@ async function runMultilingualBench(model, opts) {
7613
8097
  const startTime = Date.now();
7614
8098
  try {
7615
8099
  const result = await withTimeout(
7616
- generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
8100
+ generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
7617
8101
  timeoutMs,
7618
8102
  "Multilingual task",
7619
8103
  abortOngoingRequests3
@@ -7749,13 +8233,15 @@ function computePerformanceScore(perf, hardware) {
7749
8233
  const tuning = deriveHardwareFitTuning(hardware);
7750
8234
  const safeTokensPerSecond = sanitizeNonNegative(perf.tokensPerSecond, 0);
7751
8235
  const safeTtft = sanitizeNonNegative(perf.ttft, tuning.ttft.hardMaxMs * 2);
7752
- const effectiveMemPercent = sanitizeNonNegative(
7753
- perf.memoryHostPercent ?? perf.memoryPercent,
7754
- 100
7755
- );
7756
8236
  const speed = Math.round(scoreSpeed(safeTokensPerSecond, tuning));
7757
8237
  const ttft = Math.round(scoreTTFT(safeTtft, tuning));
7758
- const memory = Math.round(scoreMemory(effectiveMemPercent));
8238
+ const memory = perf.memoryFootprintAvailable === false ? Math.round((speed + ttft) / (50 + 20) * 30) : Math.round(
8239
+ scoreMemory(
8240
+ // Score memory from the model's measured footprint/delta rather than
8241
+ // unrelated host RAM usage from other running workloads.
8242
+ sanitizeNonNegative(perf.memoryPercent, 100)
8243
+ )
8244
+ );
7759
8245
  return {
7760
8246
  total: clamp(speed + ttft + memory, 0, 100),
7761
8247
  speed,
@@ -7864,11 +8350,9 @@ function computeFitness(perf, quality, hardware, benchEnv) {
7864
8350
  const safeTokensPerSecond = sanitizeNonNegative(perf.tokensPerSecond, 0);
7865
8351
  const safeTtft = sanitizeNonNegative(perf.ttft, tuning.ttft.hardMaxMs * 10);
7866
8352
  const safeLoadTime = sanitizeNonNegative(perf.loadTime, tuning.loadTimeHardMaxMs * 10);
7867
- const hostMemoryPercent = sanitizeNonNegative(
7868
- perf.memoryHostPercent ?? perf.memoryPercent,
7869
- 100
7870
- );
7871
- const modelMemoryDeltaPercent = sanitizeNonNegative(perf.memoryPercent, 100);
8353
+ const modelMemoryFootprintAvailable = perf.memoryFootprintAvailable !== false;
8354
+ const modelMemoryDeltaPercent = modelMemoryFootprintAvailable ? sanitizeNonNegative(perf.memoryPercent, 100) : void 0;
8355
+ const hostMemoryPercent = perf.memoryHostPercent !== void 0 && Number.isFinite(perf.memoryHostPercent) && perf.memoryHostPercent >= 0 ? perf.memoryHostPercent : void 0;
7872
8356
  const disqualifiers = [];
7873
8357
  if (safeTokensPerSecond < tuning.speed.hardMin) {
7874
8358
  disqualifiers.push(
@@ -7885,12 +8369,12 @@ function computeFitness(perf, quality, hardware, benchEnv) {
7885
8369
  `Model load time too high: ${Math.round(safeLoadTime)}ms (maximum: ${tuning.loadTimeHardMaxMs}ms for ${tuning.profile} profile)`
7886
8370
  );
7887
8371
  }
7888
- const hostCritical = hostMemoryPercent > 95;
7889
- const modelDeltaCritical = modelMemoryDeltaPercent > 90;
7890
- const modelDeltaSignificant = modelMemoryDeltaPercent >= 10;
7891
- if (modelDeltaCritical || hostCritical && modelDeltaSignificant) {
8372
+ const hostCritical = hostMemoryPercent !== void 0 && hostMemoryPercent > 95;
8373
+ const modelDeltaCritical = modelMemoryDeltaPercent !== void 0 && modelMemoryDeltaPercent > 90;
8374
+ const modelDeltaSignificant = modelMemoryDeltaPercent !== void 0 && modelMemoryDeltaPercent >= 10;
8375
+ if (modelDeltaCritical) {
7892
8376
  disqualifiers.push(
7893
- `Memory usage critical: host ${hostMemoryPercent.toFixed(0)}%, model delta +${modelMemoryDeltaPercent.toFixed(0)}%`
8377
+ `Memory usage critical: model delta +${modelMemoryDeltaPercent.toFixed(0)}%`
7894
8378
  );
7895
8379
  }
7896
8380
  const verdictScore = globalScore ?? hardwareFitScore;
@@ -7925,9 +8409,28 @@ function computeFitness(perf, quality, hardware, benchEnv) {
7925
8409
  `Token speed is unstable (stddev ${perf.tpsStdDev.toFixed(1)} tok/s, mean ${safeTokensPerSecond.toFixed(1)} tok/s) \u2014 may indicate thermal throttling or memory pressure.`
7926
8410
  );
7927
8411
  }
7928
- if (hostCritical && !modelDeltaSignificant) {
8412
+ if (perf.tokensPerSecondEstimated) {
8413
+ warnings.push(
8414
+ "Token throughput is estimated from LM Studio output because native token stats were unavailable. Compare tok/s across backends cautiously."
8415
+ );
8416
+ }
8417
+ if (perf.memoryFootprintEstimated) {
8418
+ warnings.push(
8419
+ "Model memory footprint is estimated via LM Studio CLI rather than measured from a fresh load."
8420
+ );
8421
+ }
8422
+ if (!modelMemoryFootprintAvailable) {
7929
8423
  warnings.push(
7930
- `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) but model delta is limited (+${modelMemoryDeltaPercent.toFixed(0)}%). Verdict may be influenced by other running workloads.`
8424
+ "Model memory footprint was unavailable for this run, so RAM fit scoring was normalized from speed and TTFT only."
8425
+ );
8426
+ }
8427
+ if (hostCritical && !modelMemoryFootprintAvailable) {
8428
+ warnings.push(
8429
+ `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) and model footprint was unavailable. Results may be influenced by other running workloads.`
8430
+ );
8431
+ } else if (hostCritical && modelMemoryDeltaPercent !== void 0 && !modelDeltaSignificant) {
8432
+ warnings.push(
8433
+ `Host memory is already high (${hostMemoryPercent.toFixed(0)}%) but model delta is limited (+${modelMemoryDeltaPercent.toFixed(0)}%). Results may be influenced by other running workloads.`
7931
8434
  );
7932
8435
  }
7933
8436
  if (hardware?.powerMode === "low-power") {
@@ -8015,12 +8518,46 @@ function getLevel(score) {
8015
8518
  if (score >= 25) return "Weak";
8016
8519
  return "Poor";
8017
8520
  }
/**
 * Build the human-readable "CPU cores" label for the hardware table.
 *
 * Output by available data:
 * - performance and efficiency counts known: "N total (P performance + E efficiency)"
 * - only a performance count, smaller than the total: "N threads (P cores)"
 * - only a performance count otherwise: "N total (P performance)"
 * - only an efficiency count: "N total (E efficiency)"
 * - neither known: just the total as a string
 *
 * A count is "known" when it is neither null nor undefined, so hardware
 * records that lack the fields entirely fall back to the plain total instead
 * of printing "undefined".
 *
 * @param {{ cpuCores: number, cpuPCores?: (number|null), cpuECores?: (number|null) }} hw
 *   Hardware record.
 * @returns {string} The formatted label.
 */
function formatCpuCoresLabel(hw) {
  const performanceCores = hw.cpuPCores ?? null;
  const efficiencyCores = hw.cpuECores ?? null;
  if (performanceCores !== null && efficiencyCores !== null) {
    return `${hw.cpuCores} total (${performanceCores} performance + ${efficiencyCores} efficiency)`;
  }
  if (performanceCores !== null && hw.cpuCores > performanceCores) {
    return `${hw.cpuCores} threads (${performanceCores} cores)`;
  }
  if (performanceCores !== null) {
    return `${hw.cpuCores} total (${performanceCores} performance)`;
  }
  if (efficiencyCores !== null) {
    return `${hw.cpuCores} total (${efficiencyCores} efficiency)`;
  }
  return String(hw.cpuCores);
}
/**
 * Count execution problems recorded in a benchmark category's details.
 *
 * Each detail's `actual` text is classified by its prefix:
 * - starts with "TIMEOUT" (whole word, case-insensitive): counted as a timeout
 * - starts with "ERROR:" (case-insensitive): counted as an error, and also as
 *   a crash when the text mentions a model crash
 * Anything else is ignored. Crashes are therefore a subset of errors.
 *
 * @param {string} name - Category name, echoed back in the result.
 * @param {Array<{ actual?: string }>} [details] - Per-question results. A
 *   missing list, or missing/null entries, are treated as having no issues.
 * @returns {{ name: string, crashes: number, timeouts: number, errors: number }}
 */
function summarizeCategoryIssues(name, details) {
  let crashes = 0;
  let timeouts = 0;
  let errors = 0;
  for (const detail of details ?? []) {
    const actual = String(detail?.actual ?? "");
    if (/^TIMEOUT\b/i.test(actual)) {
      timeouts++;
      continue;
    }
    if (/^ERROR:/i.test(actual)) {
      errors++;
      if (/model has crashed|has crashed without additional information|model crashed/i.test(actual)) {
        crashes++;
      }
    }
  }
  return { name, crashes, timeouts, errors };
}
8018
8555
  function printHardwareTable(hw) {
8019
8556
  const table = new Table({
8020
8557
  head: [chalk3.bold("Hardware"), chalk3.bold("Value")],
8021
8558
  style: { head: [], border: [] }
8022
8559
  });
8023
- const coresDetail = hw.cpuPCores ? `${hw.cpuCores} (${hw.cpuPCores} performance + ${hw.cpuECores ?? 0} efficiency)` : String(hw.cpuCores);
8560
+ const coresDetail = formatCpuCoresLabel(hw);
8024
8561
  const cpuLine = hw.cpuFreqGHz ? `${hw.cpu} @ ${hw.cpuFreqGHz} GHz` : hw.cpu;
8025
8562
  const ramLine = hw.memoryType ? `${hw.totalMemoryGB} GB ${hw.memoryType} (${hw.freeMemoryGB} GB free)` : `${hw.totalMemoryGB} GB (${hw.freeMemoryGB} GB free)`;
8026
8563
  const swapColor = hw.swapUsedGB > hw.swapTotalGB * 0.5 ? chalk3.yellow : chalk3.green;
@@ -8056,7 +8593,10 @@ function printPerformanceTable(perf, benchEnvironment) {
8056
8593
  const ttftColor = perf.ttft < 1e3 ? chalk3.green : perf.ttft < 3e3 ? chalk3.yellow : chalk3.red;
8057
8594
  const memColor = perf.memoryPercent < 50 ? chalk3.green : perf.memoryPercent < 80 ? chalk3.yellow : chalk3.red;
8058
8595
  table.push(
8059
- ["Tokens/sec", tpsColor(`${perf.tokensPerSecond.toFixed(1)} tok/s`)],
8596
+ [
8597
+ "Tokens/sec",
8598
+ perf.tokensPerSecondEstimated ? chalk3.yellow(`${perf.tokensPerSecond.toFixed(1)} tok/s (estimated)`) : tpsColor(`${perf.tokensPerSecond.toFixed(1)} tok/s`)
8599
+ ],
8060
8600
  [
8061
8601
  "First Chunk Latency",
8062
8602
  perf.firstChunkMs !== void 0 ? formatDuration(perf.firstChunkMs) : chalk3.dim("N/A (stream metric unavailable)")
@@ -8071,8 +8611,8 @@ function printPerformanceTable(perf, benchEnvironment) {
8071
8611
  ["Completion Tokens", String(perf.completionTokens)],
8072
8612
  [
8073
8613
  "Model Memory Footprint",
8074
- memColor(
8075
- `${perf.memoryUsedGB.toFixed(1)} GB (+${perf.memoryPercent.toFixed(0)}%)`
8614
+ perf.memoryFootprintAvailable === false ? chalk3.dim("N/A (model already loaded; runtime metric unavailable)") : memColor(
8615
+ `${perf.memoryUsedGB.toFixed(1)} GB (+${perf.memoryPercent.toFixed(0)}%)${perf.memoryFootprintEstimated ? " (estimated)" : ""}`
8076
8616
  )
8077
8617
  ],
8078
8618
  [
@@ -8133,6 +8673,18 @@ function printQualityTable(quality, timePenalties) {
8133
8673
  ]);
8134
8674
  }
8135
8675
  console.log(table.toString());
8676
+ const issueSummaries = categories.map((cat) => summarizeCategoryIssues(cat.name, cat.result.details)).filter((summary) => summary.errors > 0 || summary.timeouts > 0);
8677
+ if (issueSummaries.length > 0) {
8678
+ console.log(chalk3.yellow("Execution issues detected during quality benchmark:"));
8679
+ for (const summary of issueSummaries) {
8680
+ const parts = [];
8681
+ if (summary.crashes > 0) parts.push(`${summary.crashes} crash${summary.crashes > 1 ? "es" : ""}`);
8682
+ const nonCrashErrors = summary.errors - summary.crashes;
8683
+ if (nonCrashErrors > 0) parts.push(`${nonCrashErrors} error${nonCrashErrors > 1 ? "s" : ""}`);
8684
+ if (summary.timeouts > 0) parts.push(`${summary.timeouts} timeout${summary.timeouts > 1 ? "s" : ""}`);
8685
+ console.log(chalk3.yellow(` \u2022 ${summary.name}: ${parts.join(", ")} (scored as incorrect)`));
8686
+ }
8687
+ }
8136
8688
  }
8137
8689
  function printSummaryTable(results) {
8138
8690
  const termWidth = process.stdout.columns || 80;
@@ -8146,7 +8698,7 @@ function printSummaryTable(results) {
8146
8698
  chalk3.bold("Model"),
8147
8699
  chalk3.bold("tok/s"),
8148
8700
  chalk3.bold("TTFT"),
8149
- chalk3.bold("Host RAM%"),
8701
+ chalk3.bold("Model RAM%"),
8150
8702
  chalk3.bold("Profile"),
8151
8703
  chalk3.bold("HW Fit"),
8152
8704
  chalk3.bold("Quality"),
@@ -8159,17 +8711,23 @@ function printSummaryTable(results) {
8159
8711
  style: { head: [], border: [] },
8160
8712
  wordWrap: true
8161
8713
  });
8714
+ const formatSummaryModelMemory = (result) => {
8715
+ if (result.performance.memoryFootprintAvailable === false) return "N/A";
8716
+ const value = `${result.performance.memoryPercent.toFixed(0)}%`;
8717
+ return result.performance.memoryFootprintEstimated ? `${value}~` : value;
8718
+ };
8162
8719
  for (const r of results) {
8163
8720
  const vColor = r.fitness.verdict === "EXCELLENT" ? chalk3.green.bold : r.fitness.verdict === "GOOD" ? chalk3.blue.bold : r.fitness.verdict === "MARGINAL" ? chalk3.yellow.bold : chalk3.red.bold;
8164
8721
  const flags = [];
8165
8722
  if (r.hardware.powerMode === "low-power") flags.push(chalk3.red("ECO"));
8166
8723
  if (r.modelInfo?.thinkingDetected) flags.push(chalk3.magenta("THINK"));
8167
8724
  const modelName = compact && r.model.length > 20 ? r.model.slice(0, 18) + ".." : r.model;
8725
+ const throughputLabel = r.performance.tokensPerSecondEstimated ? `~${r.performance.tokensPerSecond.toFixed(1)}` : `${r.performance.tokensPerSecond.toFixed(1)}`;
8168
8726
  const row = [
8169
8727
  modelName,
8170
- `${r.performance.tokensPerSecond.toFixed(1)}`,
8728
+ throughputLabel,
8171
8729
  formatDuration(r.performance.ttft),
8172
- r.performance.memoryHostPercent !== void 0 ? `${r.performance.memoryHostPercent.toFixed(0)}%` : "n/a",
8730
+ formatSummaryModelMemory(r),
8173
8731
  r.fitness.tuning.profile,
8174
8732
  scoreColor(r.fitness.hardwareFitScore)(
8175
8733
  `${compactBar(r.fitness.hardwareFitScore)} ${r.fitness.hardwareFitScore}%`
@@ -8194,9 +8752,8 @@ function printSummaryTable(results) {
8194
8752
  // ../src/ui/verdict.ts
8195
8753
  import chalk4 from "chalk";
8196
8754
  var BOX_INNER = 60;
8197
- var ANSI_RE = /\x1b\[[0-9;]*m/g;
8198
8755
  function visibleLength(str) {
8199
- return str.replace(ANSI_RE, "").length;
8756
+ return stripAnsi(str).length;
8200
8757
  }
8201
8758
  function wrapText(text, maxWidth) {
8202
8759
  if (visibleLength(text) <= maxWidth) return [text];
@@ -8566,6 +9123,15 @@ function assertUploaderConfig(config) {
8566
9123
  );
8567
9124
  }
8568
9125
  }
9126
+ function resolveUploadedMemoryPercent(result) {
9127
+ return result.performance.memoryFootprintAvailable === false ? null : result.performance.memoryPercent;
9128
+ }
9129
+ function resolveUploadedModelFormat(result) {
9130
+ if (result.metadata.modelFormat?.trim()) return result.metadata.modelFormat;
9131
+ const runtimeBackend = result.metadata.runtimeBackend ?? "ollama";
9132
+ if (runtimeBackend === "ollama") return "gguf";
9133
+ return "unknown";
9134
+ }
8569
9135
  async function uploadBenchResult(result, options = {}) {
8570
9136
  const config = resolveUploaderConfig();
8571
9137
  assertUploaderConfig(config);
@@ -8578,7 +9144,7 @@ async function uploadBenchResult(result, options = {}) {
8578
9144
  thinking_detected: result.modelInfo?.thinkingDetected ?? null,
8579
9145
  tokens_per_second: result.performance.tokensPerSecond,
8580
9146
  ttft_ms: result.performance.ttft,
8581
- memory_percent: result.performance.memoryHostPercent ?? result.performance.memoryPercent,
9147
+ memory_percent: resolveUploadedMemoryPercent(result),
8582
9148
  thinking_tokens_estimate: result.performance.thinkingTokensEstimate ?? null,
8583
9149
  verdict: result.fitness.verdict,
8584
9150
  global_score: result.fitness.globalScore,
@@ -8595,7 +9161,7 @@ async function uploadBenchResult(result, options = {}) {
8595
9161
  benchmark_spec_version: result.metadata.benchmarkSpecVersion,
8596
9162
  runtime_version: result.metadata.runtimeVersion,
8597
9163
  runtime_backend: result.metadata.runtimeBackend ?? "ollama",
8598
- model_format: result.metadata.modelFormat ?? "gguf",
9164
+ model_format: resolveUploadedModelFormat(result),
8599
9165
  raw_log_hash: result.metadata.rawLogHash,
8600
9166
  result
8601
9167
  };
@@ -8848,6 +9414,7 @@ async function promptSubmitterProfile(deps, defaults = {}) {
8848
9414
  }
8849
9415
  console.log(chalk6.yellow("Nickname must be between 2 and 40 characters."));
8850
9416
  }
9417
+ console.log(chalk6.dim("Your email is never stored \u2014 only a SHA-256 hash is saved to match your leaderboard entries."));
8851
9418
  while (true) {
8852
9419
  const emailHint = defaults.email ? ` [${defaults.email}]` : "";
8853
9420
  const emailAnswer = await ask(`Email${emailHint} > `);
@@ -9008,7 +9575,7 @@ async function promptThinkingMode() {
9008
9575
  }
9009
9576
 
9010
9577
  // ../src/commands/bench.ts
9011
- var BENCHMARK_SPEC_VERSION = "0.2.0";
9578
+ var BENCHMARK_SPEC_VERSION = "0.2.1";
9012
9579
  var PROMPT_PACK_VERSION = "0.1.0";
9013
9580
  async function benchCommand(options) {
9014
9581
  if (options.backend !== void 0) {
@@ -9101,6 +9668,11 @@ async function benchCommand(options) {
9101
9668
  if (!silent && thinkEnabled) {
9102
9669
  infoMsg("Thinking mode enabled \u2014 models that support it will use extended reasoning.");
9103
9670
  }
9671
+ if (!silent) {
9672
+ infoMsg(
9673
+ `Benchmark profile ${BENCHMARK_PROFILE_VERSION}: temperature=0, top_p=1, seed=42, context=runtime default.`
9674
+ );
9675
+ }
9104
9676
  try {
9105
9677
  const results = [];
9106
9678
  const failedModels = [];
@@ -9130,7 +9702,7 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
9130
9702
  minSuccessfulPrompts: options.perfMinSuccessfulPrompts,
9131
9703
  failOnPromptError: options.perfStrict,
9132
9704
  think: thinkEnabled,
9133
- streamStallTimeoutMs: options.lmStudioStreamStallTimeoutMs
9705
+ streamStallTimeoutMs: options.streamStallTimeoutMs
9134
9706
  });
9135
9707
  const perf = perfResult.metrics;
9136
9708
  const benchEnvironment = perfResult.benchEnvironment;
@@ -9168,13 +9740,22 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
9168
9740
  printVerdict(modelName, fitness);
9169
9741
  }
9170
9742
  const matchedModel = allModels.find((m) => m.name === modelName);
9171
- const modelInfo = matchedModel ? {
9172
- parameterSize: matchedModel.parameterSize,
9173
- quantization: matchedModel.quantization,
9174
- family: matchedModel.family,
9175
- // Persist the configured benchmark mode (not model auto-detection).
9176
- thinkingDetected: thinkEnabled
9177
- } : { thinkingDetected: thinkEnabled };
9743
+ let resolvedModel = matchedModel;
9744
+ if (matchedModel?.modelFormat === void 0) {
9745
+ try {
9746
+ resolvedModel = await resolveRuntimeModel(modelName) ?? matchedModel;
9747
+ } catch {
9748
+ resolvedModel = matchedModel;
9749
+ }
9750
+ }
9751
+ const modelMetadataSource = resolvedModel ?? matchedModel;
9752
+ const modelInfo = modelMetadataSource ? {
9753
+ parameterSize: modelMetadataSource.parameterSize,
9754
+ quantization: modelMetadataSource.quantization,
9755
+ family: modelMetadataSource.family,
9756
+ // Persist actual observed thinking behavior from the benchmark run.
9757
+ thinkingDetected: perfResult.thinkingDetected
9758
+ } : { thinkingDetected: perfResult.thinkingDetected };
9178
9759
  const partialResult = {
9179
9760
  model: modelName,
9180
9761
  modelInfo,
@@ -9189,7 +9770,8 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
9189
9770
  promptPackVersion: PROMPT_PACK_VERSION,
9190
9771
  runtimeVersion,
9191
9772
  runtimeBackend: getRuntimeName(),
9192
- modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat()
9773
+ modelFormat: resolvedModel?.modelFormat ?? (getRuntimeName() === "ollama" ? getRuntimeModelFormat() : "unknown"),
9774
+ benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
9193
9775
  }
9194
9776
  };
9195
9777
  const rawLogHash = createHash3("sha256").update(JSON.stringify(partialResult)).digest("hex");
@@ -9550,7 +10132,7 @@ async function handleShareResult(args) {
9550
10132
  // src/index.ts
9551
10133
  var server = new McpServer({
9552
10134
  name: "metrillm",
9553
- version: "0.1.0"
10135
+ version: "0.2.1"
9554
10136
  });
9555
10137
  for (const def of toolDefinitions) {
9556
10138
  switch (def.name) {