metrillm-mcp 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -110,7 +110,8 @@ function stripThinkTags(text) {
110
110
  }
111
111
  function hasThinkingContent(response, thinkingField) {
112
112
  if (thinkingField && thinkingField.trim().length > 0) return true;
113
- return /<think(?:ing)?[\s>]/i.test(response);
113
+ if (/<think(?:ing)?[\s>]/i.test(response)) return true;
114
+ return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response);
114
115
  }
115
116
  function estimateTokenCount(text) {
116
117
  if (!text) return 0;
@@ -564,25 +565,50 @@ var defaultKeepAlive;
564
565
  function setDefaultKeepAlive(keepAlive) {
565
566
  defaultKeepAlive = keepAlive;
566
567
  }
568
+ function hasSamplingOverrides(options) {
569
+ return options?.top_p !== void 0 || options?.seed !== void 0;
570
+ }
571
+ function isUnsupportedSamplingOptionError(err) {
572
+ const message = err instanceof Error ? err.message : String(err);
573
+ const lower = message.toLowerCase();
574
+ const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
575
+ if (!mentionsSampling) return false;
576
+ return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
577
+ }
578
+ function buildGenerateRequest(model, prompt, options, includeSampling) {
579
+ return {
580
+ model,
581
+ prompt,
582
+ stream: true,
583
+ keep_alive: options?.keep_alive ?? defaultKeepAlive,
584
+ ...options?.think !== void 0 ? { think: options.think } : {},
585
+ options: {
586
+ temperature: options?.temperature ?? 0,
587
+ ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
588
+ ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
589
+ num_predict: options?.num_predict ?? 512
590
+ }
591
+ };
592
+ }
567
593
  async function generate(model, prompt, options) {
568
594
  return generateStream(model, prompt, void 0, options);
569
595
  }
570
596
  async function generateStream(model, prompt, callbacks, options) {
571
- const stream = await withTimeout(
572
- client.generate({
573
- model,
574
- prompt,
575
- stream: true,
576
- keep_alive: options?.keep_alive ?? defaultKeepAlive,
577
- ...options?.think !== void 0 ? { think: options.think } : {},
578
- options: {
579
- temperature: options?.temperature ?? 0,
580
- num_predict: options?.num_predict ?? 512
581
- }
582
- }),
597
+ const initializeStream = (includeSampling) => withTimeout(
598
+ client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
583
599
  OLLAMA_INIT_TIMEOUT_MS,
584
600
  "Ollama generate initialization"
585
601
  );
602
+ let stream;
603
+ try {
604
+ stream = await initializeStream(true);
605
+ } catch (err) {
606
+ if (hasSamplingOverrides(options) && isUnsupportedSamplingOptionError(err)) {
607
+ stream = await initializeStream(false);
608
+ } else {
609
+ throw err;
610
+ }
611
+ }
586
612
  let fullResponse = "";
587
613
  let fullThinking = "";
588
614
  let result = null;
@@ -668,6 +694,29 @@ var defaultKeepAlive2;
668
694
  var activeAbortControllers = /* @__PURE__ */ new Set();
669
695
  var directorySizeCache = /* @__PURE__ */ new Map();
670
696
  var modelDefinitionCache = /* @__PURE__ */ new Map();
697
+ var NON_THINKING_SYSTEM_PROMPT = [
698
+ "You are in non-thinking mode for benchmark reproducibility.",
699
+ "Return only the final answer.",
700
+ "Do not output internal reasoning, chain-of-thought, or scratchpad.",
701
+ "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process."
702
+ ].join(" ");
703
+ function hasThinkingLeakText(response) {
704
+ return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response) || /\[(?:\/)?THINK(?:ING)?\]/i.test(response);
705
+ }
706
+ function assertThinkingModeRespected(model, think, response, reasoning) {
707
+ if (think !== false) return;
708
+ if (reasoning.trim().length > 0 || /<think(?:ing)?[\s>]/i.test(response) || hasThinkingLeakText(response)) {
709
+ throw new Error(
710
+ [
711
+ `LM Studio model "${model}" still emitted thinking content while non-thinking mode is requested.`,
712
+ "In LM Studio, add this at the top of the model chat template: {%- set enable_thinking = false %}.",
713
+ "If this model does not expose a Prompt/Chat Template editor in LM Studio (e.g. some GPT-OSS builds), non-thinking mode cannot be enforced from the API.",
714
+ "Use --thinking for this model, or benchmark a model/runtime that supports explicit non-thinking control.",
715
+ "Then eject/reload the model and run the benchmark again."
716
+ ].join(" ")
717
+ );
718
+ }
719
+ }
671
720
  function buildThinkingConfig(think) {
672
721
  if (think === void 0) return {};
673
722
  const effort = think ? "high" : "low";
@@ -677,6 +726,65 @@ function buildThinkingConfig(think) {
677
726
  reasoning: { effort }
678
727
  };
679
728
  }
729
+ function hasSamplingOverrides2(options) {
730
+ return options?.top_p !== void 0 || options?.seed !== void 0;
731
+ }
732
+ function isUnsupportedSamplingMessage(status, text) {
733
+ if (status !== 400 && status !== 422) return false;
734
+ const lower = text.toLowerCase();
735
+ const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
736
+ if (!mentionsSampling) return false;
737
+ return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
738
+ }
739
+ function extractLMStudioErrorMessage(body) {
740
+ const trimmed = body.trim();
741
+ if (!trimmed) return "";
742
+ try {
743
+ const parsed = JSON.parse(trimmed);
744
+ const message = parsed.error?.message;
745
+ if (typeof message === "string" && message.trim().length > 0) {
746
+ return message.trim();
747
+ }
748
+ } catch {
749
+ }
750
+ return trimmed;
751
+ }
752
+ function isModelLoadGuardrailError(message) {
753
+ const lower = message.toLowerCase();
754
+ if (!lower.includes("failed to load model")) return false;
755
+ return lower.includes("insufficient system resources") || lower.includes("overload your system") || lower.includes("loading guardrails");
756
+ }
757
+ function buildLMStudioRequestError(kind, model, status, statusText, body) {
758
+ const backendMessage = extractLMStudioErrorMessage(body);
759
+ if (isModelLoadGuardrailError(backendMessage)) {
760
+ return new Error(
761
+ [
762
+ `LM Studio could not load model "${model}" due to insufficient system resources (model loading guardrails).`,
763
+ "In LM Studio: unload other models, reduce loaded context length, or relax model loading guardrails in Settings.",
764
+ `Backend error: ${backendMessage}`
765
+ ].join(" ")
766
+ );
767
+ }
768
+ const suffix = backendMessage ? ` ${backendMessage}` : "";
769
+ return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
770
+ }
771
+ function buildChatCompletionBody(model, prompt, options, stream, includeSampling) {
772
+ const messages = options?.think === false ? [
773
+ { role: "system", content: NON_THINKING_SYSTEM_PROMPT },
774
+ { role: "user", content: prompt }
775
+ ] : [{ role: "user", content: prompt }];
776
+ return {
777
+ model,
778
+ messages,
779
+ temperature: options?.temperature ?? 0,
780
+ ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
781
+ ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
782
+ max_tokens: options?.num_predict ?? 512,
783
+ stream,
784
+ ...stream ? { stream_options: { include_usage: true } } : {},
785
+ ...buildThinkingConfig(options?.think)
786
+ };
787
+ }
680
788
  function parseNonNegativeInt(value) {
681
789
  if (!/^\d+$/.test(value)) return null;
682
790
  const parsed = Number.parseInt(value, 10);
@@ -761,7 +869,7 @@ async function pathIsDirectory(targetPath) {
761
869
  try {
762
870
  const stat = await fs.stat(targetPath);
763
871
  return stat.isDirectory();
764
- } catch {
872
+ } catch (_err) {
765
873
  return false;
766
874
  }
767
875
  }
@@ -1165,27 +1273,30 @@ async function generate2(model, prompt, options) {
1165
1273
  try {
1166
1274
  const baseUrl = getLMStudioBaseUrl();
1167
1275
  const url = new URL("/v1/chat/completions", baseUrl);
1168
- const resp = await fetch(url, {
1276
+ const doRequest = (includeSampling) => fetch(url, {
1169
1277
  method: "POST",
1170
1278
  headers: getLMStudioHeaders(),
1171
- body: JSON.stringify({
1172
- model,
1173
- messages: [{ role: "user", content: prompt }],
1174
- temperature: options?.temperature ?? 0,
1175
- max_tokens: options?.num_predict ?? 512,
1176
- stream: false,
1177
- ...buildThinkingConfig(options?.think)
1178
- }),
1279
+ body: JSON.stringify(buildChatCompletionBody(model, prompt, options, false, includeSampling)),
1179
1280
  signal: controller.signal
1180
1281
  });
1282
+ let resp = await doRequest(true);
1181
1283
  if (!resp.ok) {
1182
1284
  const body = await resp.text().catch(() => "");
1183
- throw new Error(`LM Studio generate failed (${resp.status} ${resp.statusText}) ${body}`.trim());
1285
+ if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
1286
+ resp = await doRequest(false);
1287
+ } else {
1288
+ throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
1289
+ }
1290
+ }
1291
+ if (!resp.ok) {
1292
+ const body = await resp.text().catch(() => "");
1293
+ throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
1184
1294
  }
1185
1295
  const payload = await resp.json();
1186
1296
  const choice = extractChoice2(payload);
1187
1297
  const response = extractContent(choice);
1188
1298
  const reasoning = extractReasoning(choice);
1299
+ assertThinkingModeRespected(model, options?.think, response, reasoning);
1189
1300
  const usage = extractUsage(payload);
1190
1301
  const totalDuration = Math.max(0, Date.now() - start) * 1e6;
1191
1302
  return {
@@ -1226,23 +1337,24 @@ async function generateStream2(model, prompt, callbacks, options) {
1226
1337
  };
1227
1338
  try {
1228
1339
  resetStallTimer();
1229
- const resp = await fetch(url, {
1340
+ const doRequest = (includeSampling) => fetch(url, {
1230
1341
  method: "POST",
1231
1342
  headers: getLMStudioHeaders(),
1232
- body: JSON.stringify({
1233
- model,
1234
- messages: [{ role: "user", content: prompt }],
1235
- temperature: options?.temperature ?? 0,
1236
- max_tokens: options?.num_predict ?? 512,
1237
- stream: true,
1238
- stream_options: { include_usage: true },
1239
- ...buildThinkingConfig(options?.think)
1240
- }),
1343
+ body: JSON.stringify(buildChatCompletionBody(model, prompt, options, true, includeSampling)),
1241
1344
  signal: controller.signal
1242
1345
  });
1346
+ let resp = await doRequest(true);
1243
1347
  if (!resp.ok) {
1244
1348
  const body = await resp.text().catch(() => "");
1245
- throw new Error(`LM Studio stream failed (${resp.status} ${resp.statusText}) ${body}`.trim());
1349
+ if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
1350
+ resp = await doRequest(false);
1351
+ } else {
1352
+ throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
1353
+ }
1354
+ }
1355
+ if (!resp.ok) {
1356
+ const body = await resp.text().catch(() => "");
1357
+ throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
1246
1358
  }
1247
1359
  if (!resp.body) {
1248
1360
  throw new Error("LM Studio stream response body is empty");
@@ -1322,6 +1434,7 @@ async function generateStream2(model, prompt, callbacks, options) {
1322
1434
  evalCount: usage?.completion_tokens ?? 0,
1323
1435
  evalDuration: Math.max(1, evalDurationMs) * 1e6
1324
1436
  };
1437
+ assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
1325
1438
  callbacks?.onDone?.(result);
1326
1439
  return result;
1327
1440
  } catch (err) {
@@ -1816,6 +1929,33 @@ function errorMsg(text) {
1816
1929
  console.log(chalk.red(` ${CROSS_MARK} ${text}`));
1817
1930
  }
1818
1931
 
1932
+ // ../src/benchmarks/profile.ts
1933
+ var BENCHMARK_PROFILE_VERSION = "v1";
1934
+ var BENCHMARK_PROFILE_SEED = 42;
1935
+ var BENCHMARK_PROFILE_TOP_P = 1;
1936
+ var BENCHMARK_PROFILE_TEMPERATURE = 0;
1937
+ function withBenchmarkProfile(opts = {}) {
1938
+ return {
1939
+ temperature: BENCHMARK_PROFILE_TEMPERATURE,
1940
+ top_p: BENCHMARK_PROFILE_TOP_P,
1941
+ seed: BENCHMARK_PROFILE_SEED,
1942
+ ...opts
1943
+ };
1944
+ }
1945
+ function buildBenchmarkProfileMetadata(thinkEnabled) {
1946
+ return {
1947
+ version: BENCHMARK_PROFILE_VERSION,
1948
+ sampling: {
1949
+ temperature: BENCHMARK_PROFILE_TEMPERATURE,
1950
+ topP: BENCHMARK_PROFILE_TOP_P,
1951
+ seed: BENCHMARK_PROFILE_SEED
1952
+ },
1953
+ thinkingMode: thinkEnabled ? "enabled" : "disabled",
1954
+ contextWindowTokens: null,
1955
+ contextPolicy: "runtime-default"
1956
+ };
1957
+ }
1958
+
1819
1959
  // ../src/benchmarks/performance.ts
1820
1960
  var WARMUP_PROMPT = "Say hello in one word.";
1821
1961
  var BENCH_PROMPTS = [
@@ -1857,9 +1997,11 @@ async function runPerformanceBench(model, options = {}) {
1857
1997
  ]);
1858
1998
  const warmup = await withTimeout(
1859
1999
  generateStream3(model, WARMUP_PROMPT, void 0, {
1860
- num_predict: 32,
1861
- think: options.think,
1862
- stall_timeout_ms: options.streamStallTimeoutMs
2000
+ ...withBenchmarkProfile({
2001
+ num_predict: 32,
2002
+ think: options.think,
2003
+ stall_timeout_ms: options.streamStallTimeoutMs
2004
+ })
1863
2005
  }),
1864
2006
  warmupTimeoutMs,
1865
2007
  "Model warmup",
@@ -1914,11 +2056,11 @@ async function runPerformanceBench(model, options = {}) {
1914
2056
  }
1915
2057
  }
1916
2058
  },
1917
- {
2059
+ withBenchmarkProfile({
1918
2060
  num_predict: 256,
1919
2061
  think: options.think,
1920
2062
  stall_timeout_ms: options.streamStallTimeoutMs
1921
- }
2063
+ })
1922
2064
  ),
1923
2065
  promptTimeoutMs,
1924
2066
  "Performance benchmark",
@@ -2401,7 +2543,7 @@ Answer:`;
2401
2543
  const startTime = Date.now();
2402
2544
  try {
2403
2545
  const result = await withTimeout(
2404
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
2546
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
2405
2547
  timeoutMs,
2406
2548
  "Reasoning question",
2407
2549
  abortOngoingRequests3
@@ -2730,7 +2872,7 @@ Answer:`;
2730
2872
  const startTime = Date.now();
2731
2873
  try {
2732
2874
  const result = await withTimeout(
2733
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
2875
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
2734
2876
  timeoutMs,
2735
2877
  "Math problem",
2736
2878
  abortOngoingRequests3
@@ -6613,7 +6755,7 @@ Reply with ONLY the function code, no explanation.`;
6613
6755
  const startTime = Date.now();
6614
6756
  try {
6615
6757
  const result = await withTimeout(
6616
- generate3(model, prompt, { temperature: 0, num_predict: 2048, think: opts?.think }),
6758
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 2048, think: opts?.think })),
6617
6759
  timeoutMs,
6618
6760
  "Coding task",
6619
6761
  abortOngoingRequests3
@@ -6968,7 +7110,7 @@ async function runInstructionFollowingBench(model, opts) {
6968
7110
  const startTime = Date.now();
6969
7111
  try {
6970
7112
  const result = await withTimeout(
6971
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
7113
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
6972
7114
  timeoutMs,
6973
7115
  "Instruction following task",
6974
7116
  abortOngoingRequests3
@@ -7354,7 +7496,7 @@ async function runStructuredOutputBench(model, opts) {
7354
7496
  const startTime = Date.now();
7355
7497
  try {
7356
7498
  const result = await withTimeout(
7357
- generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
7499
+ generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
7358
7500
  timeoutMs,
7359
7501
  "Structured output task",
7360
7502
  abortOngoingRequests3
@@ -7613,7 +7755,7 @@ async function runMultilingualBench(model, opts) {
7613
7755
  const startTime = Date.now();
7614
7756
  try {
7615
7757
  const result = await withTimeout(
7616
- generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
7758
+ generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
7617
7759
  timeoutMs,
7618
7760
  "Multilingual task",
7619
7761
  abortOngoingRequests3
@@ -8015,6 +8157,25 @@ function getLevel(score) {
8015
8157
  if (score >= 25) return "Weak";
8016
8158
  return "Poor";
8017
8159
  }
8160
+ function summarizeCategoryIssues(name, details) {
8161
+ let crashes = 0;
8162
+ let timeouts = 0;
8163
+ let errors = 0;
8164
+ for (const detail of details) {
8165
+ const actual = detail.actual ?? "";
8166
+ if (/^TIMEOUT\b/i.test(actual)) {
8167
+ timeouts++;
8168
+ continue;
8169
+ }
8170
+ if (/^ERROR:/i.test(actual)) {
8171
+ errors++;
8172
+ if (/model has crashed|has crashed without additional information|model crashed/i.test(actual)) {
8173
+ crashes++;
8174
+ }
8175
+ }
8176
+ }
8177
+ return { name, crashes, timeouts, errors };
8178
+ }
8018
8179
  function printHardwareTable(hw) {
8019
8180
  const table = new Table({
8020
8181
  head: [chalk3.bold("Hardware"), chalk3.bold("Value")],
@@ -8133,6 +8294,18 @@ function printQualityTable(quality, timePenalties) {
8133
8294
  ]);
8134
8295
  }
8135
8296
  console.log(table.toString());
8297
+ const issueSummaries = categories.map((cat) => summarizeCategoryIssues(cat.name, cat.result.details)).filter((summary) => summary.errors > 0 || summary.timeouts > 0);
8298
+ if (issueSummaries.length > 0) {
8299
+ console.log(chalk3.yellow("Execution issues detected during quality benchmark:"));
8300
+ for (const summary of issueSummaries) {
8301
+ const parts = [];
8302
+ if (summary.crashes > 0) parts.push(`${summary.crashes} crash${summary.crashes > 1 ? "es" : ""}`);
8303
+ const nonCrashErrors = summary.errors - summary.crashes;
8304
+ if (nonCrashErrors > 0) parts.push(`${nonCrashErrors} error${nonCrashErrors > 1 ? "s" : ""}`);
8305
+ if (summary.timeouts > 0) parts.push(`${summary.timeouts} timeout${summary.timeouts > 1 ? "s" : ""}`);
8306
+ console.log(chalk3.yellow(` \u2022 ${summary.name}: ${parts.join(", ")} (scored as incorrect)`));
8307
+ }
8308
+ }
8136
8309
  }
8137
8310
  function printSummaryTable(results) {
8138
8311
  const termWidth = process.stdout.columns || 80;
@@ -9008,7 +9181,7 @@ async function promptThinkingMode() {
9008
9181
  }
9009
9182
 
9010
9183
  // ../src/commands/bench.ts
9011
- var BENCHMARK_SPEC_VERSION = "0.2.0";
9184
+ var BENCHMARK_SPEC_VERSION = "0.2.1";
9012
9185
  var PROMPT_PACK_VERSION = "0.1.0";
9013
9186
  async function benchCommand(options) {
9014
9187
  if (options.backend !== void 0) {
@@ -9101,6 +9274,11 @@ async function benchCommand(options) {
9101
9274
  if (!silent && thinkEnabled) {
9102
9275
  infoMsg("Thinking mode enabled \u2014 models that support it will use extended reasoning.");
9103
9276
  }
9277
+ if (!silent) {
9278
+ infoMsg(
9279
+ `Benchmark profile ${BENCHMARK_PROFILE_VERSION}: temperature=0, top_p=1, seed=42, context=runtime default.`
9280
+ );
9281
+ }
9104
9282
  try {
9105
9283
  const results = [];
9106
9284
  const failedModels = [];
@@ -9189,7 +9367,8 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
9189
9367
  promptPackVersion: PROMPT_PACK_VERSION,
9190
9368
  runtimeVersion,
9191
9369
  runtimeBackend: getRuntimeName(),
9192
- modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat()
9370
+ modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat(),
9371
+ benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
9193
9372
  }
9194
9373
  };
9195
9374
  const rawLogHash = createHash3("sha256").update(JSON.stringify(partialResult)).digest("hex");
@@ -9550,7 +9729,7 @@ async function handleShareResult(args) {
9550
9729
  // src/index.ts
9551
9730
  var server = new McpServer({
9552
9731
  name: "metrillm",
9553
- version: "0.1.0"
9732
+ version: "0.2.1"
9554
9733
  });
9555
9734
  for (const def of toolDefinitions) {
9556
9735
  switch (def.name) {