npm - metrillm-mcp - Versions diffs - 0.2.0 → 0.2.1 - Mend

metrillm-mcp 0.2.0 → 0.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.js CHANGED Viewed

@@ -110,7 +110,8 @@ function stripThinkTags(text) {
 }
 function hasThinkingContent(response, thinkingField) {
   if (thinkingField && thinkingField.trim().length > 0) return true;
-  return /<think(?:ing)?[\s>]/i.test(response);
+  if (/<think(?:ing)?[\s>]/i.test(response)) return true;
+  return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response);
 }
 function estimateTokenCount(text) {
   if (!text) return 0;
@@ -564,25 +565,50 @@ var defaultKeepAlive;
 function setDefaultKeepAlive(keepAlive) {
   defaultKeepAlive = keepAlive;
 }
+function hasSamplingOverrides(options) {
+  return options?.top_p !== void 0 || options?.seed !== void 0;
+}
+function isUnsupportedSamplingOptionError(err) {
+  const message = err instanceof Error ? err.message : String(err);
+  const lower = message.toLowerCase();
+  const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
+  if (!mentionsSampling) return false;
+  return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
+}
+function buildGenerateRequest(model, prompt, options, includeSampling) {
+  return {
+    model,
+    prompt,
+    stream: true,
+    keep_alive: options?.keep_alive ?? defaultKeepAlive,
+    ...options?.think !== void 0 ? { think: options.think } : {},
+    options: {
+      temperature: options?.temperature ?? 0,
+      ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
+      ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
+      num_predict: options?.num_predict ?? 512
+    }
+  };
+}
 async function generate(model, prompt, options) {
   return generateStream(model, prompt, void 0, options);
 }
 async function generateStream(model, prompt, callbacks, options) {
-  const stream = await withTimeout(
-    client.generate({
-      model,
-      prompt,
-      stream: true,
-      keep_alive: options?.keep_alive ?? defaultKeepAlive,
-      ...options?.think !== void 0 ? { think: options.think } : {},
-      options: {
-        temperature: options?.temperature ?? 0,
-        num_predict: options?.num_predict ?? 512
-      }
-    }),
+  const initializeStream = (includeSampling) => withTimeout(
+    client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
     OLLAMA_INIT_TIMEOUT_MS,
     "Ollama generate initialization"
   );
+  let stream;
+  try {
+    stream = await initializeStream(true);
+  } catch (err) {
+    if (hasSamplingOverrides(options) && isUnsupportedSamplingOptionError(err)) {
+      stream = await initializeStream(false);
+    } else {
+      throw err;
+    }
+  }
   let fullResponse = "";
   let fullThinking = "";
   let result = null;
@@ -668,6 +694,29 @@ var defaultKeepAlive2;
 var activeAbortControllers = /* @__PURE__ */ new Set();
 var directorySizeCache = /* @__PURE__ */ new Map();
 var modelDefinitionCache = /* @__PURE__ */ new Map();
+var NON_THINKING_SYSTEM_PROMPT = [
+  "You are in non-thinking mode for benchmark reproducibility.",
+  "Return only the final answer.",
+  "Do not output internal reasoning, chain-of-thought, or scratchpad.",
+  "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process."
+].join(" ");
+function hasThinkingLeakText(response) {
+  return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response) || /\[(?:\/)?THINK(?:ING)?\]/i.test(response);
+}
+function assertThinkingModeRespected(model, think, response, reasoning) {
+  if (think !== false) return;
+  if (reasoning.trim().length > 0 || /<think(?:ing)?[\s>]/i.test(response) || hasThinkingLeakText(response)) {
+    throw new Error(
+      [
+        `LM Studio model "${model}" still emitted thinking content while non-thinking mode is requested.`,
+        "In LM Studio, add this at the top of the model chat template: {%- set enable_thinking = false %}.",
+        "If this model does not expose a Prompt/Chat Template editor in LM Studio (e.g. some GPT-OSS builds), non-thinking mode cannot be enforced from the API.",
+        "Use --thinking for this model, or benchmark a model/runtime that supports explicit non-thinking control.",
+        "Then eject/reload the model and run the benchmark again."
+      ].join(" ")
+    );
+  }
+}
 function buildThinkingConfig(think) {
   if (think === void 0) return {};
   const effort = think ? "high" : "low";
@@ -677,6 +726,65 @@ function buildThinkingConfig(think) {
     reasoning: { effort }
   };
 }
+function hasSamplingOverrides2(options) {
+  return options?.top_p !== void 0 || options?.seed !== void 0;
+}
+function isUnsupportedSamplingMessage(status, text) {
+  if (status !== 400 && status !== 422) return false;
+  const lower = text.toLowerCase();
+  const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
+  if (!mentionsSampling) return false;
+  return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
+}
+function extractLMStudioErrorMessage(body) {
+  const trimmed = body.trim();
+  if (!trimmed) return "";
+  try {
+    const parsed = JSON.parse(trimmed);
+    const message = parsed.error?.message;
+    if (typeof message === "string" && message.trim().length > 0) {
+      return message.trim();
+    }
+  } catch {
+  }
+  return trimmed;
+}
+function isModelLoadGuardrailError(message) {
+  const lower = message.toLowerCase();
+  if (!lower.includes("failed to load model")) return false;
+  return lower.includes("insufficient system resources") || lower.includes("overload your system") || lower.includes("loading guardrails");
+}
+function buildLMStudioRequestError(kind, model, status, statusText, body) {
+  const backendMessage = extractLMStudioErrorMessage(body);
+  if (isModelLoadGuardrailError(backendMessage)) {
+    return new Error(
+      [
+        `LM Studio could not load model "${model}" due to insufficient system resources (model loading guardrails).`,
+        "In LM Studio: unload other models, reduce loaded context length, or relax model loading guardrails in Settings.",
+        `Backend error: ${backendMessage}`
+      ].join(" ")
+    );
+  }
+  const suffix = backendMessage ? ` ${backendMessage}` : "";
+  return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
+}
+function buildChatCompletionBody(model, prompt, options, stream, includeSampling) {
+  const messages = options?.think === false ? [
+    { role: "system", content: NON_THINKING_SYSTEM_PROMPT },
+    { role: "user", content: prompt }
+  ] : [{ role: "user", content: prompt }];
+  return {
+    model,
+    messages,
+    temperature: options?.temperature ?? 0,
+    ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
+    ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
+    max_tokens: options?.num_predict ?? 512,
+    stream,
+    ...stream ? { stream_options: { include_usage: true } } : {},
+    ...buildThinkingConfig(options?.think)
+  };
+}
 function parseNonNegativeInt(value) {
   if (!/^\d+$/.test(value)) return null;
   const parsed = Number.parseInt(value, 10);
@@ -761,7 +869,7 @@ async function pathIsDirectory(targetPath) {
   try {
     const stat = await fs.stat(targetPath);
     return stat.isDirectory();
-  } catch {
+  } catch (_err) {
     return false;
   }
 }
@@ -1165,27 +1273,30 @@ async function generate2(model, prompt, options) {
   try {
     const baseUrl = getLMStudioBaseUrl();
     const url = new URL("/v1/chat/completions", baseUrl);
-    const resp = await fetch(url, {
+    const doRequest = (includeSampling) => fetch(url, {
       method: "POST",
       headers: getLMStudioHeaders(),
-      body: JSON.stringify({
-        model,
-        messages: [{ role: "user", content: prompt }],
-        temperature: options?.temperature ?? 0,
-        max_tokens: options?.num_predict ?? 512,
-        stream: false,
-        ...buildThinkingConfig(options?.think)
-      }),
+      body: JSON.stringify(buildChatCompletionBody(model, prompt, options, false, includeSampling)),
       signal: controller.signal
     });
+    let resp = await doRequest(true);
     if (!resp.ok) {
       const body = await resp.text().catch(() => "");
-      throw new Error(`LM Studio generate failed (${resp.status} ${resp.statusText}) ${body}`.trim());
+      if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
+        resp = await doRequest(false);
+      } else {
+        throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
+      }
+    }
+    if (!resp.ok) {
+      const body = await resp.text().catch(() => "");
+      throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
     }
     const payload = await resp.json();
     const choice = extractChoice2(payload);
     const response = extractContent(choice);
     const reasoning = extractReasoning(choice);
+    assertThinkingModeRespected(model, options?.think, response, reasoning);
     const usage = extractUsage(payload);
     const totalDuration = Math.max(0, Date.now() - start) * 1e6;
     return {
@@ -1226,23 +1337,24 @@ async function generateStream2(model, prompt, callbacks, options) {
   };
   try {
     resetStallTimer();
-    const resp = await fetch(url, {
+    const doRequest = (includeSampling) => fetch(url, {
       method: "POST",
       headers: getLMStudioHeaders(),
-      body: JSON.stringify({
-        model,
-        messages: [{ role: "user", content: prompt }],
-        temperature: options?.temperature ?? 0,
-        max_tokens: options?.num_predict ?? 512,
-        stream: true,
-        stream_options: { include_usage: true },
-        ...buildThinkingConfig(options?.think)
-      }),
+      body: JSON.stringify(buildChatCompletionBody(model, prompt, options, true, includeSampling)),
       signal: controller.signal
     });
+    let resp = await doRequest(true);
     if (!resp.ok) {
       const body = await resp.text().catch(() => "");
-      throw new Error(`LM Studio stream failed (${resp.status} ${resp.statusText}) ${body}`.trim());
+      if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
+        resp = await doRequest(false);
+      } else {
+        throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
+      }
+    }
+    if (!resp.ok) {
+      const body = await resp.text().catch(() => "");
+      throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
     }
     if (!resp.body) {
       throw new Error("LM Studio stream response body is empty");
@@ -1322,6 +1434,7 @@ async function generateStream2(model, prompt, callbacks, options) {
       evalCount: usage?.completion_tokens ?? 0,
       evalDuration: Math.max(1, evalDurationMs) * 1e6
     };
+    assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
     callbacks?.onDone?.(result);
     return result;
   } catch (err) {
@@ -1816,6 +1929,33 @@ function errorMsg(text) {
   console.log(chalk.red(`  ${CROSS_MARK} ${text}`));
 }
+// ../src/benchmarks/profile.ts
+var BENCHMARK_PROFILE_VERSION = "v1";
+var BENCHMARK_PROFILE_SEED = 42;
+var BENCHMARK_PROFILE_TOP_P = 1;
+var BENCHMARK_PROFILE_TEMPERATURE = 0;
+function withBenchmarkProfile(opts = {}) {
+  return {
+    temperature: BENCHMARK_PROFILE_TEMPERATURE,
+    top_p: BENCHMARK_PROFILE_TOP_P,
+    seed: BENCHMARK_PROFILE_SEED,
+    ...opts
+  };
+}
+function buildBenchmarkProfileMetadata(thinkEnabled) {
+  return {
+    version: BENCHMARK_PROFILE_VERSION,
+    sampling: {
+      temperature: BENCHMARK_PROFILE_TEMPERATURE,
+      topP: BENCHMARK_PROFILE_TOP_P,
+      seed: BENCHMARK_PROFILE_SEED
+    },
+    thinkingMode: thinkEnabled ? "enabled" : "disabled",
+    contextWindowTokens: null,
+    contextPolicy: "runtime-default"
+  };
+}
 // ../src/benchmarks/performance.ts
 var WARMUP_PROMPT = "Say hello in one word.";
 var BENCH_PROMPTS = [
@@ -1857,9 +1997,11 @@ async function runPerformanceBench(model, options = {}) {
     ]);
     const warmup = await withTimeout(
       generateStream3(model, WARMUP_PROMPT, void 0, {
-        num_predict: 32,
-        think: options.think,
-        stall_timeout_ms: options.streamStallTimeoutMs
+        ...withBenchmarkProfile({
+          num_predict: 32,
+          think: options.think,
+          stall_timeout_ms: options.streamStallTimeoutMs
+        })
       }),
       warmupTimeoutMs,
       "Model warmup",
@@ -1914,11 +2056,11 @@ async function runPerformanceBench(model, options = {}) {
                 }
               }
             },
-            {
+            withBenchmarkProfile({
               num_predict: 256,
               think: options.think,
               stall_timeout_ms: options.streamStallTimeoutMs
-            }
+            })
           ),
           promptTimeoutMs,
           "Performance benchmark",
@@ -2401,7 +2543,7 @@ Answer:`;
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Reasoning question",
           abortOngoingRequests3
@@ -2730,7 +2872,7 @@ Answer:`;
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Math problem",
           abortOngoingRequests3
@@ -6613,7 +6755,7 @@ Reply with ONLY the function code, no explanation.`;
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 2048, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 2048, think: opts?.think })),
           timeoutMs,
           "Coding task",
           abortOngoingRequests3
@@ -6968,7 +7110,7 @@ async function runInstructionFollowingBench(model, opts) {
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Instruction following task",
           abortOngoingRequests3
@@ -7354,7 +7496,7 @@ async function runStructuredOutputBench(model, opts) {
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Structured output task",
           abortOngoingRequests3
@@ -7613,7 +7755,7 @@ async function runMultilingualBench(model, opts) {
       const startTime = Date.now();
       try {
         const result = await withTimeout(
-          generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Multilingual task",
           abortOngoingRequests3
@@ -8015,6 +8157,25 @@ function getLevel(score) {
   if (score >= 25) return "Weak";
   return "Poor";
 }
+function summarizeCategoryIssues(name, details) {
+  let crashes = 0;
+  let timeouts = 0;
+  let errors = 0;
+  for (const detail of details) {
+    const actual = detail.actual ?? "";
+    if (/^TIMEOUT\b/i.test(actual)) {
+      timeouts++;
+      continue;
+    }
+    if (/^ERROR:/i.test(actual)) {
+      errors++;
+      if (/model has crashed|has crashed without additional information|model crashed/i.test(actual)) {
+        crashes++;
+      }
+    }
+  }
+  return { name, crashes, timeouts, errors };
+}
 function printHardwareTable(hw) {
   const table = new Table({
     head: [chalk3.bold("Hardware"), chalk3.bold("Value")],
@@ -8133,6 +8294,18 @@ function printQualityTable(quality, timePenalties) {
     ]);
   }
   console.log(table.toString());
+  const issueSummaries = categories.map((cat) => summarizeCategoryIssues(cat.name, cat.result.details)).filter((summary) => summary.errors > 0 || summary.timeouts > 0);
+  if (issueSummaries.length > 0) {
+    console.log(chalk3.yellow("Execution issues detected during quality benchmark:"));
+    for (const summary of issueSummaries) {
+      const parts = [];
+      if (summary.crashes > 0) parts.push(`${summary.crashes} crash${summary.crashes > 1 ? "es" : ""}`);
+      const nonCrashErrors = summary.errors - summary.crashes;
+      if (nonCrashErrors > 0) parts.push(`${nonCrashErrors} error${nonCrashErrors > 1 ? "s" : ""}`);
+      if (summary.timeouts > 0) parts.push(`${summary.timeouts} timeout${summary.timeouts > 1 ? "s" : ""}`);
+      console.log(chalk3.yellow(`  \u2022 ${summary.name}: ${parts.join(", ")} (scored as incorrect)`));
+    }
+  }
 }
 function printSummaryTable(results) {
   const termWidth = process.stdout.columns || 80;
@@ -9008,7 +9181,7 @@ async function promptThinkingMode() {
 }
 // ../src/commands/bench.ts
-var BENCHMARK_SPEC_VERSION = "0.2.0";
+var BENCHMARK_SPEC_VERSION = "0.2.1";
 var PROMPT_PACK_VERSION = "0.1.0";
 async function benchCommand(options) {
   if (options.backend !== void 0) {
@@ -9101,6 +9274,11 @@ async function benchCommand(options) {
   if (!silent && thinkEnabled) {
     infoMsg("Thinking mode enabled \u2014 models that support it will use extended reasoning.");
   }
+  if (!silent) {
+    infoMsg(
+      `Benchmark profile ${BENCHMARK_PROFILE_VERSION}: temperature=0, top_p=1, seed=42, context=runtime default.`
+    );
+  }
   try {
     const results = [];
     const failedModels = [];
@@ -9189,7 +9367,8 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
             promptPackVersion: PROMPT_PACK_VERSION,
             runtimeVersion,
             runtimeBackend: getRuntimeName(),
-            modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat()
+            modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat(),
+            benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
           }
         };
         const rawLogHash = createHash3("sha256").update(JSON.stringify(partialResult)).digest("hex");
@@ -9550,7 +9729,7 @@ async function handleShareResult(args) {
 // src/index.ts
 var server = new McpServer({
   name: "metrillm",
-  version: "0.1.0"
+  version: "0.2.1"
 });
 for (const def of toolDefinitions) {
   switch (def.name) {