npm - metrillm-mcp - Versions diffs - 0.2.3 → 0.2.5 - Mend

metrillm-mcp 0.2.3 → 0.2.5

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js CHANGED Viewed

@@ -735,6 +735,7 @@ var defaultKeepAlive2;
 var activeAbortControllers = /* @__PURE__ */ new Set();
 var directorySizeCache = /* @__PURE__ */ new Map();
 var modelDefinitionCache = /* @__PURE__ */ new Map();
+var outputLimitModeCache = /* @__PURE__ */ new Map();
 var NON_THINKING_SYSTEM_PROMPT = [
   "You are in non-thinking mode for benchmark reproducibility.",
   "Return only the final answer.",
@@ -765,12 +766,22 @@ function buildNativeThinkingOption(think) {
 function hasSamplingOverrides2(options) {
   return options?.top_p !== void 0 || options?.seed !== void 0;
 }
+var UNSUPPORTED_SAMPLING_FIELD_PATTERN = /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/;
+var UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN = /unrecognized|unknown|not support|unsupported|unexpected|additional|extra|invalid field/;
 function isUnsupportedSamplingMessage(status, text) {
   if (status !== 400 && status !== 422) return false;
   const lower = text.toLowerCase();
-  const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
-  if (!mentionsSampling) return false;
-  return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
+  if (!/\b(seed|top_p|topp)\b/.test(lower)) return false;
+  return UNSUPPORTED_SAMPLING_FIELD_PATTERN.test(lower);
+}
+function isUnsupportedOutputLimitMessage(status, text, mode) {
+  if (status !== 400 && status !== 422) return false;
+  const lower = text.toLowerCase();
+  const fieldName = mode === "legacy" ? "max_tokens" : "max_output_tokens";
+  const alternateFieldName = mode === "legacy" ? "max_output_tokens" : "max_tokens";
+  const mentionsUnsupportedCurrentField = lower.includes(fieldName) && UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN.test(lower);
+  const mentionsRequiredAlternateField = lower.includes(alternateFieldName) && /\b(required|missing)\b/.test(lower);
+  return mentionsUnsupportedCurrentField || mentionsRequiredAlternateField;
 }
 function extractLMStudioErrorMessage(body) {
   const trimmed = body.trim();
@@ -804,20 +815,66 @@ function buildLMStudioRequestError(kind, model, status, statusText, body) {
   const suffix = backendMessage ? ` ${backendMessage}` : "";
   return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
 }
-function buildNativeChatBody(model, prompt, options, stream, includeSampling) {
+function buildUnsupportedOutputLimitNegotiationError(kind, model, body) {
+  const backendMessage = extractLMStudioErrorMessage(body);
+  return new Error(
+    [
+      `LM Studio ${kind} failed for "${model}" because this backend rejected both max_output_tokens and max_tokens.`,
+      "MetriLLM cannot safely continue without an explicit output limit because benchmarks rely on bounded generation.",
+      backendMessage ? `Backend error: ${backendMessage}` : void 0
+    ].filter(Boolean).join(" ")
+  );
+}
+function buildNativeChatBody(model, prompt, options, stream, includeSampling, outputLimitMode) {
   const reasoning = buildNativeThinkingOption(options?.think);
+  const outputLimit = options?.num_predict !== void 0 ? options.num_predict : 512;
   return {
     model,
     input: prompt,
     temperature: options?.temperature ?? 0,
     ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
     ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
-    max_tokens: options?.num_predict ?? 512,
+    ...outputLimitMode === "preferred" ? { max_output_tokens: outputLimit } : {},
+    ...outputLimitMode === "legacy" ? { max_tokens: outputLimit } : {},
     stream,
     ...reasoning !== void 0 ? { reasoning } : {},
     ...options?.think === false ? { system_prompt: NON_THINKING_SYSTEM_PROMPT } : {}
   };
 }
+var MAX_NEGOTIATE_RETRIES = 5;
+async function negotiateRequest(kind, model, cacheKey, options, makeRequest) {
+  let includeSampling = true;
+  let outputLimitMode = outputLimitModeCache.get(cacheKey) ?? "preferred";
+  const triedOutputLimitModes = /* @__PURE__ */ new Set([outputLimitMode]);
+  let resp = await makeRequest(includeSampling, outputLimitMode);
+  let retries = 0;
+  while (!resp.ok && retries < MAX_NEGOTIATE_RETRIES) {
+    retries++;
+    const body = await resp.text().catch(() => "");
+    if (includeSampling && hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
+      includeSampling = false;
+      resp = await makeRequest(includeSampling, outputLimitMode);
+      continue;
+    }
+    if (isUnsupportedOutputLimitMessage(resp.status, body, outputLimitMode)) {
+      const nextMode = outputLimitMode === "preferred" ? "legacy" : !triedOutputLimitModes.has("preferred") ? "preferred" : null;
+      if (!nextMode) {
+        throw buildUnsupportedOutputLimitNegotiationError(kind, model, body);
+      }
+      outputLimitMode = nextMode;
+      triedOutputLimitModes.add(outputLimitMode);
+      resp = await makeRequest(includeSampling, outputLimitMode);
+      continue;
+    }
+    throw buildLMStudioRequestError(kind, model, resp.status, resp.statusText, body);
+  }
+  if (!resp.ok) {
+    const body = await resp.text().catch(() => "");
+    throw buildLMStudioRequestError(kind, model, resp.status, resp.statusText, body);
+  }
+  outputLimitModeCache.set(cacheKey, outputLimitMode);
+  return resp;
+}
 function getNativeStatNumber(value) {
   if (typeof value !== "number" || !Number.isFinite(value) || value < 0) return void 0;
   return value;
@@ -1478,14 +1535,17 @@ async function listModels2() {
     throw new Error(`LM Studio list models failed (${resp.status} ${resp.statusText})`);
   }
   const data = await resp.json();
-  const ids = (data.data ?? []).map((m) => m.id?.trim()).filter((id) => Boolean(id));
+  const primaryIds = (data.data ?? []).map((m) => m.id?.trim()).filter((id) => Boolean(id));
   const apiModels = await fetchApiModels();
   const apiById = /* @__PURE__ */ new Map();
+  const secondaryIds = [];
   for (const model of apiModels ?? []) {
     const id = asNonEmptyString(model.id);
     if (!id) continue;
     apiById.set(id, model);
+    secondaryIds.push(id);
   }
+  const ids = Array.from(/* @__PURE__ */ new Set([...primaryIds, ...secondaryIds]));
   const modelsRootDir = await resolveModelsRootDir();
   const localMetadataEntries = await Promise.all(
     ids.map(async (id) => {
@@ -1527,25 +1587,18 @@ async function generate2(model, prompt, options) {
   try {
     const baseUrl = getLMStudioBaseUrl();
     const url = new URL("/api/v1/chat", baseUrl);
-    const doRequest = (includeSampling) => fetch(url, {
-      method: "POST",
-      headers: getLMStudioHeaders(),
-      body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, includeSampling)),
-      signal: controller.signal
-    });
-    let resp = await doRequest(true);
-    if (!resp.ok) {
-      const body = await resp.text().catch(() => "");
-      if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
-        resp = await doRequest(false);
-      } else {
-        throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
-      }
-    }
-    if (!resp.ok) {
-      const body = await resp.text().catch(() => "");
-      throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
-    }
+    const resp = await negotiateRequest(
+      "generate",
+      model,
+      baseUrl,
+      options,
+      (sampling, limitMode) => fetch(url, {
+        method: "POST",
+        headers: getLMStudioHeaders(),
+        body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, sampling, limitMode)),
+        signal: controller.signal
+      })
+    );
     const payload = await resp.json();
     const nativeResponse = extractNativeResponse(payload);
     const response = nativeResponse.response;
@@ -1603,25 +1656,18 @@ async function generateStream2(model, prompt, callbacks, options) {
   };
   try {
     resetStallTimer();
-    const doRequest = (includeSampling) => fetch(url, {
-      method: "POST",
-      headers: getLMStudioHeaders(),
-      body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, includeSampling)),
-      signal: controller.signal
-    });
-    let resp = await doRequest(true);
-    if (!resp.ok) {
-      const body = await resp.text().catch(() => "");
-      if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
-        resp = await doRequest(false);
-      } else {
-        throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
-      }
-    }
-    if (!resp.ok) {
-      const body = await resp.text().catch(() => "");
-      throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
-    }
+    const resp = await negotiateRequest(
+      "stream",
+      model,
+      baseUrl,
+      options,
+      (sampling, limitMode) => fetch(url, {
+        method: "POST",
+        headers: getLMStudioHeaders(),
+        body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, sampling, limitMode)),
+        signal: controller.signal
+      })
+    );
     if (!resp.body) {
       throw new Error("LM Studio stream response body is empty");
     }