metrillm 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.mjs +93 -45
  2. package/package.json +1 -1
package/dist/index.mjs CHANGED
@@ -5500,9 +5500,17 @@ function hasSamplingOverrides2(options) {
  function isUnsupportedSamplingMessage(status, text) {
  if (status !== 400 && status !== 422) return false;
  const lower = text.toLowerCase();
- const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
- if (!mentionsSampling) return false;
- return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
+ if (!/\b(seed|top_p|topp)\b/.test(lower)) return false;
+ return UNSUPPORTED_SAMPLING_FIELD_PATTERN.test(lower);
+ }
+ function isUnsupportedOutputLimitMessage(status, text, mode) {
+ if (status !== 400 && status !== 422) return false;
+ const lower = text.toLowerCase();
+ const fieldName = mode === "legacy" ? "max_tokens" : "max_output_tokens";
+ const alternateFieldName = mode === "legacy" ? "max_output_tokens" : "max_tokens";
+ const mentionsUnsupportedCurrentField = lower.includes(fieldName) && UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN.test(lower);
+ const mentionsRequiredAlternateField = lower.includes(alternateFieldName) && /\b(required|missing)\b/.test(lower);
+ return mentionsUnsupportedCurrentField || mentionsRequiredAlternateField;
  }
  function extractLMStudioErrorMessage(body) {
  const trimmed = body.trim();
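The isUnsupportedOutputLimitMessage heuristic added above treats two error shapes as "this backend wants the other output-limit field": a 400/422 whose body flags the current field as unrecognized or unsupported, or one that reports the alternate field as required or missing. A standalone restatement for experimentation follows; the example error bodies are invented, not captured LM Studio output.

// sketch.mjs: standalone restatement of the heuristic (assumption: same
// behavior as the diff above; the sample bodies below are hypothetical).
const UNSUPPORTED = /unrecognized|unknown|not support|unsupported|unexpected|additional|extra|invalid field/;
function isUnsupportedOutputLimitMessage(status, text, mode) {
  if (status !== 400 && status !== 422) return false;
  const lower = text.toLowerCase();
  const field = mode === "legacy" ? "max_tokens" : "max_output_tokens";
  const alternate = mode === "legacy" ? "max_output_tokens" : "max_tokens";
  return (lower.includes(field) && UNSUPPORTED.test(lower)) || (lower.includes(alternate) && /\b(required|missing)\b/.test(lower));
}
console.log(isUnsupportedOutputLimitMessage(400, "Unrecognized field: max_output_tokens", "preferred")); // true: current field rejected
console.log(isUnsupportedOutputLimitMessage(422, "max_tokens is required", "preferred")); // true: alternate field demanded
console.log(isUnsupportedOutputLimitMessage(400, "context length exceeded", "preferred")); // false: unrelated 400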
@@ -5536,20 +5544,65 @@ function buildLMStudioRequestError(kind, model, status, statusText, body) {
  const suffix = backendMessage ? ` ${backendMessage}` : "";
  return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
  }
- function buildNativeChatBody(model, prompt, options, stream, includeSampling) {
+ function buildUnsupportedOutputLimitNegotiationError(kind, model, body) {
+ const backendMessage = extractLMStudioErrorMessage(body);
+ return new Error(
+ [
+ `LM Studio ${kind} failed for "${model}" because this backend rejected both max_output_tokens and max_tokens.`,
+ "MetriLLM cannot safely continue without an explicit output limit because benchmarks rely on bounded generation.",
+ backendMessage ? `Backend error: ${backendMessage}` : void 0
+ ].filter(Boolean).join(" ")
+ );
+ }
+ function buildNativeChatBody(model, prompt, options, stream, includeSampling, outputLimitMode) {
  const reasoning = buildNativeThinkingOption(options?.think);
+ const outputLimit = options?.num_predict !== void 0 ? options.num_predict : 512;
  return {
  model,
  input: prompt,
  temperature: options?.temperature ?? 0,
  ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
  ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
- max_tokens: options?.num_predict ?? 512,
+ ...outputLimitMode === "preferred" ? { max_output_tokens: outputLimit } : {},
+ ...outputLimitMode === "legacy" ? { max_tokens: outputLimit } : {},
  stream,
  ...reasoning !== void 0 ? { reasoning } : {},
  ...options?.think === false ? { system_prompt: NON_THINKING_SYSTEM_PROMPT } : {}
  };
  }
+ async function negotiateRequest(kind, model, cacheKey, options, makeRequest) {
+ let includeSampling = true;
+ let outputLimitMode = outputLimitModeCache.get(cacheKey) ?? "preferred";
+ const triedOutputLimitModes = /* @__PURE__ */ new Set([outputLimitMode]);
+ let resp = await makeRequest(includeSampling, outputLimitMode);
+ let retries = 0;
+ while (!resp.ok && retries < MAX_NEGOTIATE_RETRIES) {
+ retries++;
+ const body = await resp.text().catch(() => "");
+ if (includeSampling && hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
+ includeSampling = false;
+ resp = await makeRequest(includeSampling, outputLimitMode);
+ continue;
+ }
+ if (isUnsupportedOutputLimitMessage(resp.status, body, outputLimitMode)) {
+ const nextMode = outputLimitMode === "preferred" ? "legacy" : !triedOutputLimitModes.has("preferred") ? "preferred" : null;
+ if (!nextMode) {
+ throw buildUnsupportedOutputLimitNegotiationError(kind, model, body);
+ }
+ outputLimitMode = nextMode;
+ triedOutputLimitModes.add(outputLimitMode);
+ resp = await makeRequest(includeSampling, outputLimitMode);
+ continue;
+ }
+ throw buildLMStudioRequestError(kind, model, resp.status, resp.statusText, body);
+ }
+ if (!resp.ok) {
+ const body = await resp.text().catch(() => "");
+ throw buildLMStudioRequestError(kind, model, resp.status, resp.statusText, body);
+ }
+ outputLimitModeCache.set(cacheKey, outputLimitMode);
+ return resp;
+ }
  function getNativeStatNumber(value) {
  if (typeof value !== "number" || !Number.isFinite(value) || value < 0) return void 0;
  return value;
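Taken together, the new negotiateRequest retries a failing request up to MAX_NEGOTIATE_RETRIES (5) times: it first drops seed/top_p when the backend rejects sampling fields, then swaps the output-limit field between max_output_tokens ("preferred") and max_tokens ("legacy"), and caches the mode that worked per cache key so later requests start on the right field. A simplified sketch of just the mode swap and cache follows; the real helper also inspects the error body and handles the sampling fallback, and fakeBackend is hypothetical.

// negotiate-sketch.mjs: simplified model of the output-limit negotiation.
const outputLimitModeCache = new Map();
async function sketchNegotiate(cacheKey, makeRequest) {
  let mode = outputLimitModeCache.get(cacheKey) ?? "preferred"; // max_output_tokens first
  let resp = await makeRequest(mode);
  if (!resp.ok) {
    mode = mode === "preferred" ? "legacy" : "preferred"; // swap to the other field once
    resp = await makeRequest(mode);
  }
  if (!resp.ok) throw new Error("backend rejected both output-limit fields");
  outputLimitModeCache.set(cacheKey, mode); // remember what worked for this backend
  return resp;
}
// Hypothetical backend that only accepts the legacy max_tokens field:
const fakeBackend = async (mode) => (mode === "legacy" ? { ok: true } : { ok: false });
await sketchNegotiate("http://localhost:1234", fakeBackend); // retry with "legacy" succeeds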
@@ -6210,14 +6263,17 @@ async function listModels2() {
  throw new Error(`LM Studio list models failed (${resp.status} ${resp.statusText})`);
  }
  const data = await resp.json();
- const ids = (data.data ?? []).map((m) => m.id?.trim()).filter((id) => Boolean(id));
+ const primaryIds = (data.data ?? []).map((m) => m.id?.trim()).filter((id) => Boolean(id));
  const apiModels = await fetchApiModels();
  const apiById = /* @__PURE__ */ new Map();
+ const secondaryIds = [];
  for (const model of apiModels ?? []) {
  const id = asNonEmptyString(model.id);
  if (!id) continue;
  apiById.set(id, model);
+ secondaryIds.push(id);
  }
+ const ids = Array.from(/* @__PURE__ */ new Set([...primaryIds, ...secondaryIds]));
  const modelsRootDir = await resolveModelsRootDir();
  const localMetadataEntries = await Promise.all(
  ids.map(async (id) => {
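The net effect of the primaryIds/secondaryIds change is that models reported only by fetchApiModels() now appear in the final list, deduplicated in first-seen order. A tiny illustration with hypothetical model ids:

// Array.from(new Set([...])) preserves insertion order, so primary ids keep
// their position and API-only ids are appended once (ids here are made up).
const primaryIds = ["llama-3.1-8b", "qwen2.5-7b"];
const secondaryIds = ["qwen2.5-7b", "phi-4"];
console.log(Array.from(new Set([...primaryIds, ...secondaryIds])));
// => ["llama-3.1-8b", "qwen2.5-7b", "phi-4"]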
@@ -6255,25 +6311,18 @@ async function generate2(model, prompt, options) {
  try {
  const baseUrl = getLMStudioBaseUrl();
  const url = new URL("/api/v1/chat", baseUrl);
- const doRequest = (includeSampling) => fetch(url, {
- method: "POST",
- headers: getLMStudioHeaders(),
- body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, includeSampling)),
- signal: controller.signal
- });
- let resp = await doRequest(true);
- if (!resp.ok) {
- const body = await resp.text().catch(() => "");
- if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
- resp = await doRequest(false);
- } else {
- throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
- }
- }
- if (!resp.ok) {
- const body = await resp.text().catch(() => "");
- throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
- }
+ const resp = await negotiateRequest(
+ "generate",
+ model,
+ baseUrl,
+ options,
+ (sampling, limitMode) => fetch(url, {
+ method: "POST",
+ headers: getLMStudioHeaders(),
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, sampling, limitMode)),
+ signal: controller.signal
+ })
+ );
  const payload = await resp.json();
  const nativeResponse = extractNativeResponse(payload);
  const response = nativeResponse.response;
@@ -6331,25 +6380,18 @@ async function generateStream2(model, prompt, callbacks, options) {
  };
  try {
  resetStallTimer();
- const doRequest = (includeSampling) => fetch(url, {
- method: "POST",
- headers: getLMStudioHeaders(),
- body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, includeSampling)),
- signal: controller.signal
- });
- let resp = await doRequest(true);
- if (!resp.ok) {
- const body = await resp.text().catch(() => "");
- if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
- resp = await doRequest(false);
- } else {
- throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
- }
- }
- if (!resp.ok) {
- const body = await resp.text().catch(() => "");
- throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
- }
+ const resp = await negotiateRequest(
+ "stream",
+ model,
+ baseUrl,
+ options,
+ (sampling, limitMode) => fetch(url, {
+ method: "POST",
+ headers: getLMStudioHeaders(),
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, sampling, limitMode)),
+ signal: controller.signal
+ })
+ );
  if (!resp.body) {
  throw new Error("LM Studio stream response body is empty");
  }
@@ -6478,7 +6520,7 @@ function abortOngoingRequests2() {
  }
  activeAbortControllers.clear();
  }
- var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS2, LM_STUDIO_CLI_TIMEOUT_MS, SHARED_STREAM_STALL_TIMEOUT_ENV2, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, LM_STUDIO_CLI_PATH_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache, NON_THINKING_SYSTEM_PROMPT;
+ var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS2, LM_STUDIO_CLI_TIMEOUT_MS, SHARED_STREAM_STALL_TIMEOUT_ENV2, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, LM_STUDIO_CLI_PATH_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache, outputLimitModeCache, NON_THINKING_SYSTEM_PROMPT, UNSUPPORTED_SAMPLING_FIELD_PATTERN, UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN, MAX_NEGOTIATE_RETRIES;
  var init_lm_studio_client = __esm({
  "src/core/lm-studio-client.ts"() {
  init_utils();
@@ -6496,12 +6538,16 @@ var init_lm_studio_client = __esm({
  activeAbortControllers = /* @__PURE__ */ new Set();
  directorySizeCache = /* @__PURE__ */ new Map();
  modelDefinitionCache = /* @__PURE__ */ new Map();
+ outputLimitModeCache = /* @__PURE__ */ new Map();
  NON_THINKING_SYSTEM_PROMPT = [
  "You are in non-thinking mode for benchmark reproducibility.",
  "Return only the final answer.",
  "Do not output internal reasoning, chain-of-thought, or scratchpad.",
  "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process."
  ].join(" ");
+ UNSUPPORTED_SAMPLING_FIELD_PATTERN = /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/;
+ UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN = /unrecognized|unknown|not support|unsupported|unexpected|additional|extra|invalid field/;
+ MAX_NEGOTIATE_RETRIES = 5;
  }
  });
 
@@ -53935,6 +53981,7 @@ async function selectWithArrows(title, options, config = {}) {
  };
  readline4.emitKeypressEvents(stdin);
  stdin.resume();
+ stdin.ref?.();
  if (stdin.isTTY) {
  stdin.setRawMode(true);
  }
@@ -54017,6 +54064,7 @@ ${source_default.dim(message)}
  };
  readline4.emitKeypressEvents(stdin);
  stdin.resume();
+ stdin.ref?.();
  if (stdin.isTTY) stdin.setRawMode(true);
  stdout.write("\x1B[?25l");
  stdin.on("keypress", onKeypress);
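Both interactive prompts now call stdin.ref?.() after stdin.resume(). A plausible reading (an assumption; the diff does not say) is that if an earlier prompt unref()ed stdin during cleanup, resume() alone does not make the handle count toward keeping the event loop alive again, so the process could exit before a keypress arrives; the optional chaining guards stdin implementations that lack ref(). A minimal Node.js sketch of that assumed failure mode:

// Assumed failure mode, not taken from the package:
process.stdin.unref(); // e.g. a previous prompt's cleanup
process.stdin.resume(); // flowing again, but still unref()ed ...
process.stdin.ref?.(); // ... without this, nothing keeps the loop alive
process.stdin.once("data", () => process.exit(0)); // now waits for input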
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "metrillm",
- "version": "0.2.3",
+ "version": "0.2.5",
  "description": "Benchmark your local LLM models — speed, quality & hardware fitness verdict",
  "type": "module",
  "bin": {