metrillm-mcp 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -735,6 +735,7 @@ var defaultKeepAlive2;
735
735
  // Set of abort controllers; presumably the ones belonging to in-flight requests — its users are outside this excerpt, confirm.
  var activeAbortControllers = /* @__PURE__ */ new Set();
736
736
  // Map used as a cache; presumably directory path -> size — TODO confirm against its users (not visible here).
  var directorySizeCache = /* @__PURE__ */ new Map();
737
737
  // Map used as a cache; presumably model name -> model definition — TODO confirm against its users (not visible here).
  var modelDefinitionCache = /* @__PURE__ */ new Map();
738
// Maps a cache key (the callers visible in this diff pass `baseUrl`) to the output-limit mode
// ("preferred" or "legacy") that negotiateRequest last completed a successful request with.
+ var outputLimitModeCache = /* @__PURE__ */ new Map();
738
739
  var NON_THINKING_SYSTEM_PROMPT = [
739
740
  "You are in non-thinking mode for benchmark reproducibility.",
740
741
  "Return only the final answer.",
@@ -765,12 +766,22 @@ function buildNativeThinkingOption(think) {
765
766
  function hasSamplingOverrides2(options) {
766
767
  return options?.top_p !== void 0 || options?.seed !== void 0;
767
768
  }
769
// Phrases that mark an error body as "this field is not supported".
// isUnsupportedSamplingMessage tests the lower-cased body against it after confirming a sampling field is named.
+ var UNSUPPORTED_SAMPLING_FIELD_PATTERN = /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/;
770
// Same role for the output-limit fields (used by isUnsupportedOutputLimitMessage).
// Unlike the sampling pattern it only accepts "invalid field", not a bare "invalid" —
// presumably so an invalid *value* for a supported limit field is not read as "field unsupported"; confirm intent.
+ var UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN = /unrecognized|unknown|not support|unsupported|unexpected|additional|extra|invalid field/;
768
771
  function isUnsupportedSamplingMessage(status, text) {
769
772
  if (status !== 400 && status !== 422) return false;
770
773
  const lower = text.toLowerCase();
771
- const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
772
- if (!mentionsSampling) return false;
773
- return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
774
+ if (!/\b(seed|top_p|topp)\b/.test(lower)) return false;
775
+ return UNSUPPORTED_SAMPLING_FIELD_PATTERN.test(lower);
776
+ }
777
/**
 * Decides whether a failed response means the backend rejected the output-limit
 * field that was sent for the given mode.
 *
 * @param {number} status - HTTP status of the failed response.
 * @param {string} text - Raw response body.
 * @param {string} mode - "legacy" means `max_tokens` was sent; any other value means `max_output_tokens`.
 * @returns {boolean} True for a 400/422 whose body either names the sent field together with an
 *   "unsupported"-style phrase, or names the other field together with "required"/"missing".
 */
function isUnsupportedOutputLimitMessage(status, text, mode) {
  if (!(status === 400 || status === 422)) {
    return false;
  }
  const message = text.toLowerCase();
  const [sentField, otherField] = mode === "legacy"
    ? ["max_tokens", "max_output_tokens"]
    : ["max_output_tokens", "max_tokens"];
  if (message.includes(sentField) && UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN.test(message)) {
    return true;
  }
  return message.includes(otherField) && /\b(required|missing)\b/.test(message);
}
775
786
  function extractLMStudioErrorMessage(body) {
776
787
  const trimmed = body.trim();
@@ -804,20 +815,66 @@ function buildLMStudioRequestError(kind, model, status, statusText, body) {
804
815
  const suffix = backendMessage ? ` ${backendMessage}` : "";
805
816
  return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
806
817
  }
807
- function buildNativeChatBody(model, prompt, options, stream, includeSampling) {
818
/**
 * Creates the error raised when the backend has rejected both output-limit fields.
 *
 * @param {string} kind - Request kind used in the message (callers pass "generate" or "stream").
 * @param {string} model - Model name quoted in the message.
 * @param {string} body - Raw body of the last failed response; its extracted message is appended when non-empty.
 * @returns {Error} Error whose message is the space-joined sentences below.
 */
function buildUnsupportedOutputLimitNegotiationError(kind, model, body) {
  const backendMessage = extractLMStudioErrorMessage(body);
  const sentences = [
    `LM Studio ${kind} failed for "${model}" because this backend rejected both max_output_tokens and max_tokens.`,
    "MetriLLM cannot safely continue without an explicit output limit because benchmarks rely on bounded generation."
  ];
  if (backendMessage) {
    sentences.push(`Backend error: ${backendMessage}`);
  }
  return new Error(sentences.join(" "));
}
828
/**
 * Builds the JSON body for a chat request (the callers in this file POST it to `/api/v1/chat`).
 *
 * @param {string} model - Model identifier.
 * @param {string} prompt - Prompt text, sent as `input`.
 * @param {object | null | undefined} options - Optional settings read here: `temperature`, `top_p`,
 *   `seed`, `num_predict`, `think`.
 * @param {boolean} stream - Value of the `stream` field.
 * @param {boolean} includeSampling - When falsy, `top_p` and `seed` are left out even if set.
 * @param {string} [outputLimitMode="preferred"] - "preferred" sends the limit as `max_output_tokens`,
 *   "legacy" sends it as `max_tokens`. Defaults to "preferred" so a caller that omits the argument
 *   still produces a bounded request instead of silently dropping the limit.
 * @returns {object} Request body ready for JSON.stringify.
 */
function buildNativeChatBody(model, prompt, options, stream, includeSampling, outputLimitMode = "preferred") {
  const reasoning = buildNativeThinkingOption(options?.think);
  // `??` (not `!== undefined`) so an explicit null also falls back to 512, as the
  // previous release did; a null limit would otherwise reach the backend as-is.
  const outputLimit = options?.num_predict ?? 512;
  return {
    model,
    input: prompt,
    temperature: options?.temperature ?? 0,
    ...(includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {}),
    ...(includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {}),
    ...(outputLimitMode === "preferred" ? { max_output_tokens: outputLimit } : {}),
    ...(outputLimitMode === "legacy" ? { max_tokens: outputLimit } : {}),
    stream,
    ...(reasoning !== void 0 ? { reasoning } : {}),
    ...(options?.think === false ? { system_prompt: NON_THINKING_SYSTEM_PROMPT } : {})
  };
}
844
// Upper bound on follow-up requests issued by negotiateRequest after the initial attempt.
var MAX_NEGOTIATE_RETRIES = 5;
/**
 * Sends a request and, when the backend rejects optional fields, retries with a downgraded body.
 *
 * Two downgrades are attempted, in this order on each failed response:
 *   1. drop the sampling fields (`top_p` / `seed`) if the caller set any and the error names them;
 *   2. switch the output-limit field between "preferred" and "legacy" mode.
 * The mode of a successful request is stored in `outputLimitModeCache` under `cacheKey`, and the
 * next call with the same key starts from it.
 *
 * @param {string} kind - Request kind used in error messages (callers pass "generate" or "stream").
 * @param {string} model - Model name used in error messages.
 * @param {*} cacheKey - Key into `outputLimitModeCache` (the callers in this file pass the base URL).
 * @param {object | null | undefined} options - Generation options, used only to check for sampling overrides.
 * @param {(includeSampling: boolean, outputLimitMode: string) => Promise<Response>} makeRequest -
 *   Issues one request with the given negotiation state.
 * @returns {Promise<Response>} The first response with `ok === true`; its body is unread.
 * @throws {Error} When both output-limit modes are rejected, when a failure is not negotiable,
 *   or when the retry budget runs out.
 */
async function negotiateRequest(kind, model, cacheKey, options, makeRequest) {
  let includeSampling = true;
  let outputLimitMode = outputLimitModeCache.get(cacheKey) ?? "preferred";
  const triedOutputLimitModes = /* @__PURE__ */ new Set([outputLimitMode]);
  let resp = await makeRequest(includeSampling, outputLimitMode);
  let retries = 0;
  while (!resp.ok && retries < MAX_NEGOTIATE_RETRIES) {
    retries++;
    // Best-effort read: an unreadable body is treated as empty so the status-based error still surfaces.
    const body = await resp.text().catch(() => "");
    if (includeSampling && hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
      includeSampling = false;
      resp = await makeRequest(includeSampling, outputLimitMode);
      continue;
    }
    if (isUnsupportedOutputLimitMessage(resp.status, body, outputLimitMode)) {
      // Pick the first mode not attempted yet. Choosing by the current mode alone would bounce
      // legacy -> preferred -> legacy when starting from a cached "legacy", re-sending a request
      // the backend has already rejected.
      const nextMode = ["preferred", "legacy"].find((mode) => !triedOutputLimitModes.has(mode));
      if (nextMode === void 0) {
        throw buildUnsupportedOutputLimitNegotiationError(kind, model, body);
      }
      outputLimitMode = nextMode;
      triedOutputLimitModes.add(outputLimitMode);
      resp = await makeRequest(includeSampling, outputLimitMode);
      continue;
    }
    throw buildLMStudioRequestError(kind, model, resp.status, resp.statusText, body);
  }
  if (!resp.ok) {
    // Retry budget exhausted: report the last failure, whose body has not been read yet.
    const body = await resp.text().catch(() => "");
    throw buildLMStudioRequestError(kind, model, resp.status, resp.statusText, body);
  }
  outputLimitModeCache.set(cacheKey, outputLimitMode);
  return resp;
}
821
878
  function getNativeStatNumber(value) {
822
879
  if (typeof value !== "number" || !Number.isFinite(value) || value < 0) return void 0;
823
880
  return value;
@@ -1530,25 +1587,18 @@ async function generate2(model, prompt, options) {
1530
1587
  try {
1531
1588
  const baseUrl = getLMStudioBaseUrl();
1532
1589
  const url = new URL("/api/v1/chat", baseUrl);
1533
- const doRequest = (includeSampling) => fetch(url, {
1534
- method: "POST",
1535
- headers: getLMStudioHeaders(),
1536
- body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, includeSampling)),
1537
- signal: controller.signal
1538
- });
1539
- let resp = await doRequest(true);
1540
- if (!resp.ok) {
1541
- const body = await resp.text().catch(() => "");
1542
- if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
1543
- resp = await doRequest(false);
1544
- } else {
1545
- throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
1546
- }
1547
- }
1548
- if (!resp.ok) {
1549
- const body = await resp.text().catch(() => "");
1550
- throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
1551
- }
1590
+ const resp = await negotiateRequest(
1591
+ "generate",
1592
+ model,
1593
+ baseUrl,
1594
+ options,
1595
+ (sampling, limitMode) => fetch(url, {
1596
+ method: "POST",
1597
+ headers: getLMStudioHeaders(),
1598
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, sampling, limitMode)),
1599
+ signal: controller.signal
1600
+ })
1601
+ );
1552
1602
  const payload = await resp.json();
1553
1603
  const nativeResponse = extractNativeResponse(payload);
1554
1604
  const response = nativeResponse.response;
@@ -1606,25 +1656,18 @@ async function generateStream2(model, prompt, callbacks, options) {
1606
1656
  };
1607
1657
  try {
1608
1658
  resetStallTimer();
1609
- const doRequest = (includeSampling) => fetch(url, {
1610
- method: "POST",
1611
- headers: getLMStudioHeaders(),
1612
- body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, includeSampling)),
1613
- signal: controller.signal
1614
- });
1615
- let resp = await doRequest(true);
1616
- if (!resp.ok) {
1617
- const body = await resp.text().catch(() => "");
1618
- if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
1619
- resp = await doRequest(false);
1620
- } else {
1621
- throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
1622
- }
1623
- }
1624
- if (!resp.ok) {
1625
- const body = await resp.text().catch(() => "");
1626
- throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
1627
- }
1659
+ const resp = await negotiateRequest(
1660
+ "stream",
1661
+ model,
1662
+ baseUrl,
1663
+ options,
1664
+ (sampling, limitMode) => fetch(url, {
1665
+ method: "POST",
1666
+ headers: getLMStudioHeaders(),
1667
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, sampling, limitMode)),
1668
+ signal: controller.signal
1669
+ })
1670
+ );
1628
1671
  if (!resp.body) {
1629
1672
  throw new Error("LM Studio stream response body is empty");
1630
1673
  }