metrillm 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.mjs +89 -44
  2. package/package.json +1 -1
package/dist/index.mjs CHANGED
@@ -5500,9 +5500,17 @@ function hasSamplingOverrides2(options) {
5500
5500
  function isUnsupportedSamplingMessage(status, text) {
5501
5501
  if (status !== 400 && status !== 422) return false;
5502
5502
  const lower = text.toLowerCase();
5503
- const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
5504
- if (!mentionsSampling) return false;
5505
- return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
5503
+ if (!/\b(seed|top_p|topp)\b/.test(lower)) return false;
5504
+ return UNSUPPORTED_SAMPLING_FIELD_PATTERN.test(lower);
5505
+ }
5506
+ function isUnsupportedOutputLimitMessage(status, text, mode) {
5507
+ if (status !== 400 && status !== 422) return false;
5508
+ const lower = text.toLowerCase();
5509
+ const fieldName = mode === "legacy" ? "max_tokens" : "max_output_tokens";
5510
+ const alternateFieldName = mode === "legacy" ? "max_output_tokens" : "max_tokens";
5511
+ const mentionsUnsupportedCurrentField = lower.includes(fieldName) && UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN.test(lower);
5512
+ const mentionsRequiredAlternateField = lower.includes(alternateFieldName) && /\b(required|missing)\b/.test(lower);
5513
+ return mentionsUnsupportedCurrentField || mentionsRequiredAlternateField;
5506
5514
  }
5507
5515
  function extractLMStudioErrorMessage(body) {
5508
5516
  const trimmed = body.trim();
@@ -5536,20 +5544,65 @@ function buildLMStudioRequestError(kind, model, status, statusText, body) {
5536
5544
  const suffix = backendMessage ? ` ${backendMessage}` : "";
5537
5545
  return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
5538
5546
  }
5539
- function buildNativeChatBody(model, prompt, options, stream, includeSampling) {
5547
+ function buildUnsupportedOutputLimitNegotiationError(kind, model, body) {
5548
+ const backendMessage = extractLMStudioErrorMessage(body);
5549
+ return new Error(
5550
+ [
5551
+ `LM Studio ${kind} failed for "${model}" because this backend rejected both max_output_tokens and max_tokens.`,
5552
+ "MetriLLM cannot safely continue without an explicit output limit because benchmarks rely on bounded generation.",
5553
+ backendMessage ? `Backend error: ${backendMessage}` : void 0
5554
+ ].filter(Boolean).join(" ")
5555
+ );
5556
+ }
5557
+ function buildNativeChatBody(model, prompt, options, stream, includeSampling, outputLimitMode) {
5540
5558
  const reasoning = buildNativeThinkingOption(options?.think);
5559
+ const outputLimit = options?.num_predict !== void 0 ? options.num_predict : 512;
5541
5560
  return {
5542
5561
  model,
5543
5562
  input: prompt,
5544
5563
  temperature: options?.temperature ?? 0,
5545
5564
  ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
5546
5565
  ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
5547
- max_tokens: options?.num_predict ?? 512,
5566
+ ...outputLimitMode === "preferred" ? { max_output_tokens: outputLimit } : {},
5567
+ ...outputLimitMode === "legacy" ? { max_tokens: outputLimit } : {},
5548
5568
  stream,
5549
5569
  ...reasoning !== void 0 ? { reasoning } : {},
5550
5570
  ...options?.think === false ? { system_prompt: NON_THINKING_SYSTEM_PROMPT } : {}
5551
5571
  };
5552
5572
  }
5573
+ async function negotiateRequest(kind, model, cacheKey, options, makeRequest) {
5574
+ let includeSampling = true;
5575
+ let outputLimitMode = outputLimitModeCache.get(cacheKey) ?? "preferred";
5576
+ const triedOutputLimitModes = /* @__PURE__ */ new Set([outputLimitMode]);
5577
+ let resp = await makeRequest(includeSampling, outputLimitMode);
5578
+ let retries = 0;
5579
+ while (!resp.ok && retries < MAX_NEGOTIATE_RETRIES) {
5580
+ retries++;
5581
+ const body = await resp.text().catch(() => "");
5582
+ if (includeSampling && hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
5583
+ includeSampling = false;
5584
+ resp = await makeRequest(includeSampling, outputLimitMode);
5585
+ continue;
5586
+ }
5587
+ if (isUnsupportedOutputLimitMessage(resp.status, body, outputLimitMode)) {
5588
+ const nextMode = outputLimitMode === "preferred" ? "legacy" : !triedOutputLimitModes.has("preferred") ? "preferred" : null;
5589
+ if (!nextMode) {
5590
+ throw buildUnsupportedOutputLimitNegotiationError(kind, model, body);
5591
+ }
5592
+ outputLimitMode = nextMode;
5593
+ triedOutputLimitModes.add(outputLimitMode);
5594
+ resp = await makeRequest(includeSampling, outputLimitMode);
5595
+ continue;
5596
+ }
5597
+ throw buildLMStudioRequestError(kind, model, resp.status, resp.statusText, body);
5598
+ }
5599
+ if (!resp.ok) {
5600
+ const body = await resp.text().catch(() => "");
5601
+ throw buildLMStudioRequestError(kind, model, resp.status, resp.statusText, body);
5602
+ }
5603
+ outputLimitModeCache.set(cacheKey, outputLimitMode);
5604
+ return resp;
5605
+ }
5553
5606
  function getNativeStatNumber(value) {
5554
5607
  if (typeof value !== "number" || !Number.isFinite(value) || value < 0) return void 0;
5555
5608
  return value;
@@ -6258,25 +6311,18 @@ async function generate2(model, prompt, options) {
6258
6311
  try {
6259
6312
  const baseUrl = getLMStudioBaseUrl();
6260
6313
  const url = new URL("/api/v1/chat", baseUrl);
6261
- const doRequest = (includeSampling) => fetch(url, {
6262
- method: "POST",
6263
- headers: getLMStudioHeaders(),
6264
- body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, includeSampling)),
6265
- signal: controller.signal
6266
- });
6267
- let resp = await doRequest(true);
6268
- if (!resp.ok) {
6269
- const body = await resp.text().catch(() => "");
6270
- if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
6271
- resp = await doRequest(false);
6272
- } else {
6273
- throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
6274
- }
6275
- }
6276
- if (!resp.ok) {
6277
- const body = await resp.text().catch(() => "");
6278
- throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
6279
- }
6314
+ const resp = await negotiateRequest(
6315
+ "generate",
6316
+ model,
6317
+ baseUrl,
6318
+ options,
6319
+ (sampling, limitMode) => fetch(url, {
6320
+ method: "POST",
6321
+ headers: getLMStudioHeaders(),
6322
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, false, sampling, limitMode)),
6323
+ signal: controller.signal
6324
+ })
6325
+ );
6280
6326
  const payload = await resp.json();
6281
6327
  const nativeResponse = extractNativeResponse(payload);
6282
6328
  const response = nativeResponse.response;
@@ -6334,25 +6380,18 @@ async function generateStream2(model, prompt, callbacks, options) {
6334
6380
  };
6335
6381
  try {
6336
6382
  resetStallTimer();
6337
- const doRequest = (includeSampling) => fetch(url, {
6338
- method: "POST",
6339
- headers: getLMStudioHeaders(),
6340
- body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, includeSampling)),
6341
- signal: controller.signal
6342
- });
6343
- let resp = await doRequest(true);
6344
- if (!resp.ok) {
6345
- const body = await resp.text().catch(() => "");
6346
- if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
6347
- resp = await doRequest(false);
6348
- } else {
6349
- throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
6350
- }
6351
- }
6352
- if (!resp.ok) {
6353
- const body = await resp.text().catch(() => "");
6354
- throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
6355
- }
6383
+ const resp = await negotiateRequest(
6384
+ "stream",
6385
+ model,
6386
+ baseUrl,
6387
+ options,
6388
+ (sampling, limitMode) => fetch(url, {
6389
+ method: "POST",
6390
+ headers: getLMStudioHeaders(),
6391
+ body: JSON.stringify(buildNativeChatBody(model, prompt, options, true, sampling, limitMode)),
6392
+ signal: controller.signal
6393
+ })
6394
+ );
6356
6395
  if (!resp.body) {
6357
6396
  throw new Error("LM Studio stream response body is empty");
6358
6397
  }
@@ -6481,7 +6520,7 @@ function abortOngoingRequests2() {
6481
6520
  }
6482
6521
  activeAbortControllers.clear();
6483
6522
  }
6484
- var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS2, LM_STUDIO_CLI_TIMEOUT_MS, SHARED_STREAM_STALL_TIMEOUT_ENV2, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, LM_STUDIO_CLI_PATH_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache, NON_THINKING_SYSTEM_PROMPT;
6523
+ var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS2, LM_STUDIO_CLI_TIMEOUT_MS, SHARED_STREAM_STALL_TIMEOUT_ENV2, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, LM_STUDIO_CLI_PATH_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache, outputLimitModeCache, NON_THINKING_SYSTEM_PROMPT, UNSUPPORTED_SAMPLING_FIELD_PATTERN, UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN, MAX_NEGOTIATE_RETRIES;
6485
6524
  var init_lm_studio_client = __esm({
6486
6525
  "src/core/lm-studio-client.ts"() {
6487
6526
  init_utils();
@@ -6499,12 +6538,16 @@ var init_lm_studio_client = __esm({
6499
6538
  activeAbortControllers = /* @__PURE__ */ new Set();
6500
6539
  directorySizeCache = /* @__PURE__ */ new Map();
6501
6540
  modelDefinitionCache = /* @__PURE__ */ new Map();
6541
+ outputLimitModeCache = /* @__PURE__ */ new Map();
6502
6542
  NON_THINKING_SYSTEM_PROMPT = [
6503
6543
  "You are in non-thinking mode for benchmark reproducibility.",
6504
6544
  "Return only the final answer.",
6505
6545
  "Do not output internal reasoning, chain-of-thought, or scratchpad.",
6506
6546
  "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process."
6507
6547
  ].join(" ");
6548
+ UNSUPPORTED_SAMPLING_FIELD_PATTERN = /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/;
6549
+ UNSUPPORTED_OUTPUT_LIMIT_FIELD_PATTERN = /unrecognized|unknown|not support|unsupported|unexpected|additional|extra|invalid field/;
6550
+ MAX_NEGOTIATE_RETRIES = 5;
6508
6551
  }
6509
6552
  });
6510
6553
 
@@ -53938,6 +53981,7 @@ async function selectWithArrows(title, options, config = {}) {
53938
53981
  };
53939
53982
  readline4.emitKeypressEvents(stdin);
53940
53983
  stdin.resume();
53984
+ stdin.ref?.();
53941
53985
  if (stdin.isTTY) {
53942
53986
  stdin.setRawMode(true);
53943
53987
  }
@@ -54020,6 +54064,7 @@ ${source_default.dim(message)}
54020
54064
  };
54021
54065
  readline4.emitKeypressEvents(stdin);
54022
54066
  stdin.resume();
54067
+ stdin.ref?.();
54023
54068
  if (stdin.isTTY) stdin.setRawMode(true);
54024
54069
  stdout.write("\x1B[?25l");
54025
54070
  stdin.on("keypress", onKeypress);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "metrillm",
3
- "version": "0.2.4",
3
+ "version": "0.2.5",
4
4
  "description": "Benchmark your local LLM models — speed, quality & hardware fitness verdict",
5
5
  "type": "module",
6
6
  "bin": {