metrillm 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +23 -18
  2. package/dist/index.mjs +275 -51
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -13,7 +13,8 @@
13
13
  > Think Geekbench, but for local LLMs on your actual hardware.
14
14
 
15
15
  ```bash
16
- npx metrillm@latest bench
16
+ npm install -g metrillm
17
+ metrillm bench
17
18
  ```
18
19
 
19
20
  <p align="center">
@@ -56,26 +57,19 @@ npx metrillm@latest bench
56
57
  > [Ollama](https://ollama.com/) or [LM Studio](https://lmstudio.ai/).
57
58
 
58
59
  ```bash
59
- # Run directly (no install)
60
- npx metrillm@latest bench
61
-
62
- # Or install globally
63
- npm i -g metrillm
60
+ # Install globally
61
+ npm install -g metrillm
64
62
  metrillm bench
65
63
 
66
- # Homebrew (no global npm install)
67
- # One-liner install (without pre-tapping):
68
- brew install MetriLLM/metrillm/metrillm
64
+ # Alternative package managers
65
+ pnpm add -g metrillm
66
+ bun add -g metrillm
69
67
 
70
- # Or one-time tap for short install command:
71
- brew tap MetriLLM/metrillm
72
- # Then:
73
- brew install metrillm
74
- metrillm bench
68
+ # Homebrew
69
+ brew install MetriLLM/metrillm/metrillm
75
70
 
76
- # Alternative package managers
77
- pnpm dlx metrillm@latest bench
78
- bunx metrillm@latest bench
71
+ # Or run without installing
72
+ npx metrillm@latest bench
79
73
  ```
80
74
 
81
75
  ## Usage
@@ -134,6 +128,17 @@ For very large models, tune timeout flags:
134
128
  - `--coding-timeout-ms` (default `240000`)
135
129
  - `--lm-studio-stream-stall-timeout-ms` (default `180000`, `0` disables stall timeout)
136
130
 
131
+ Benchmark Profile v1 (applied to all benchmark prompts):
132
+ - `temperature=0`
133
+ - `top_p=1`
134
+ - `seed=42`
135
+ - `thinking` follows your benchmark mode (`--thinking` / `--no-thinking`)
136
+ - Context window stays runtime default (`context=runtime-default`) and is recorded as such in metadata.
137
+
138
+ LM Studio non-thinking guard:
139
+ - When benchmark mode requests non-thinking (`--no-thinking` or default), MetriLLM now aborts if the model still emits reasoning traces (for result comparability).
140
+ - To disable it in LM Studio for affected models, put this at the top of the model chat template: `{%- set enable_thinking = false %}` then eject/reload the model.
141
+
137
142
  ## How Scoring Works
138
143
 
139
144
  **Hardware Fit Score** (0-100) — how well the model runs on your machine:
@@ -258,7 +263,7 @@ The tap formula lives in `Formula/metrillm.rb`.
258
263
  ./scripts/update-homebrew-formula.sh
259
264
 
260
265
  # Or pin a specific version
261
- ./scripts/update-homebrew-formula.sh 0.2.0
266
+ ./scripts/update-homebrew-formula.sh 0.2.1
262
267
  ```
263
268
 
264
269
  After updating the formula, commit and push so users can install/update with:
package/dist/index.mjs CHANGED
@@ -4793,7 +4793,8 @@ function stripThinkTags(text) {
4793
4793
  }
4794
4794
  function hasThinkingContent(response, thinkingField) {
4795
4795
  if (thinkingField && thinkingField.trim().length > 0) return true;
4796
- return /<think(?:ing)?[\s>]/i.test(response);
4796
+ if (/<think(?:ing)?[\s>]/i.test(response)) return true;
4797
+ return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response);
4797
4798
  }
4798
4799
  function estimateTokenCount(text) {
4799
4800
  if (!text) return 0;
@@ -5246,25 +5247,50 @@ async function listRunningModels() {
5246
5247
  function setDefaultKeepAlive(keepAlive) {
5247
5248
  defaultKeepAlive = keepAlive;
5248
5249
  }
5250
+ function hasSamplingOverrides(options) {
5251
+ return options?.top_p !== void 0 || options?.seed !== void 0;
5252
+ }
5253
+ function isUnsupportedSamplingOptionError(err) {
5254
+ const message = err instanceof Error ? err.message : String(err);
5255
+ const lower = message.toLowerCase();
5256
+ const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
5257
+ if (!mentionsSampling) return false;
5258
+ return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
5259
+ }
5260
+ function buildGenerateRequest(model, prompt, options, includeSampling) {
5261
+ return {
5262
+ model,
5263
+ prompt,
5264
+ stream: true,
5265
+ keep_alive: options?.keep_alive ?? defaultKeepAlive,
5266
+ ...options?.think !== void 0 ? { think: options.think } : {},
5267
+ options: {
5268
+ temperature: options?.temperature ?? 0,
5269
+ ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
5270
+ ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
5271
+ num_predict: options?.num_predict ?? 512
5272
+ }
5273
+ };
5274
+ }
5249
5275
  async function generate(model, prompt, options) {
5250
5276
  return generateStream(model, prompt, void 0, options);
5251
5277
  }
5252
5278
  async function generateStream(model, prompt, callbacks, options) {
5253
- const stream = await withTimeout(
5254
- client.generate({
5255
- model,
5256
- prompt,
5257
- stream: true,
5258
- keep_alive: options?.keep_alive ?? defaultKeepAlive,
5259
- ...options?.think !== void 0 ? { think: options.think } : {},
5260
- options: {
5261
- temperature: options?.temperature ?? 0,
5262
- num_predict: options?.num_predict ?? 512
5263
- }
5264
- }),
5279
+ const initializeStream = (includeSampling) => withTimeout(
5280
+ client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
5265
5281
  OLLAMA_INIT_TIMEOUT_MS,
5266
5282
  "Ollama generate initialization"
5267
5283
  );
5284
+ let stream;
5285
+ try {
5286
+ stream = await initializeStream(true);
5287
+ } catch (err) {
5288
+ if (hasSamplingOverrides(options) && isUnsupportedSamplingOptionError(err)) {
5289
+ stream = await initializeStream(false);
5290
+ } else {
5291
+ throw err;
5292
+ }
5293
+ }
5268
5294
  let fullResponse = "";
5269
5295
  let fullThinking = "";
5270
5296
  let result2 = null;
@@ -5344,6 +5370,23 @@ var init_ollama_client = __esm({
5344
5370
  STREAM_STALL_TIMEOUT_MS = 3e4;
5345
5371
  }
5346
5372
  });
5373
+ function hasThinkingLeakText(response) {
5374
+ return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response) || /\[(?:\/)?THINK(?:ING)?\]/i.test(response);
5375
+ }
5376
+ function assertThinkingModeRespected(model, think, response, reasoning) {
5377
+ if (think !== false) return;
5378
+ if (reasoning.trim().length > 0 || /<think(?:ing)?[\s>]/i.test(response) || hasThinkingLeakText(response)) {
5379
+ throw new Error(
5380
+ [
5381
+ `LM Studio model "${model}" still emitted thinking content while non-thinking mode is requested.`,
5382
+ "In LM Studio, add this at the top of the model chat template: {%- set enable_thinking = false %}.",
5383
+ "If this model does not expose a Prompt/Chat Template editor in LM Studio (e.g. some GPT-OSS builds), non-thinking mode cannot be enforced from the API.",
5384
+ "Use --thinking for this model, or benchmark a model/runtime that supports explicit non-thinking control.",
5385
+ "Then eject/reload the model and run the benchmark again."
5386
+ ].join(" ")
5387
+ );
5388
+ }
5389
+ }
5347
5390
  function buildThinkingConfig(think) {
5348
5391
  if (think === void 0) return {};
5349
5392
  const effort = think ? "high" : "low";
@@ -5353,6 +5396,65 @@ function buildThinkingConfig(think) {
5353
5396
  reasoning: { effort }
5354
5397
  };
5355
5398
  }
5399
+ function hasSamplingOverrides2(options) {
5400
+ return options?.top_p !== void 0 || options?.seed !== void 0;
5401
+ }
5402
+ function isUnsupportedSamplingMessage(status, text) {
5403
+ if (status !== 400 && status !== 422) return false;
5404
+ const lower = text.toLowerCase();
5405
+ const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
5406
+ if (!mentionsSampling) return false;
5407
+ return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
5408
+ }
5409
+ function extractLMStudioErrorMessage(body) {
5410
+ const trimmed = body.trim();
5411
+ if (!trimmed) return "";
5412
+ try {
5413
+ const parsed = JSON.parse(trimmed);
5414
+ const message = parsed.error?.message;
5415
+ if (typeof message === "string" && message.trim().length > 0) {
5416
+ return message.trim();
5417
+ }
5418
+ } catch {
5419
+ }
5420
+ return trimmed;
5421
+ }
5422
+ function isModelLoadGuardrailError(message) {
5423
+ const lower = message.toLowerCase();
5424
+ if (!lower.includes("failed to load model")) return false;
5425
+ return lower.includes("insufficient system resources") || lower.includes("overload your system") || lower.includes("loading guardrails");
5426
+ }
5427
+ function buildLMStudioRequestError(kind, model, status, statusText, body) {
5428
+ const backendMessage = extractLMStudioErrorMessage(body);
5429
+ if (isModelLoadGuardrailError(backendMessage)) {
5430
+ return new Error(
5431
+ [
5432
+ `LM Studio could not load model "${model}" due to insufficient system resources (model loading guardrails).`,
5433
+ "In LM Studio: unload other models, reduce loaded context length, or relax model loading guardrails in Settings.",
5434
+ `Backend error: ${backendMessage}`
5435
+ ].join(" ")
5436
+ );
5437
+ }
5438
+ const suffix = backendMessage ? ` ${backendMessage}` : "";
5439
+ return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
5440
+ }
5441
+ function buildChatCompletionBody(model, prompt, options, stream, includeSampling) {
5442
+ const messages = options?.think === false ? [
5443
+ { role: "system", content: NON_THINKING_SYSTEM_PROMPT },
5444
+ { role: "user", content: prompt }
5445
+ ] : [{ role: "user", content: prompt }];
5446
+ return {
5447
+ model,
5448
+ messages,
5449
+ temperature: options?.temperature ?? 0,
5450
+ ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
5451
+ ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
5452
+ max_tokens: options?.num_predict ?? 512,
5453
+ stream,
5454
+ ...stream ? { stream_options: { include_usage: true } } : {},
5455
+ ...buildThinkingConfig(options?.think)
5456
+ };
5457
+ }
5356
5458
  function parseNonNegativeInt(value) {
5357
5459
  if (!/^\d+$/.test(value)) return null;
5358
5460
  const parsed = Number.parseInt(value, 10);
@@ -5437,7 +5539,7 @@ async function pathIsDirectory(targetPath) {
5437
5539
  try {
5438
5540
  const stat = await promises.stat(targetPath);
5439
5541
  return stat.isDirectory();
5440
- } catch {
5542
+ } catch (_err) {
5441
5543
  return false;
5442
5544
  }
5443
5545
  }
@@ -5837,27 +5939,30 @@ async function generate2(model, prompt, options) {
5837
5939
  try {
5838
5940
  const baseUrl = getLMStudioBaseUrl();
5839
5941
  const url = new URL("/v1/chat/completions", baseUrl);
5840
- const resp = await fetch(url, {
5942
+ const doRequest = (includeSampling) => fetch(url, {
5841
5943
  method: "POST",
5842
5944
  headers: getLMStudioHeaders(),
5843
- body: JSON.stringify({
5844
- model,
5845
- messages: [{ role: "user", content: prompt }],
5846
- temperature: options?.temperature ?? 0,
5847
- max_tokens: options?.num_predict ?? 512,
5848
- stream: false,
5849
- ...buildThinkingConfig(options?.think)
5850
- }),
5945
+ body: JSON.stringify(buildChatCompletionBody(model, prompt, options, false, includeSampling)),
5851
5946
  signal: controller.signal
5852
5947
  });
5948
+ let resp = await doRequest(true);
5853
5949
  if (!resp.ok) {
5854
5950
  const body = await resp.text().catch(() => "");
5855
- throw new Error(`LM Studio generate failed (${resp.status} ${resp.statusText}) ${body}`.trim());
5951
+ if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
5952
+ resp = await doRequest(false);
5953
+ } else {
5954
+ throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
5955
+ }
5956
+ }
5957
+ if (!resp.ok) {
5958
+ const body = await resp.text().catch(() => "");
5959
+ throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
5856
5960
  }
5857
5961
  const payload = await resp.json();
5858
5962
  const choice = extractChoice2(payload);
5859
5963
  const response = extractContent(choice);
5860
5964
  const reasoning = extractReasoning(choice);
5965
+ assertThinkingModeRespected(model, options?.think, response, reasoning);
5861
5966
  const usage = extractUsage(payload);
5862
5967
  const totalDuration = Math.max(0, Date.now() - start) * 1e6;
5863
5968
  return {
@@ -5898,23 +6003,24 @@ async function generateStream2(model, prompt, callbacks, options) {
5898
6003
  };
5899
6004
  try {
5900
6005
  resetStallTimer();
5901
- const resp = await fetch(url, {
6006
+ const doRequest = (includeSampling) => fetch(url, {
5902
6007
  method: "POST",
5903
6008
  headers: getLMStudioHeaders(),
5904
- body: JSON.stringify({
5905
- model,
5906
- messages: [{ role: "user", content: prompt }],
5907
- temperature: options?.temperature ?? 0,
5908
- max_tokens: options?.num_predict ?? 512,
5909
- stream: true,
5910
- stream_options: { include_usage: true },
5911
- ...buildThinkingConfig(options?.think)
5912
- }),
6009
+ body: JSON.stringify(buildChatCompletionBody(model, prompt, options, true, includeSampling)),
5913
6010
  signal: controller.signal
5914
6011
  });
6012
+ let resp = await doRequest(true);
5915
6013
  if (!resp.ok) {
5916
6014
  const body = await resp.text().catch(() => "");
5917
- throw new Error(`LM Studio stream failed (${resp.status} ${resp.statusText}) ${body}`.trim());
6015
+ if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
6016
+ resp = await doRequest(false);
6017
+ } else {
6018
+ throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
6019
+ }
6020
+ }
6021
+ if (!resp.ok) {
6022
+ const body = await resp.text().catch(() => "");
6023
+ throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
5918
6024
  }
5919
6025
  if (!resp.body) {
5920
6026
  throw new Error("LM Studio stream response body is empty");
@@ -5994,6 +6100,7 @@ async function generateStream2(model, prompt, callbacks, options) {
5994
6100
  evalCount: usage?.completion_tokens ?? 0,
5995
6101
  evalDuration: Math.max(1, evalDurationMs) * 1e6
5996
6102
  };
6103
+ assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
5997
6104
  callbacks?.onDone?.(result2);
5998
6105
  return result2;
5999
6106
  } catch (err) {
@@ -6017,7 +6124,7 @@ function abortOngoingRequests2() {
6017
6124
  }
6018
6125
  activeAbortControllers.clear();
6019
6126
  }
6020
- var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache;
6127
+ var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache, NON_THINKING_SYSTEM_PROMPT;
6021
6128
  var init_lm_studio_client = __esm({
6022
6129
  "src/core/lm-studio-client.ts"() {
6023
6130
  DEFAULT_LM_STUDIO_BASE_URL = "http://127.0.0.1:1234";
@@ -6031,6 +6138,12 @@ var init_lm_studio_client = __esm({
6031
6138
  activeAbortControllers = /* @__PURE__ */ new Set();
6032
6139
  directorySizeCache = /* @__PURE__ */ new Map();
6033
6140
  modelDefinitionCache = /* @__PURE__ */ new Map();
6141
+ NON_THINKING_SYSTEM_PROMPT = [
6142
+ "You are in non-thinking mode for benchmark reproducibility.",
6143
+ "Return only the final answer.",
6144
+ "Do not output internal reasoning, chain-of-thought, or scratchpad.",
6145
+ "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process."
6146
+ ].join(" ");
6034
6147
  }
6035
6148
  });
6036
6149
 
@@ -6187,7 +6300,7 @@ var require_package = __commonJS({
6187
6300
  "node_modules/systeminformation/package.json"(exports$1, module) {
6188
6301
  module.exports = {
6189
6302
  name: "systeminformation",
6190
- version: "5.31.1",
6303
+ version: "5.31.2",
6191
6304
  description: "Advanced, lightweight system and OS information library",
6192
6305
  license: "MIT",
6193
6306
  author: "Sebastian Hildebrandt <hildebrandt@plus-innovations.com> (https://plus-innovations.com)",
@@ -7520,6 +7633,38 @@ var require_util = __commonJS({
7520
7633
  }
7521
7634
  function getAppleModel(key) {
7522
7635
  const appleModelIds = [
7636
+ {
7637
+ key: "Mac17,2",
7638
+ name: "MacBook",
7639
+ size: "14-inch",
7640
+ processor: "M5",
7641
+ year: "2025",
7642
+ additional: ""
7643
+ },
7644
+ {
7645
+ key: "Mac16,13",
7646
+ name: "MacBook Air",
7647
+ size: "15-inch",
7648
+ processor: "M4",
7649
+ year: "2025",
7650
+ additional: ""
7651
+ },
7652
+ {
7653
+ key: "Mac16,12",
7654
+ name: "MacBook Air",
7655
+ size: "13-inch",
7656
+ processor: "M4",
7657
+ year: "2025",
7658
+ additional: ""
7659
+ },
7660
+ {
7661
+ key: "Mac15,13",
7662
+ name: "MacBook Air",
7663
+ size: "15-inch",
7664
+ processor: "M3",
7665
+ year: "2024",
7666
+ additional: ""
7667
+ },
7523
7668
  {
7524
7669
  key: "Mac15,12",
7525
7670
  name: "MacBook Air",
@@ -27226,6 +27371,38 @@ var init_progress = __esm({
27226
27371
  }
27227
27372
  });
27228
27373
 
27374
+ // src/benchmarks/profile.ts
27375
+ function withBenchmarkProfile(opts = {}) {
27376
+ return {
27377
+ temperature: BENCHMARK_PROFILE_TEMPERATURE,
27378
+ top_p: BENCHMARK_PROFILE_TOP_P,
27379
+ seed: BENCHMARK_PROFILE_SEED,
27380
+ ...opts
27381
+ };
27382
+ }
27383
+ function buildBenchmarkProfileMetadata(thinkEnabled) {
27384
+ return {
27385
+ version: BENCHMARK_PROFILE_VERSION,
27386
+ sampling: {
27387
+ temperature: BENCHMARK_PROFILE_TEMPERATURE,
27388
+ topP: BENCHMARK_PROFILE_TOP_P,
27389
+ seed: BENCHMARK_PROFILE_SEED
27390
+ },
27391
+ thinkingMode: thinkEnabled ? "enabled" : "disabled",
27392
+ contextWindowTokens: null,
27393
+ contextPolicy: "runtime-default"
27394
+ };
27395
+ }
27396
+ var BENCHMARK_PROFILE_VERSION, BENCHMARK_PROFILE_SEED, BENCHMARK_PROFILE_TOP_P, BENCHMARK_PROFILE_TEMPERATURE;
27397
+ var init_profile = __esm({
27398
+ "src/benchmarks/profile.ts"() {
27399
+ BENCHMARK_PROFILE_VERSION = "v1";
27400
+ BENCHMARK_PROFILE_SEED = 42;
27401
+ BENCHMARK_PROFILE_TOP_P = 1;
27402
+ BENCHMARK_PROFILE_TEMPERATURE = 0;
27403
+ }
27404
+ });
27405
+
27229
27406
  // src/benchmarks/performance.ts
27230
27407
  async function optionalProbe(probe, fallback2) {
27231
27408
  try {
@@ -27257,9 +27434,11 @@ async function runPerformanceBench(model, options = {}) {
27257
27434
  ]);
27258
27435
  const warmup = await withTimeout(
27259
27436
  generateStream3(model, WARMUP_PROMPT, void 0, {
27260
- num_predict: 32,
27261
- think: options.think,
27262
- stall_timeout_ms: options.streamStallTimeoutMs
27437
+ ...withBenchmarkProfile({
27438
+ num_predict: 32,
27439
+ think: options.think,
27440
+ stall_timeout_ms: options.streamStallTimeoutMs
27441
+ })
27263
27442
  }),
27264
27443
  warmupTimeoutMs,
27265
27444
  "Model warmup",
@@ -27314,11 +27493,11 @@ async function runPerformanceBench(model, options = {}) {
27314
27493
  }
27315
27494
  }
27316
27495
  },
27317
- {
27496
+ withBenchmarkProfile({
27318
27497
  num_predict: 256,
27319
27498
  think: options.think,
27320
27499
  stall_timeout_ms: options.streamStallTimeoutMs
27321
- }
27500
+ })
27322
27501
  ),
27323
27502
  promptTimeoutMs,
27324
27503
  "Performance benchmark",
@@ -27431,6 +27610,7 @@ var init_performance = __esm({
27431
27610
  init_hardware();
27432
27611
  init_utils();
27433
27612
  init_progress();
27613
+ init_profile();
27434
27614
  WARMUP_PROMPT = "Say hello in one word.";
27435
27615
  BENCH_PROMPTS = [
27436
27616
  "Explain the concept of recursion in programming in 3 sentences.",
@@ -27823,7 +28003,7 @@ Answer:`;
27823
28003
  const startTime = Date.now();
27824
28004
  try {
27825
28005
  const result2 = await withTimeout(
27826
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
28006
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
27827
28007
  timeoutMs,
27828
28008
  "Reasoning question",
27829
28009
  abortOngoingRequests3
@@ -27870,6 +28050,7 @@ var init_reasoning2 = __esm({
27870
28050
  init_utils();
27871
28051
  init_progress();
27872
28052
  init_reasoning();
28053
+ init_profile();
27873
28054
  questions = reasoning_default;
27874
28055
  DEFAULT_REASONING_TIMEOUT_MS = 12e4;
27875
28056
  }
@@ -28166,7 +28347,7 @@ Answer:`;
28166
28347
  const startTime = Date.now();
28167
28348
  try {
28168
28349
  const result2 = await withTimeout(
28169
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
28350
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
28170
28351
  timeoutMs,
28171
28352
  "Math problem",
28172
28353
  abortOngoingRequests3
@@ -28213,6 +28394,7 @@ var init_math2 = __esm({
28213
28394
  init_utils();
28214
28395
  init_progress();
28215
28396
  init_math();
28397
+ init_profile();
28216
28398
  problems = math_default;
28217
28399
  DEFAULT_MATH_TIMEOUT_MS = 12e4;
28218
28400
  }
@@ -31985,7 +32167,7 @@ Reply with ONLY the function code, no explanation.`;
31985
32167
  const startTime = Date.now();
31986
32168
  try {
31987
32169
  const result2 = await withTimeout(
31988
- generate3(model, prompt, { temperature: 0, num_predict: 2048, think: opts?.think }),
32170
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 2048, think: opts?.think })),
31989
32171
  timeoutMs,
31990
32172
  "Coding task",
31991
32173
  abortOngoingRequests3
@@ -32045,6 +32227,7 @@ var init_coding2 = __esm({
32045
32227
  init_utils();
32046
32228
  init_progress();
32047
32229
  init_coding();
32230
+ init_profile();
32048
32231
  VALID_IDENTIFIER_RE = /^[a-zA-Z_$][a-zA-Z0-9_$]*$/;
32049
32232
  tasks = coding_default;
32050
32233
  DIFFICULTY_WEIGHT = {
@@ -32423,7 +32606,7 @@ async function runInstructionFollowingBench(model, opts) {
32423
32606
  const startTime = Date.now();
32424
32607
  try {
32425
32608
  const result2 = await withTimeout(
32426
- generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
32609
+ generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
32427
32610
  timeoutMs,
32428
32611
  "Instruction following task",
32429
32612
  abortOngoingRequests3
@@ -32469,6 +32652,7 @@ var init_instruction_following2 = __esm({
32469
32652
  init_utils();
32470
32653
  init_progress();
32471
32654
  init_instruction_following();
32655
+ init_profile();
32472
32656
  questions2 = instruction_following_default;
32473
32657
  DEFAULT_INSTRUCTION_FOLLOWING_TIMEOUT_MS = 12e4;
32474
32658
  }
@@ -32823,7 +33007,7 @@ async function runStructuredOutputBench(model, opts) {
32823
33007
  const startTime = Date.now();
32824
33008
  try {
32825
33009
  const result2 = await withTimeout(
32826
- generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
33010
+ generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
32827
33011
  timeoutMs,
32828
33012
  "Structured output task",
32829
33013
  abortOngoingRequests3
@@ -32869,6 +33053,7 @@ var init_structured_output2 = __esm({
32869
33053
  init_utils();
32870
33054
  init_progress();
32871
33055
  init_structured_output();
33056
+ init_profile();
32872
33057
  questions3 = structured_output_default;
32873
33058
  DEFAULT_STRUCTURED_OUTPUT_TIMEOUT_MS = 12e4;
32874
33059
  }
@@ -33090,7 +33275,7 @@ async function runMultilingualBench(model, opts) {
33090
33275
  const startTime = Date.now();
33091
33276
  try {
33092
33277
  const result2 = await withTimeout(
33093
- generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
33278
+ generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
33094
33279
  timeoutMs,
33095
33280
  "Multilingual task",
33096
33281
  abortOngoingRequests3
@@ -33138,6 +33323,7 @@ var init_multilingual2 = __esm({
33138
33323
  init_utils();
33139
33324
  init_progress();
33140
33325
  init_multilingual();
33326
+ init_profile();
33141
33327
  questions4 = multilingual_default;
33142
33328
  DEFAULT_MULTILINGUAL_TIMEOUT_MS = 12e4;
33143
33329
  NEGATION_PATTERNS = [
@@ -35323,6 +35509,25 @@ function getLevel(score) {
35323
35509
  if (score >= 25) return "Weak";
35324
35510
  return "Poor";
35325
35511
  }
35512
+ function summarizeCategoryIssues(name, details) {
35513
+ let crashes = 0;
35514
+ let timeouts = 0;
35515
+ let errors = 0;
35516
+ for (const detail of details) {
35517
+ const actual = detail.actual ?? "";
35518
+ if (/^TIMEOUT\b/i.test(actual)) {
35519
+ timeouts++;
35520
+ continue;
35521
+ }
35522
+ if (/^ERROR:/i.test(actual)) {
35523
+ errors++;
35524
+ if (/model has crashed|has crashed without additional information|model crashed/i.test(actual)) {
35525
+ crashes++;
35526
+ }
35527
+ }
35528
+ }
35529
+ return { name, crashes, timeouts, errors };
35530
+ }
35326
35531
  function printHardwareTable(hw) {
35327
35532
  const table = new import_cli_table3.default({
35328
35533
  head: [source_default.bold("Hardware"), source_default.bold("Value")],
@@ -35441,6 +35646,18 @@ function printQualityTable(quality, timePenalties) {
35441
35646
  ]);
35442
35647
  }
35443
35648
  console.log(table.toString());
35649
+ const issueSummaries = categories.map((cat) => summarizeCategoryIssues(cat.name, cat.result.details)).filter((summary) => summary.errors > 0 || summary.timeouts > 0);
35650
+ if (issueSummaries.length > 0) {
35651
+ console.log(source_default.yellow("Execution issues detected during quality benchmark:"));
35652
+ for (const summary of issueSummaries) {
35653
+ const parts = [];
35654
+ if (summary.crashes > 0) parts.push(`${summary.crashes} crash${summary.crashes > 1 ? "es" : ""}`);
35655
+ const nonCrashErrors = summary.errors - summary.crashes;
35656
+ if (nonCrashErrors > 0) parts.push(`${nonCrashErrors} error${nonCrashErrors > 1 ? "s" : ""}`);
35657
+ if (summary.timeouts > 0) parts.push(`${summary.timeouts} timeout${summary.timeouts > 1 ? "s" : ""}`);
35658
+ console.log(source_default.yellow(` \u2022 ${summary.name}: ${parts.join(", ")} (scored as incorrect)`));
35659
+ }
35660
+ }
35444
35661
  }
35445
35662
  function printSummaryTable(results) {
35446
35663
  const termWidth = process.stdout.columns || 80;
@@ -52270,6 +52487,11 @@ async function benchCommand(options) {
52270
52487
  if (!silent && thinkEnabled) {
52271
52488
  infoMsg("Thinking mode enabled \u2014 models that support it will use extended reasoning.");
52272
52489
  }
52490
+ if (!silent) {
52491
+ infoMsg(
52492
+ `Benchmark profile ${BENCHMARK_PROFILE_VERSION}: temperature=0, top_p=1, seed=42, context=runtime default.`
52493
+ );
52494
+ }
52273
52495
  try {
52274
52496
  const results = [];
52275
52497
  const failedModels = [];
@@ -52358,7 +52580,8 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
52358
52580
  promptPackVersion: PROMPT_PACK_VERSION,
52359
52581
  runtimeVersion,
52360
52582
  runtimeBackend: getRuntimeName(),
52361
- modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat()
52583
+ modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat(),
52584
+ benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
52362
52585
  }
52363
52586
  };
52364
52587
  const rawLogHash = createHash("sha256").update(JSON.stringify(partialResult)).digest("hex");
@@ -52533,7 +52756,8 @@ var init_bench = __esm({
52533
52756
  init_telemetry();
52534
52757
  init_terminal();
52535
52758
  init_thinking_prompt();
52536
- BENCHMARK_SPEC_VERSION = "0.2.0";
52759
+ init_profile();
52760
+ BENCHMARK_SPEC_VERSION = "0.2.1";
52537
52761
  PROMPT_PACK_VERSION = "0.1.0";
52538
52762
  }
52539
52763
  });
@@ -53794,7 +54018,7 @@ var init_cli_main = __esm({
53794
54018
  program2 = new Command();
53795
54019
  program2.name("metrillm").description(
53796
54020
  "Benchmark local LLMs for hardware fit and task quality, then compute a global verdict"
53797
- ).version("0.1.0").hook("preAction", (_thisCommand, actionCommand) => {
54021
+ ).version("0.2.1").hook("preAction", (_thisCommand, actionCommand) => {
53798
54022
  if (!actionCommand.opts()?.json) printBanner();
53799
54023
  });
53800
54024
  program2.option(
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "metrillm",
3
- "version": "0.2.0",
3
+ "version": "0.2.1",
4
4
  "description": "Benchmark your local LLM models — speed, quality & hardware fitness verdict",
5
5
  "type": "module",
6
6
  "bin": {
@@ -49,7 +49,7 @@
49
49
  "ollama": "^0.5.12",
50
50
  "ora": "^8.1.1",
51
51
  "posthog-node": "^5.26.0",
52
- "systeminformation": "^5.23.5"
52
+ "systeminformation": "^5.31.2"
53
53
  },
54
54
  "devDependencies": {
55
55
  "@types/node": "^22.10.0",