npm - metrillm - Versions diffs - 0.2.0 → 0.2.1 - Mend

metrillm 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/README.md CHANGED

@@ -13,7 +13,8 @@
 > Think Geekbench, but for local LLMs on your actual hardware.
 ```bash
-npx metrillm@latest bench
+npm install -g metrillm
+metrillm bench
 ```
 <p align="center">
@@ -56,26 +57,19 @@ npx metrillm@latest bench
 > [Ollama](https://ollama.com/) or [LM Studio](https://lmstudio.ai/).
 ```bash
-# Run directly (no install)
-npx metrillm@latest bench
-# Or install globally
-npm i -g metrillm
+# Install globally
+npm install -g metrillm
 metrillm bench
-# Homebrew (no global npm install)
-# One-liner install (without pre-tapping):
-brew install MetriLLM/metrillm/metrillm
+# Alternative package managers
+pnpm add -g metrillm
+bun add -g metrillm
-# Or one-time tap for short install command:
-brew tap MetriLLM/metrillm
-# Then:
-brew install metrillm
-metrillm bench
+# Homebrew
+brew install MetriLLM/metrillm/metrillm
-# Alternative package managers
-pnpm dlx metrillm@latest bench
-bunx metrillm@latest bench
+# Or run without installing
+npx metrillm@latest bench
 ```
 ## Usage
@@ -134,6 +128,17 @@ For very large models, tune timeout flags:
 - `--coding-timeout-ms` (default `240000`)
 - `--lm-studio-stream-stall-timeout-ms` (default `180000`, `0` disables stall timeout)
+Benchmark Profile v1 (applied to all benchmark prompts):
+- `temperature=0`
+- `top_p=1`
+- `seed=42`
+- `thinking` follows your benchmark mode (`--thinking` / `--no-thinking`)
+- Context window stays runtime default (`context=runtime-default`) and is recorded as such in metadata.
+LM Studio non-thinking guard:
+- When benchmark mode requests non-thinking (`--no-thinking` or default), MetriLLM now aborts if the model still emits reasoning traces (for result comparability).
+- To disable it in LM Studio for affected models, put this at the top of the model chat template: `{%- set enable_thinking = false %}` then eject/reload the model.
 ## How Scoring Works
 **Hardware Fit Score** (0-100) — how well the model runs on your machine:
@@ -258,7 +263,7 @@ The tap formula lives in `Formula/metrillm.rb`.
 ./scripts/update-homebrew-formula.sh
 # Or pin a specific version
-./scripts/update-homebrew-formula.sh 0.2.0
+./scripts/update-homebrew-formula.sh 0.2.1
 ```
 After updating the formula, commit and push so users can install/update with:

package/dist/index.mjs CHANGED

@@ -4793,7 +4793,8 @@ function stripThinkTags(text) {
 }
 function hasThinkingContent(response, thinkingField) {
   if (thinkingField && thinkingField.trim().length > 0) return true;
-  return /<think(?:ing)?[\s>]/i.test(response);
+  if (/<think(?:ing)?[\s>]/i.test(response)) return true;
+  return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response);
 }
 function estimateTokenCount(text) {
   if (!text) return 0;
@@ -5246,25 +5247,50 @@ async function listRunningModels() {
 function setDefaultKeepAlive(keepAlive) {
   defaultKeepAlive = keepAlive;
 }
+function hasSamplingOverrides(options) {
+  return options?.top_p !== void 0 || options?.seed !== void 0;
+}
+function isUnsupportedSamplingOptionError(err) {
+  const message = err instanceof Error ? err.message : String(err);
+  const lower = message.toLowerCase();
+  const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
+  if (!mentionsSampling) return false;
+  return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
+}
+function buildGenerateRequest(model, prompt, options, includeSampling) {
+  return {
+    model,
+    prompt,
+    stream: true,
+    keep_alive: options?.keep_alive ?? defaultKeepAlive,
+    ...options?.think !== void 0 ? { think: options.think } : {},
+    options: {
+      temperature: options?.temperature ?? 0,
+      ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
+      ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
+      num_predict: options?.num_predict ?? 512
+    }
+  };
+}
 async function generate(model, prompt, options) {
   return generateStream(model, prompt, void 0, options);
 }
 async function generateStream(model, prompt, callbacks, options) {
-  const stream = await withTimeout(
-    client.generate({
-      model,
-      prompt,
-      stream: true,
-      keep_alive: options?.keep_alive ?? defaultKeepAlive,
-      ...options?.think !== void 0 ? { think: options.think } : {},
-      options: {
-        temperature: options?.temperature ?? 0,
-        num_predict: options?.num_predict ?? 512
-      }
-    }),
+  const initializeStream = (includeSampling) => withTimeout(
+    client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
     OLLAMA_INIT_TIMEOUT_MS,
     "Ollama generate initialization"
   );
+  let stream;
+  try {
+    stream = await initializeStream(true);
+  } catch (err) {
+    if (hasSamplingOverrides(options) && isUnsupportedSamplingOptionError(err)) {
+      stream = await initializeStream(false);
+    } else {
+      throw err;
+    }
+  }
   let fullResponse = "";
   let fullThinking = "";
   let result2 = null;
@@ -5344,6 +5370,23 @@ var init_ollama_client = __esm({
     STREAM_STALL_TIMEOUT_MS = 3e4;
   }
 });
+function hasThinkingLeakText(response) {
+  return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response) || /\[(?:\/)?THINK(?:ING)?\]/i.test(response);
+}
+function assertThinkingModeRespected(model, think, response, reasoning) {
+  if (think !== false) return;
+  if (reasoning.trim().length > 0 || /<think(?:ing)?[\s>]/i.test(response) || hasThinkingLeakText(response)) {
+    throw new Error(
+      [
+        `LM Studio model "${model}" still emitted thinking content while non-thinking mode is requested.`,
+        "In LM Studio, add this at the top of the model chat template: {%- set enable_thinking = false %}.",
+        "If this model does not expose a Prompt/Chat Template editor in LM Studio (e.g. some GPT-OSS builds), non-thinking mode cannot be enforced from the API.",
+        "Use --thinking for this model, or benchmark a model/runtime that supports explicit non-thinking control.",
+        "Then eject/reload the model and run the benchmark again."
+      ].join(" ")
+    );
+  }
+}
 function buildThinkingConfig(think) {
   if (think === void 0) return {};
   const effort = think ? "high" : "low";
@@ -5353,6 +5396,65 @@ function buildThinkingConfig(think) {
     reasoning: { effort }
   };
 }
+function hasSamplingOverrides2(options) {
+  return options?.top_p !== void 0 || options?.seed !== void 0;
+}
+function isUnsupportedSamplingMessage(status, text) {
+  if (status !== 400 && status !== 422) return false;
+  const lower = text.toLowerCase();
+  const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
+  if (!mentionsSampling) return false;
+  return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
+}
+function extractLMStudioErrorMessage(body) {
+  const trimmed = body.trim();
+  if (!trimmed) return "";
+  try {
+    const parsed = JSON.parse(trimmed);
+    const message = parsed.error?.message;
+    if (typeof message === "string" && message.trim().length > 0) {
+      return message.trim();
+    }
+  } catch {
+  }
+  return trimmed;
+}
+function isModelLoadGuardrailError(message) {
+  const lower = message.toLowerCase();
+  if (!lower.includes("failed to load model")) return false;
+  return lower.includes("insufficient system resources") || lower.includes("overload your system") || lower.includes("loading guardrails");
+}
+function buildLMStudioRequestError(kind, model, status, statusText, body) {
+  const backendMessage = extractLMStudioErrorMessage(body);
+  if (isModelLoadGuardrailError(backendMessage)) {
+    return new Error(
+      [
+        `LM Studio could not load model "${model}" due to insufficient system resources (model loading guardrails).`,
+        "In LM Studio: unload other models, reduce loaded context length, or relax model loading guardrails in Settings.",
+        `Backend error: ${backendMessage}`
+      ].join(" ")
+    );
+  }
+  const suffix = backendMessage ? ` ${backendMessage}` : "";
+  return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
+}
+function buildChatCompletionBody(model, prompt, options, stream, includeSampling) {
+  const messages = options?.think === false ? [
+    { role: "system", content: NON_THINKING_SYSTEM_PROMPT },
+    { role: "user", content: prompt }
+  ] : [{ role: "user", content: prompt }];
+  return {
+    model,
+    messages,
+    temperature: options?.temperature ?? 0,
+    ...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
+    ...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
+    max_tokens: options?.num_predict ?? 512,
+    stream,
+    ...stream ? { stream_options: { include_usage: true } } : {},
+    ...buildThinkingConfig(options?.think)
+  };
+}
 function parseNonNegativeInt(value) {
   if (!/^\d+$/.test(value)) return null;
   const parsed = Number.parseInt(value, 10);
@@ -5437,7 +5539,7 @@ async function pathIsDirectory(targetPath) {
   try {
     const stat = await promises.stat(targetPath);
     return stat.isDirectory();
-  } catch {
+  } catch (_err) {
     return false;
   }
 }
@@ -5837,27 +5939,30 @@ async function generate2(model, prompt, options) {
   try {
     const baseUrl = getLMStudioBaseUrl();
     const url = new URL("/v1/chat/completions", baseUrl);
-    const resp = await fetch(url, {
+    const doRequest = (includeSampling) => fetch(url, {
       method: "POST",
       headers: getLMStudioHeaders(),
-      body: JSON.stringify({
-        model,
-        messages: [{ role: "user", content: prompt }],
-        temperature: options?.temperature ?? 0,
-        max_tokens: options?.num_predict ?? 512,
-        stream: false,
-        ...buildThinkingConfig(options?.think)
-      }),
+      body: JSON.stringify(buildChatCompletionBody(model, prompt, options, false, includeSampling)),
       signal: controller.signal
     });
+    let resp = await doRequest(true);
     if (!resp.ok) {
       const body = await resp.text().catch(() => "");
-      throw new Error(`LM Studio generate failed (${resp.status} ${resp.statusText}) ${body}`.trim());
+      if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
+        resp = await doRequest(false);
+      } else {
+        throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
+      }
+    }
+    if (!resp.ok) {
+      const body = await resp.text().catch(() => "");
+      throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
     }
     const payload = await resp.json();
     const choice = extractChoice2(payload);
     const response = extractContent(choice);
     const reasoning = extractReasoning(choice);
+    assertThinkingModeRespected(model, options?.think, response, reasoning);
     const usage = extractUsage(payload);
     const totalDuration = Math.max(0, Date.now() - start) * 1e6;
     return {
@@ -5898,23 +6003,24 @@ async function generateStream2(model, prompt, callbacks, options) {
   };
   try {
     resetStallTimer();
-    const resp = await fetch(url, {
+    const doRequest = (includeSampling) => fetch(url, {
       method: "POST",
       headers: getLMStudioHeaders(),
-      body: JSON.stringify({
-        model,
-        messages: [{ role: "user", content: prompt }],
-        temperature: options?.temperature ?? 0,
-        max_tokens: options?.num_predict ?? 512,
-        stream: true,
-        stream_options: { include_usage: true },
-        ...buildThinkingConfig(options?.think)
-      }),
+      body: JSON.stringify(buildChatCompletionBody(model, prompt, options, true, includeSampling)),
       signal: controller.signal
     });
+    let resp = await doRequest(true);
     if (!resp.ok) {
       const body = await resp.text().catch(() => "");
-      throw new Error(`LM Studio stream failed (${resp.status} ${resp.statusText}) ${body}`.trim());
+      if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
+        resp = await doRequest(false);
+      } else {
+        throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
+      }
+    }
+    if (!resp.ok) {
+      const body = await resp.text().catch(() => "");
+      throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
     }
     if (!resp.body) {
       throw new Error("LM Studio stream response body is empty");
@@ -5994,6 +6100,7 @@ async function generateStream2(model, prompt, callbacks, options) {
       evalCount: usage?.completion_tokens ?? 0,
       evalDuration: Math.max(1, evalDurationMs) * 1e6
     };
+    assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
     callbacks?.onDone?.(result2);
     return result2;
   } catch (err) {
@@ -6017,7 +6124,7 @@ function abortOngoingRequests2() {
   }
   activeAbortControllers.clear();
 }
-var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache;
+var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache, NON_THINKING_SYSTEM_PROMPT;
 var init_lm_studio_client = __esm({
   "src/core/lm-studio-client.ts"() {
     DEFAULT_LM_STUDIO_BASE_URL = "http://127.0.0.1:1234";
@@ -6031,6 +6138,12 @@ var init_lm_studio_client = __esm({
     activeAbortControllers = /* @__PURE__ */ new Set();
     directorySizeCache = /* @__PURE__ */ new Map();
     modelDefinitionCache = /* @__PURE__ */ new Map();
+    NON_THINKING_SYSTEM_PROMPT = [
+      "You are in non-thinking mode for benchmark reproducibility.",
+      "Return only the final answer.",
+      "Do not output internal reasoning, chain-of-thought, or scratchpad.",
+      "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process."
+    ].join(" ");
   }
 });
@@ -6187,7 +6300,7 @@ var require_package = __commonJS({
   "node_modules/systeminformation/package.json"(exports$1, module) {
     module.exports = {
       name: "systeminformation",
-      version: "5.31.1",
+      version: "5.31.2",
       description: "Advanced, lightweight system and OS information library",
       license: "MIT",
       author: "Sebastian Hildebrandt <hildebrandt@plus-innovations.com> (https://plus-innovations.com)",
@@ -7520,6 +7633,38 @@ var require_util = __commonJS({
     }
     function getAppleModel(key) {
       const appleModelIds = [
+        {
+          key: "Mac17,2",
+          name: "MacBook",
+          size: "14-inch",
+          processor: "M5",
+          year: "2025",
+          additional: ""
+        },
+        {
+          key: "Mac16,13",
+          name: "MacBook Air",
+          size: "15-inch",
+          processor: "M4",
+          year: "2025",
+          additional: ""
+        },
+        {
+          key: "Mac16,12",
+          name: "MacBook Air",
+          size: "13-inch",
+          processor: "M4",
+          year: "2025",
+          additional: ""
+        },
+        {
+          key: "Mac15,13",
+          name: "MacBook Air",
+          size: "15-inch",
+          processor: "M3",
+          year: "2024",
+          additional: ""
+        },
         {
           key: "Mac15,12",
           name: "MacBook Air",
@@ -27226,6 +27371,38 @@ var init_progress = __esm({
   }
 });
+// src/benchmarks/profile.ts
+function withBenchmarkProfile(opts = {}) {
+  return {
+    temperature: BENCHMARK_PROFILE_TEMPERATURE,
+    top_p: BENCHMARK_PROFILE_TOP_P,
+    seed: BENCHMARK_PROFILE_SEED,
+    ...opts
+  };
+}
+function buildBenchmarkProfileMetadata(thinkEnabled) {
+  return {
+    version: BENCHMARK_PROFILE_VERSION,
+    sampling: {
+      temperature: BENCHMARK_PROFILE_TEMPERATURE,
+      topP: BENCHMARK_PROFILE_TOP_P,
+      seed: BENCHMARK_PROFILE_SEED
+    },
+    thinkingMode: thinkEnabled ? "enabled" : "disabled",
+    contextWindowTokens: null,
+    contextPolicy: "runtime-default"
+  };
+}
+var BENCHMARK_PROFILE_VERSION, BENCHMARK_PROFILE_SEED, BENCHMARK_PROFILE_TOP_P, BENCHMARK_PROFILE_TEMPERATURE;
+var init_profile = __esm({
+  "src/benchmarks/profile.ts"() {
+    BENCHMARK_PROFILE_VERSION = "v1";
+    BENCHMARK_PROFILE_SEED = 42;
+    BENCHMARK_PROFILE_TOP_P = 1;
+    BENCHMARK_PROFILE_TEMPERATURE = 0;
+  }
+});
 // src/benchmarks/performance.ts
 async function optionalProbe(probe, fallback2) {
   try {
@@ -27257,9 +27434,11 @@ async function runPerformanceBench(model, options = {}) {
     ]);
     const warmup = await withTimeout(
       generateStream3(model, WARMUP_PROMPT, void 0, {
-        num_predict: 32,
-        think: options.think,
-        stall_timeout_ms: options.streamStallTimeoutMs
+        ...withBenchmarkProfile({
+          num_predict: 32,
+          think: options.think,
+          stall_timeout_ms: options.streamStallTimeoutMs
+        })
       }),
       warmupTimeoutMs,
       "Model warmup",
@@ -27314,11 +27493,11 @@ async function runPerformanceBench(model, options = {}) {
                 }
               }
             },
-            {
+            withBenchmarkProfile({
               num_predict: 256,
               think: options.think,
               stall_timeout_ms: options.streamStallTimeoutMs
-            }
+            })
           ),
           promptTimeoutMs,
           "Performance benchmark",
@@ -27431,6 +27610,7 @@ var init_performance = __esm({
     init_hardware();
     init_utils();
     init_progress();
+    init_profile();
     WARMUP_PROMPT = "Say hello in one word.";
     BENCH_PROMPTS = [
       "Explain the concept of recursion in programming in 3 sentences.",
@@ -27823,7 +28003,7 @@ Answer:`;
       const startTime = Date.now();
       try {
         const result2 = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Reasoning question",
           abortOngoingRequests3
@@ -27870,6 +28050,7 @@ var init_reasoning2 = __esm({
     init_utils();
     init_progress();
     init_reasoning();
+    init_profile();
     questions = reasoning_default;
     DEFAULT_REASONING_TIMEOUT_MS = 12e4;
   }
@@ -28166,7 +28347,7 @@ Answer:`;
       const startTime = Date.now();
       try {
         const result2 = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Math problem",
           abortOngoingRequests3
@@ -28213,6 +28394,7 @@ var init_math2 = __esm({
     init_utils();
     init_progress();
     init_math();
+    init_profile();
     problems = math_default;
     DEFAULT_MATH_TIMEOUT_MS = 12e4;
   }
@@ -31985,7 +32167,7 @@ Reply with ONLY the function code, no explanation.`;
       const startTime = Date.now();
       try {
         const result2 = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 2048, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 2048, think: opts?.think })),
           timeoutMs,
           "Coding task",
           abortOngoingRequests3
@@ -32045,6 +32227,7 @@ var init_coding2 = __esm({
     init_utils();
     init_progress();
     init_coding();
+    init_profile();
     VALID_IDENTIFIER_RE = /^[a-zA-Z_$][a-zA-Z0-9_$]*$/;
     tasks = coding_default;
     DIFFICULTY_WEIGHT = {
@@ -32423,7 +32606,7 @@ async function runInstructionFollowingBench(model, opts) {
       const startTime = Date.now();
       try {
         const result2 = await withTimeout(
-          generate3(model, prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Instruction following task",
           abortOngoingRequests3
@@ -32469,6 +32652,7 @@ var init_instruction_following2 = __esm({
     init_utils();
     init_progress();
     init_instruction_following();
+    init_profile();
     questions2 = instruction_following_default;
     DEFAULT_INSTRUCTION_FOLLOWING_TIMEOUT_MS = 12e4;
   }
@@ -32823,7 +33007,7 @@ async function runStructuredOutputBench(model, opts) {
       const startTime = Date.now();
       try {
         const result2 = await withTimeout(
-          generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Structured output task",
           abortOngoingRequests3
@@ -32869,6 +33053,7 @@ var init_structured_output2 = __esm({
     init_utils();
     init_progress();
     init_structured_output();
+    init_profile();
     questions3 = structured_output_default;
     DEFAULT_STRUCTURED_OUTPUT_TIMEOUT_MS = 12e4;
   }
@@ -33090,7 +33275,7 @@ async function runMultilingualBench(model, opts) {
       const startTime = Date.now();
       try {
         const result2 = await withTimeout(
-          generate3(model, q.prompt, { temperature: 0, num_predict: 1024, think: opts?.think }),
+          generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
           timeoutMs,
           "Multilingual task",
           abortOngoingRequests3
@@ -33138,6 +33323,7 @@ var init_multilingual2 = __esm({
     init_utils();
     init_progress();
     init_multilingual();
+    init_profile();
     questions4 = multilingual_default;
     DEFAULT_MULTILINGUAL_TIMEOUT_MS = 12e4;
     NEGATION_PATTERNS = [
@@ -35323,6 +35509,25 @@ function getLevel(score) {
   if (score >= 25) return "Weak";
   return "Poor";
 }
+function summarizeCategoryIssues(name, details) {
+  let crashes = 0;
+  let timeouts = 0;
+  let errors = 0;
+  for (const detail of details) {
+    const actual = detail.actual ?? "";
+    if (/^TIMEOUT\b/i.test(actual)) {
+      timeouts++;
+      continue;
+    }
+    if (/^ERROR:/i.test(actual)) {
+      errors++;
+      if (/model has crashed|has crashed without additional information|model crashed/i.test(actual)) {
+        crashes++;
+      }
+    }
+  }
+  return { name, crashes, timeouts, errors };
+}
 function printHardwareTable(hw) {
   const table = new import_cli_table3.default({
     head: [source_default.bold("Hardware"), source_default.bold("Value")],
@@ -35441,6 +35646,18 @@ function printQualityTable(quality, timePenalties) {
     ]);
   }
   console.log(table.toString());
+  const issueSummaries = categories.map((cat) => summarizeCategoryIssues(cat.name, cat.result.details)).filter((summary) => summary.errors > 0 || summary.timeouts > 0);
+  if (issueSummaries.length > 0) {
+    console.log(source_default.yellow("Execution issues detected during quality benchmark:"));
+    for (const summary of issueSummaries) {
+      const parts = [];
+      if (summary.crashes > 0) parts.push(`${summary.crashes} crash${summary.crashes > 1 ? "es" : ""}`);
+      const nonCrashErrors = summary.errors - summary.crashes;
+      if (nonCrashErrors > 0) parts.push(`${nonCrashErrors} error${nonCrashErrors > 1 ? "s" : ""}`);
+      if (summary.timeouts > 0) parts.push(`${summary.timeouts} timeout${summary.timeouts > 1 ? "s" : ""}`);
+      console.log(source_default.yellow(`  \u2022 ${summary.name}: ${parts.join(", ")} (scored as incorrect)`));
+    }
+  }
 }
 function printSummaryTable(results) {
   const termWidth = process.stdout.columns || 80;
@@ -52270,6 +52487,11 @@ async function benchCommand(options) {
   if (!silent && thinkEnabled) {
     infoMsg("Thinking mode enabled \u2014 models that support it will use extended reasoning.");
   }
+  if (!silent) {
+    infoMsg(
+      `Benchmark profile ${BENCHMARK_PROFILE_VERSION}: temperature=0, top_p=1, seed=42, context=runtime default.`
+    );
+  }
   try {
     const results = [];
     const failedModels = [];
@@ -52358,7 +52580,8 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
             promptPackVersion: PROMPT_PACK_VERSION,
             runtimeVersion,
             runtimeBackend: getRuntimeName(),
-            modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat()
+            modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat(),
+            benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
           }
         };
         const rawLogHash = createHash("sha256").update(JSON.stringify(partialResult)).digest("hex");
@@ -52533,7 +52756,8 @@ var init_bench = __esm({
     init_telemetry();
     init_terminal();
     init_thinking_prompt();
-    BENCHMARK_SPEC_VERSION = "0.2.0";
+    init_profile();
+    BENCHMARK_SPEC_VERSION = "0.2.1";
     PROMPT_PACK_VERSION = "0.1.0";
   }
 });
@@ -53794,7 +54018,7 @@ var init_cli_main = __esm({
     program2 = new Command();
     program2.name("metrillm").description(
       "Benchmark local LLMs for hardware fit and task quality, then compute a global verdict"
-    ).version("0.1.0").hook("preAction", (_thisCommand, actionCommand) => {
+    ).version("0.2.1").hook("preAction", (_thisCommand, actionCommand) => {
       if (!actionCommand.opts()?.json) printBanner();
     });
     program2.option(

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "metrillm",
-  "version": "0.2.0",
+  "version": "0.2.1",
   "description": "Benchmark your local LLM models — speed, quality & hardware fitness verdict",
   "type": "module",
   "bin": {
@@ -49,7 +49,7 @@
     "ollama": "^0.5.12",
     "ora": "^8.1.1",
     "posthog-node": "^5.26.0",
-    "systeminformation": "^5.23.5"
+    "systeminformation": "^5.31.2"
   },
   "devDependencies": {
     "@types/node": "^22.10.0",