metrillm 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -18
- package/dist/index.mjs +275 -51
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
> Think Geekbench, but for local LLMs on your actual hardware.
|
|
14
14
|
|
|
15
15
|
```bash
|
|
16
|
-
|
|
16
|
+
npm install -g metrillm
|
|
17
|
+
metrillm bench
|
|
17
18
|
```
|
|
18
19
|
|
|
19
20
|
<p align="center">
|
|
@@ -56,26 +57,19 @@ npx metrillm@latest bench
|
|
|
56
57
|
> [Ollama](https://ollama.com/) or [LM Studio](https://lmstudio.ai/).
|
|
57
58
|
|
|
58
59
|
```bash
|
|
59
|
-
#
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
# Or install globally
|
|
63
|
-
npm i -g metrillm
|
|
60
|
+
# Install globally
|
|
61
|
+
npm install -g metrillm
|
|
64
62
|
metrillm bench
|
|
65
63
|
|
|
66
|
-
#
|
|
67
|
-
|
|
68
|
-
|
|
64
|
+
# Alternative package managers
|
|
65
|
+
pnpm add -g metrillm
|
|
66
|
+
bun add -g metrillm
|
|
69
67
|
|
|
70
|
-
#
|
|
71
|
-
brew
|
|
72
|
-
# Then:
|
|
73
|
-
brew install metrillm
|
|
74
|
-
metrillm bench
|
|
68
|
+
# Homebrew
|
|
69
|
+
brew install MetriLLM/metrillm/metrillm
|
|
75
70
|
|
|
76
|
-
#
|
|
77
|
-
|
|
78
|
-
bunx metrillm@latest bench
|
|
71
|
+
# Or run without installing
|
|
72
|
+
npx metrillm@latest bench
|
|
79
73
|
```
|
|
80
74
|
|
|
81
75
|
## Usage
|
|
@@ -134,6 +128,17 @@ For very large models, tune timeout flags:
|
|
|
134
128
|
- `--coding-timeout-ms` (default `240000`)
|
|
135
129
|
- `--lm-studio-stream-stall-timeout-ms` (default `180000`, `0` disables stall timeout)
|
|
136
130
|
|
|
131
|
+
Benchmark Profile v1 (applied to all benchmark prompts):
|
|
132
|
+
- `temperature=0`
|
|
133
|
+
- `top_p=1`
|
|
134
|
+
- `seed=42`
|
|
135
|
+
- `thinking` follows your benchmark mode (`--thinking` / `--no-thinking`)
|
|
136
|
+
- The context window stays at the runtime default (`context=runtime-default`) and is recorded as such in metadata.
|
|
137
|
+
|
|
138
|
+
LM Studio non-thinking guard:
|
|
139
|
+
- When benchmark mode requests non-thinking (`--no-thinking` or default), MetriLLM now aborts if the model still emits reasoning traces (for result comparability).
|
|
140
|
+
- To disable thinking in LM Studio for affected models, put this at the top of the model chat template: `{%- set enable_thinking = false %}`, then eject/reload the model.
|
|
141
|
+
|
|
137
142
|
## How Scoring Works
|
|
138
143
|
|
|
139
144
|
**Hardware Fit Score** (0-100) — how well the model runs on your machine:
|
|
@@ -258,7 +263,7 @@ The tap formula lives in `Formula/metrillm.rb`.
|
|
|
258
263
|
./scripts/update-homebrew-formula.sh
|
|
259
264
|
|
|
260
265
|
# Or pin a specific version
|
|
261
|
-
./scripts/update-homebrew-formula.sh 0.2.0
|
|
266
|
+
./scripts/update-homebrew-formula.sh 0.2.1
|
|
262
267
|
```
|
|
263
268
|
|
|
264
269
|
After updating the formula, commit and push so users can install/update with:
|
package/dist/index.mjs
CHANGED
|
@@ -4793,7 +4793,8 @@ function stripThinkTags(text) {
|
|
|
4793
4793
|
}
|
|
4794
4794
|
function hasThinkingContent(response, thinkingField) {
|
|
4795
4795
|
if (thinkingField && thinkingField.trim().length > 0) return true;
|
|
4796
|
-
|
|
4796
|
+
if (/<think(?:ing)?[\s>]/i.test(response)) return true;
|
|
4797
|
+
return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response);
|
|
4797
4798
|
}
|
|
4798
4799
|
function estimateTokenCount(text) {
|
|
4799
4800
|
if (!text) return 0;
|
|
@@ -5246,25 +5247,50 @@ async function listRunningModels() {
|
|
|
5246
5247
|
function setDefaultKeepAlive(keepAlive) {
|
|
5247
5248
|
defaultKeepAlive = keepAlive;
|
|
5248
5249
|
}
|
|
5250
|
+
function hasSamplingOverrides(options) {
|
|
5251
|
+
return options?.top_p !== void 0 || options?.seed !== void 0;
|
|
5252
|
+
}
|
|
5253
|
+
function isUnsupportedSamplingOptionError(err) {
|
|
5254
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
5255
|
+
const lower = message.toLowerCase();
|
|
5256
|
+
const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
|
|
5257
|
+
if (!mentionsSampling) return false;
|
|
5258
|
+
return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
|
|
5259
|
+
}
|
|
5260
|
+
function buildGenerateRequest(model, prompt, options, includeSampling) {
|
|
5261
|
+
return {
|
|
5262
|
+
model,
|
|
5263
|
+
prompt,
|
|
5264
|
+
stream: true,
|
|
5265
|
+
keep_alive: options?.keep_alive ?? defaultKeepAlive,
|
|
5266
|
+
...options?.think !== void 0 ? { think: options.think } : {},
|
|
5267
|
+
options: {
|
|
5268
|
+
temperature: options?.temperature ?? 0,
|
|
5269
|
+
...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
|
|
5270
|
+
...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
|
|
5271
|
+
num_predict: options?.num_predict ?? 512
|
|
5272
|
+
}
|
|
5273
|
+
};
|
|
5274
|
+
}
|
|
5249
5275
|
async function generate(model, prompt, options) {
|
|
5250
5276
|
return generateStream(model, prompt, void 0, options);
|
|
5251
5277
|
}
|
|
5252
5278
|
async function generateStream(model, prompt, callbacks, options) {
|
|
5253
|
-
const
|
|
5254
|
-
client.generate(
|
|
5255
|
-
model,
|
|
5256
|
-
prompt,
|
|
5257
|
-
stream: true,
|
|
5258
|
-
keep_alive: options?.keep_alive ?? defaultKeepAlive,
|
|
5259
|
-
...options?.think !== void 0 ? { think: options.think } : {},
|
|
5260
|
-
options: {
|
|
5261
|
-
temperature: options?.temperature ?? 0,
|
|
5262
|
-
num_predict: options?.num_predict ?? 512
|
|
5263
|
-
}
|
|
5264
|
-
}),
|
|
5279
|
+
const initializeStream = (includeSampling) => withTimeout(
|
|
5280
|
+
client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
|
|
5265
5281
|
OLLAMA_INIT_TIMEOUT_MS,
|
|
5266
5282
|
"Ollama generate initialization"
|
|
5267
5283
|
);
|
|
5284
|
+
let stream;
|
|
5285
|
+
try {
|
|
5286
|
+
stream = await initializeStream(true);
|
|
5287
|
+
} catch (err) {
|
|
5288
|
+
if (hasSamplingOverrides(options) && isUnsupportedSamplingOptionError(err)) {
|
|
5289
|
+
stream = await initializeStream(false);
|
|
5290
|
+
} else {
|
|
5291
|
+
throw err;
|
|
5292
|
+
}
|
|
5293
|
+
}
|
|
5268
5294
|
let fullResponse = "";
|
|
5269
5295
|
let fullThinking = "";
|
|
5270
5296
|
let result2 = null;
|
|
@@ -5344,6 +5370,23 @@ var init_ollama_client = __esm({
|
|
|
5344
5370
|
STREAM_STALL_TIMEOUT_MS = 3e4;
|
|
5345
5371
|
}
|
|
5346
5372
|
});
|
|
5373
|
+
function hasThinkingLeakText(response) {
|
|
5374
|
+
return /^\s*(?:thinking|thought)\s+process\s*:/i.test(response) || /\[(?:\/)?THINK(?:ING)?\]/i.test(response);
|
|
5375
|
+
}
|
|
5376
|
+
function assertThinkingModeRespected(model, think, response, reasoning) {
|
|
5377
|
+
if (think !== false) return;
|
|
5378
|
+
if (reasoning.trim().length > 0 || /<think(?:ing)?[\s>]/i.test(response) || hasThinkingLeakText(response)) {
|
|
5379
|
+
throw new Error(
|
|
5380
|
+
[
|
|
5381
|
+
`LM Studio model "${model}" still emitted thinking content while non-thinking mode is requested.`,
|
|
5382
|
+
"In LM Studio, add this at the top of the model chat template: {%- set enable_thinking = false %}.",
|
|
5383
|
+
"If this model does not expose a Prompt/Chat Template editor in LM Studio (e.g. some GPT-OSS builds), non-thinking mode cannot be enforced from the API.",
|
|
5384
|
+
"Use --thinking for this model, or benchmark a model/runtime that supports explicit non-thinking control.",
|
|
5385
|
+
"Then eject/reload the model and run the benchmark again."
|
|
5386
|
+
].join(" ")
|
|
5387
|
+
);
|
|
5388
|
+
}
|
|
5389
|
+
}
|
|
5347
5390
|
function buildThinkingConfig(think) {
|
|
5348
5391
|
if (think === void 0) return {};
|
|
5349
5392
|
const effort = think ? "high" : "low";
|
|
@@ -5353,6 +5396,65 @@ function buildThinkingConfig(think) {
|
|
|
5353
5396
|
reasoning: { effort }
|
|
5354
5397
|
};
|
|
5355
5398
|
}
|
|
5399
|
+
function hasSamplingOverrides2(options) {
|
|
5400
|
+
return options?.top_p !== void 0 || options?.seed !== void 0;
|
|
5401
|
+
}
|
|
5402
|
+
function isUnsupportedSamplingMessage(status, text) {
|
|
5403
|
+
if (status !== 400 && status !== 422) return false;
|
|
5404
|
+
const lower = text.toLowerCase();
|
|
5405
|
+
const mentionsSampling = /\b(seed|top_p|topp)\b/.test(lower);
|
|
5406
|
+
if (!mentionsSampling) return false;
|
|
5407
|
+
return /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(lower);
|
|
5408
|
+
}
|
|
5409
|
+
function extractLMStudioErrorMessage(body) {
|
|
5410
|
+
const trimmed = body.trim();
|
|
5411
|
+
if (!trimmed) return "";
|
|
5412
|
+
try {
|
|
5413
|
+
const parsed = JSON.parse(trimmed);
|
|
5414
|
+
const message = parsed.error?.message;
|
|
5415
|
+
if (typeof message === "string" && message.trim().length > 0) {
|
|
5416
|
+
return message.trim();
|
|
5417
|
+
}
|
|
5418
|
+
} catch {
|
|
5419
|
+
}
|
|
5420
|
+
return trimmed;
|
|
5421
|
+
}
|
|
5422
|
+
function isModelLoadGuardrailError(message) {
|
|
5423
|
+
const lower = message.toLowerCase();
|
|
5424
|
+
if (!lower.includes("failed to load model")) return false;
|
|
5425
|
+
return lower.includes("insufficient system resources") || lower.includes("overload your system") || lower.includes("loading guardrails");
|
|
5426
|
+
}
|
|
5427
|
+
function buildLMStudioRequestError(kind, model, status, statusText, body) {
|
|
5428
|
+
const backendMessage = extractLMStudioErrorMessage(body);
|
|
5429
|
+
if (isModelLoadGuardrailError(backendMessage)) {
|
|
5430
|
+
return new Error(
|
|
5431
|
+
[
|
|
5432
|
+
`LM Studio could not load model "${model}" due to insufficient system resources (model loading guardrails).`,
|
|
5433
|
+
"In LM Studio: unload other models, reduce loaded context length, or relax model loading guardrails in Settings.",
|
|
5434
|
+
`Backend error: ${backendMessage}`
|
|
5435
|
+
].join(" ")
|
|
5436
|
+
);
|
|
5437
|
+
}
|
|
5438
|
+
const suffix = backendMessage ? ` ${backendMessage}` : "";
|
|
5439
|
+
return new Error(`LM Studio ${kind} failed (${status} ${statusText})${suffix}`.trim());
|
|
5440
|
+
}
|
|
5441
|
+
function buildChatCompletionBody(model, prompt, options, stream, includeSampling) {
|
|
5442
|
+
const messages = options?.think === false ? [
|
|
5443
|
+
{ role: "system", content: NON_THINKING_SYSTEM_PROMPT },
|
|
5444
|
+
{ role: "user", content: prompt }
|
|
5445
|
+
] : [{ role: "user", content: prompt }];
|
|
5446
|
+
return {
|
|
5447
|
+
model,
|
|
5448
|
+
messages,
|
|
5449
|
+
temperature: options?.temperature ?? 0,
|
|
5450
|
+
...includeSampling && options?.top_p !== void 0 ? { top_p: options.top_p } : {},
|
|
5451
|
+
...includeSampling && options?.seed !== void 0 ? { seed: options.seed } : {},
|
|
5452
|
+
max_tokens: options?.num_predict ?? 512,
|
|
5453
|
+
stream,
|
|
5454
|
+
...stream ? { stream_options: { include_usage: true } } : {},
|
|
5455
|
+
...buildThinkingConfig(options?.think)
|
|
5456
|
+
};
|
|
5457
|
+
}
|
|
5356
5458
|
function parseNonNegativeInt(value) {
|
|
5357
5459
|
if (!/^\d+$/.test(value)) return null;
|
|
5358
5460
|
const parsed = Number.parseInt(value, 10);
|
|
@@ -5437,7 +5539,7 @@ async function pathIsDirectory(targetPath) {
|
|
|
5437
5539
|
try {
|
|
5438
5540
|
const stat = await promises.stat(targetPath);
|
|
5439
5541
|
return stat.isDirectory();
|
|
5440
|
-
} catch {
|
|
5542
|
+
} catch (_err) {
|
|
5441
5543
|
return false;
|
|
5442
5544
|
}
|
|
5443
5545
|
}
|
|
@@ -5837,27 +5939,30 @@ async function generate2(model, prompt, options) {
|
|
|
5837
5939
|
try {
|
|
5838
5940
|
const baseUrl = getLMStudioBaseUrl();
|
|
5839
5941
|
const url = new URL("/v1/chat/completions", baseUrl);
|
|
5840
|
-
const
|
|
5942
|
+
const doRequest = (includeSampling) => fetch(url, {
|
|
5841
5943
|
method: "POST",
|
|
5842
5944
|
headers: getLMStudioHeaders(),
|
|
5843
|
-
body: JSON.stringify(
|
|
5844
|
-
model,
|
|
5845
|
-
messages: [{ role: "user", content: prompt }],
|
|
5846
|
-
temperature: options?.temperature ?? 0,
|
|
5847
|
-
max_tokens: options?.num_predict ?? 512,
|
|
5848
|
-
stream: false,
|
|
5849
|
-
...buildThinkingConfig(options?.think)
|
|
5850
|
-
}),
|
|
5945
|
+
body: JSON.stringify(buildChatCompletionBody(model, prompt, options, false, includeSampling)),
|
|
5851
5946
|
signal: controller.signal
|
|
5852
5947
|
});
|
|
5948
|
+
let resp = await doRequest(true);
|
|
5853
5949
|
if (!resp.ok) {
|
|
5854
5950
|
const body = await resp.text().catch(() => "");
|
|
5855
|
-
|
|
5951
|
+
if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
|
|
5952
|
+
resp = await doRequest(false);
|
|
5953
|
+
} else {
|
|
5954
|
+
throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
|
|
5955
|
+
}
|
|
5956
|
+
}
|
|
5957
|
+
if (!resp.ok) {
|
|
5958
|
+
const body = await resp.text().catch(() => "");
|
|
5959
|
+
throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
|
|
5856
5960
|
}
|
|
5857
5961
|
const payload = await resp.json();
|
|
5858
5962
|
const choice = extractChoice2(payload);
|
|
5859
5963
|
const response = extractContent(choice);
|
|
5860
5964
|
const reasoning = extractReasoning(choice);
|
|
5965
|
+
assertThinkingModeRespected(model, options?.think, response, reasoning);
|
|
5861
5966
|
const usage = extractUsage(payload);
|
|
5862
5967
|
const totalDuration = Math.max(0, Date.now() - start) * 1e6;
|
|
5863
5968
|
return {
|
|
@@ -5898,23 +6003,24 @@ async function generateStream2(model, prompt, callbacks, options) {
|
|
|
5898
6003
|
};
|
|
5899
6004
|
try {
|
|
5900
6005
|
resetStallTimer();
|
|
5901
|
-
const
|
|
6006
|
+
const doRequest = (includeSampling) => fetch(url, {
|
|
5902
6007
|
method: "POST",
|
|
5903
6008
|
headers: getLMStudioHeaders(),
|
|
5904
|
-
body: JSON.stringify(
|
|
5905
|
-
model,
|
|
5906
|
-
messages: [{ role: "user", content: prompt }],
|
|
5907
|
-
temperature: options?.temperature ?? 0,
|
|
5908
|
-
max_tokens: options?.num_predict ?? 512,
|
|
5909
|
-
stream: true,
|
|
5910
|
-
stream_options: { include_usage: true },
|
|
5911
|
-
...buildThinkingConfig(options?.think)
|
|
5912
|
-
}),
|
|
6009
|
+
body: JSON.stringify(buildChatCompletionBody(model, prompt, options, true, includeSampling)),
|
|
5913
6010
|
signal: controller.signal
|
|
5914
6011
|
});
|
|
6012
|
+
let resp = await doRequest(true);
|
|
5915
6013
|
if (!resp.ok) {
|
|
5916
6014
|
const body = await resp.text().catch(() => "");
|
|
5917
|
-
|
|
6015
|
+
if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
|
|
6016
|
+
resp = await doRequest(false);
|
|
6017
|
+
} else {
|
|
6018
|
+
throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
|
|
6019
|
+
}
|
|
6020
|
+
}
|
|
6021
|
+
if (!resp.ok) {
|
|
6022
|
+
const body = await resp.text().catch(() => "");
|
|
6023
|
+
throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
|
|
5918
6024
|
}
|
|
5919
6025
|
if (!resp.body) {
|
|
5920
6026
|
throw new Error("LM Studio stream response body is empty");
|
|
@@ -5994,6 +6100,7 @@ async function generateStream2(model, prompt, callbacks, options) {
|
|
|
5994
6100
|
evalCount: usage?.completion_tokens ?? 0,
|
|
5995
6101
|
evalDuration: Math.max(1, evalDurationMs) * 1e6
|
|
5996
6102
|
};
|
|
6103
|
+
assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
|
|
5997
6104
|
callbacks?.onDone?.(result2);
|
|
5998
6105
|
return result2;
|
|
5999
6106
|
} catch (err) {
|
|
@@ -6017,7 +6124,7 @@ function abortOngoingRequests2() {
|
|
|
6017
6124
|
}
|
|
6018
6125
|
activeAbortControllers.clear();
|
|
6019
6126
|
}
|
|
6020
|
-
var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache;
|
|
6127
|
+
var DEFAULT_LM_STUDIO_BASE_URL, LM_STUDIO_INIT_TIMEOUT_MS, LM_STUDIO_METADATA_TIMEOUT_MS, DEFAULT_STREAM_STALL_TIMEOUT_MS, DEFAULT_LM_STUDIO_HOME_DIR, DEFAULT_LM_STUDIO_MODELS_DIR, LM_STUDIO_HOME_DIR_ENV, LM_STUDIO_MODELS_DIR_ENV, activeAbortControllers, directorySizeCache, modelDefinitionCache, NON_THINKING_SYSTEM_PROMPT;
|
|
6021
6128
|
var init_lm_studio_client = __esm({
|
|
6022
6129
|
"src/core/lm-studio-client.ts"() {
|
|
6023
6130
|
DEFAULT_LM_STUDIO_BASE_URL = "http://127.0.0.1:1234";
|
|
@@ -6031,6 +6138,12 @@ var init_lm_studio_client = __esm({
|
|
|
6031
6138
|
activeAbortControllers = /* @__PURE__ */ new Set();
|
|
6032
6139
|
directorySizeCache = /* @__PURE__ */ new Map();
|
|
6033
6140
|
modelDefinitionCache = /* @__PURE__ */ new Map();
|
|
6141
|
+
NON_THINKING_SYSTEM_PROMPT = [
|
|
6142
|
+
"You are in non-thinking mode for benchmark reproducibility.",
|
|
6143
|
+
"Return only the final answer.",
|
|
6144
|
+
"Do not output internal reasoning, chain-of-thought, or scratchpad.",
|
|
6145
|
+
"Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process."
|
|
6146
|
+
].join(" ");
|
|
6034
6147
|
}
|
|
6035
6148
|
});
|
|
6036
6149
|
|
|
@@ -6187,7 +6300,7 @@ var require_package = __commonJS({
|
|
|
6187
6300
|
"node_modules/systeminformation/package.json"(exports$1, module) {
|
|
6188
6301
|
module.exports = {
|
|
6189
6302
|
name: "systeminformation",
|
|
6190
|
-
version: "5.31.
|
|
6303
|
+
version: "5.31.2",
|
|
6191
6304
|
description: "Advanced, lightweight system and OS information library",
|
|
6192
6305
|
license: "MIT",
|
|
6193
6306
|
author: "Sebastian Hildebrandt <hildebrandt@plus-innovations.com> (https://plus-innovations.com)",
|
|
@@ -7520,6 +7633,38 @@ var require_util = __commonJS({
|
|
|
7520
7633
|
}
|
|
7521
7634
|
function getAppleModel(key) {
|
|
7522
7635
|
const appleModelIds = [
|
|
7636
|
+
{
|
|
7637
|
+
key: "Mac17,2",
|
|
7638
|
+
name: "MacBook",
|
|
7639
|
+
size: "14-inch",
|
|
7640
|
+
processor: "M5",
|
|
7641
|
+
year: "2025",
|
|
7642
|
+
additional: ""
|
|
7643
|
+
},
|
|
7644
|
+
{
|
|
7645
|
+
key: "Mac16,13",
|
|
7646
|
+
name: "MacBook Air",
|
|
7647
|
+
size: "15-inch",
|
|
7648
|
+
processor: "M4",
|
|
7649
|
+
year: "2025",
|
|
7650
|
+
additional: ""
|
|
7651
|
+
},
|
|
7652
|
+
{
|
|
7653
|
+
key: "Mac16,12",
|
|
7654
|
+
name: "MacBook Air",
|
|
7655
|
+
size: "13-inch",
|
|
7656
|
+
processor: "M4",
|
|
7657
|
+
year: "2025",
|
|
7658
|
+
additional: ""
|
|
7659
|
+
},
|
|
7660
|
+
{
|
|
7661
|
+
key: "Mac15,13",
|
|
7662
|
+
name: "MacBook Air",
|
|
7663
|
+
size: "15-inch",
|
|
7664
|
+
processor: "M3",
|
|
7665
|
+
year: "2024",
|
|
7666
|
+
additional: ""
|
|
7667
|
+
},
|
|
7523
7668
|
{
|
|
7524
7669
|
key: "Mac15,12",
|
|
7525
7670
|
name: "MacBook Air",
|
|
@@ -27226,6 +27371,38 @@ var init_progress = __esm({
|
|
|
27226
27371
|
}
|
|
27227
27372
|
});
|
|
27228
27373
|
|
|
27374
|
+
// src/benchmarks/profile.ts
|
|
27375
|
+
function withBenchmarkProfile(opts = {}) {
|
|
27376
|
+
return {
|
|
27377
|
+
temperature: BENCHMARK_PROFILE_TEMPERATURE,
|
|
27378
|
+
top_p: BENCHMARK_PROFILE_TOP_P,
|
|
27379
|
+
seed: BENCHMARK_PROFILE_SEED,
|
|
27380
|
+
...opts
|
|
27381
|
+
};
|
|
27382
|
+
}
|
|
27383
|
+
function buildBenchmarkProfileMetadata(thinkEnabled) {
|
|
27384
|
+
return {
|
|
27385
|
+
version: BENCHMARK_PROFILE_VERSION,
|
|
27386
|
+
sampling: {
|
|
27387
|
+
temperature: BENCHMARK_PROFILE_TEMPERATURE,
|
|
27388
|
+
topP: BENCHMARK_PROFILE_TOP_P,
|
|
27389
|
+
seed: BENCHMARK_PROFILE_SEED
|
|
27390
|
+
},
|
|
27391
|
+
thinkingMode: thinkEnabled ? "enabled" : "disabled",
|
|
27392
|
+
contextWindowTokens: null,
|
|
27393
|
+
contextPolicy: "runtime-default"
|
|
27394
|
+
};
|
|
27395
|
+
}
|
|
27396
|
+
var BENCHMARK_PROFILE_VERSION, BENCHMARK_PROFILE_SEED, BENCHMARK_PROFILE_TOP_P, BENCHMARK_PROFILE_TEMPERATURE;
|
|
27397
|
+
var init_profile = __esm({
|
|
27398
|
+
"src/benchmarks/profile.ts"() {
|
|
27399
|
+
BENCHMARK_PROFILE_VERSION = "v1";
|
|
27400
|
+
BENCHMARK_PROFILE_SEED = 42;
|
|
27401
|
+
BENCHMARK_PROFILE_TOP_P = 1;
|
|
27402
|
+
BENCHMARK_PROFILE_TEMPERATURE = 0;
|
|
27403
|
+
}
|
|
27404
|
+
});
|
|
27405
|
+
|
|
27229
27406
|
// src/benchmarks/performance.ts
|
|
27230
27407
|
async function optionalProbe(probe, fallback2) {
|
|
27231
27408
|
try {
|
|
@@ -27257,9 +27434,11 @@ async function runPerformanceBench(model, options = {}) {
|
|
|
27257
27434
|
]);
|
|
27258
27435
|
const warmup = await withTimeout(
|
|
27259
27436
|
generateStream3(model, WARMUP_PROMPT, void 0, {
|
|
27260
|
-
|
|
27261
|
-
|
|
27262
|
-
|
|
27437
|
+
...withBenchmarkProfile({
|
|
27438
|
+
num_predict: 32,
|
|
27439
|
+
think: options.think,
|
|
27440
|
+
stall_timeout_ms: options.streamStallTimeoutMs
|
|
27441
|
+
})
|
|
27263
27442
|
}),
|
|
27264
27443
|
warmupTimeoutMs,
|
|
27265
27444
|
"Model warmup",
|
|
@@ -27314,11 +27493,11 @@ async function runPerformanceBench(model, options = {}) {
|
|
|
27314
27493
|
}
|
|
27315
27494
|
}
|
|
27316
27495
|
},
|
|
27317
|
-
{
|
|
27496
|
+
withBenchmarkProfile({
|
|
27318
27497
|
num_predict: 256,
|
|
27319
27498
|
think: options.think,
|
|
27320
27499
|
stall_timeout_ms: options.streamStallTimeoutMs
|
|
27321
|
-
}
|
|
27500
|
+
})
|
|
27322
27501
|
),
|
|
27323
27502
|
promptTimeoutMs,
|
|
27324
27503
|
"Performance benchmark",
|
|
@@ -27431,6 +27610,7 @@ var init_performance = __esm({
|
|
|
27431
27610
|
init_hardware();
|
|
27432
27611
|
init_utils();
|
|
27433
27612
|
init_progress();
|
|
27613
|
+
init_profile();
|
|
27434
27614
|
WARMUP_PROMPT = "Say hello in one word.";
|
|
27435
27615
|
BENCH_PROMPTS = [
|
|
27436
27616
|
"Explain the concept of recursion in programming in 3 sentences.",
|
|
@@ -27823,7 +28003,7 @@ Answer:`;
|
|
|
27823
28003
|
const startTime = Date.now();
|
|
27824
28004
|
try {
|
|
27825
28005
|
const result2 = await withTimeout(
|
|
27826
|
-
generate3(model, prompt, {
|
|
28006
|
+
generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
27827
28007
|
timeoutMs,
|
|
27828
28008
|
"Reasoning question",
|
|
27829
28009
|
abortOngoingRequests3
|
|
@@ -27870,6 +28050,7 @@ var init_reasoning2 = __esm({
|
|
|
27870
28050
|
init_utils();
|
|
27871
28051
|
init_progress();
|
|
27872
28052
|
init_reasoning();
|
|
28053
|
+
init_profile();
|
|
27873
28054
|
questions = reasoning_default;
|
|
27874
28055
|
DEFAULT_REASONING_TIMEOUT_MS = 12e4;
|
|
27875
28056
|
}
|
|
@@ -28166,7 +28347,7 @@ Answer:`;
|
|
|
28166
28347
|
const startTime = Date.now();
|
|
28167
28348
|
try {
|
|
28168
28349
|
const result2 = await withTimeout(
|
|
28169
|
-
generate3(model, prompt, {
|
|
28350
|
+
generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
28170
28351
|
timeoutMs,
|
|
28171
28352
|
"Math problem",
|
|
28172
28353
|
abortOngoingRequests3
|
|
@@ -28213,6 +28394,7 @@ var init_math2 = __esm({
|
|
|
28213
28394
|
init_utils();
|
|
28214
28395
|
init_progress();
|
|
28215
28396
|
init_math();
|
|
28397
|
+
init_profile();
|
|
28216
28398
|
problems = math_default;
|
|
28217
28399
|
DEFAULT_MATH_TIMEOUT_MS = 12e4;
|
|
28218
28400
|
}
|
|
@@ -31985,7 +32167,7 @@ Reply with ONLY the function code, no explanation.`;
|
|
|
31985
32167
|
const startTime = Date.now();
|
|
31986
32168
|
try {
|
|
31987
32169
|
const result2 = await withTimeout(
|
|
31988
|
-
generate3(model, prompt, {
|
|
32170
|
+
generate3(model, prompt, withBenchmarkProfile({ num_predict: 2048, think: opts?.think })),
|
|
31989
32171
|
timeoutMs,
|
|
31990
32172
|
"Coding task",
|
|
31991
32173
|
abortOngoingRequests3
|
|
@@ -32045,6 +32227,7 @@ var init_coding2 = __esm({
|
|
|
32045
32227
|
init_utils();
|
|
32046
32228
|
init_progress();
|
|
32047
32229
|
init_coding();
|
|
32230
|
+
init_profile();
|
|
32048
32231
|
VALID_IDENTIFIER_RE = /^[a-zA-Z_$][a-zA-Z0-9_$]*$/;
|
|
32049
32232
|
tasks = coding_default;
|
|
32050
32233
|
DIFFICULTY_WEIGHT = {
|
|
@@ -32423,7 +32606,7 @@ async function runInstructionFollowingBench(model, opts) {
|
|
|
32423
32606
|
const startTime = Date.now();
|
|
32424
32607
|
try {
|
|
32425
32608
|
const result2 = await withTimeout(
|
|
32426
|
-
generate3(model, prompt, {
|
|
32609
|
+
generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
32427
32610
|
timeoutMs,
|
|
32428
32611
|
"Instruction following task",
|
|
32429
32612
|
abortOngoingRequests3
|
|
@@ -32469,6 +32652,7 @@ var init_instruction_following2 = __esm({
|
|
|
32469
32652
|
init_utils();
|
|
32470
32653
|
init_progress();
|
|
32471
32654
|
init_instruction_following();
|
|
32655
|
+
init_profile();
|
|
32472
32656
|
questions2 = instruction_following_default;
|
|
32473
32657
|
DEFAULT_INSTRUCTION_FOLLOWING_TIMEOUT_MS = 12e4;
|
|
32474
32658
|
}
|
|
@@ -32823,7 +33007,7 @@ async function runStructuredOutputBench(model, opts) {
|
|
|
32823
33007
|
const startTime = Date.now();
|
|
32824
33008
|
try {
|
|
32825
33009
|
const result2 = await withTimeout(
|
|
32826
|
-
generate3(model, q.prompt, {
|
|
33010
|
+
generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
32827
33011
|
timeoutMs,
|
|
32828
33012
|
"Structured output task",
|
|
32829
33013
|
abortOngoingRequests3
|
|
@@ -32869,6 +33053,7 @@ var init_structured_output2 = __esm({
|
|
|
32869
33053
|
init_utils();
|
|
32870
33054
|
init_progress();
|
|
32871
33055
|
init_structured_output();
|
|
33056
|
+
init_profile();
|
|
32872
33057
|
questions3 = structured_output_default;
|
|
32873
33058
|
DEFAULT_STRUCTURED_OUTPUT_TIMEOUT_MS = 12e4;
|
|
32874
33059
|
}
|
|
@@ -33090,7 +33275,7 @@ async function runMultilingualBench(model, opts) {
|
|
|
33090
33275
|
const startTime = Date.now();
|
|
33091
33276
|
try {
|
|
33092
33277
|
const result2 = await withTimeout(
|
|
33093
|
-
generate3(model, q.prompt, {
|
|
33278
|
+
generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
33094
33279
|
timeoutMs,
|
|
33095
33280
|
"Multilingual task",
|
|
33096
33281
|
abortOngoingRequests3
|
|
@@ -33138,6 +33323,7 @@ var init_multilingual2 = __esm({
|
|
|
33138
33323
|
init_utils();
|
|
33139
33324
|
init_progress();
|
|
33140
33325
|
init_multilingual();
|
|
33326
|
+
init_profile();
|
|
33141
33327
|
questions4 = multilingual_default;
|
|
33142
33328
|
DEFAULT_MULTILINGUAL_TIMEOUT_MS = 12e4;
|
|
33143
33329
|
NEGATION_PATTERNS = [
|
|
@@ -35323,6 +35509,25 @@ function getLevel(score) {
|
|
|
35323
35509
|
if (score >= 25) return "Weak";
|
|
35324
35510
|
return "Poor";
|
|
35325
35511
|
}
|
|
35512
|
+
function summarizeCategoryIssues(name, details) {
|
|
35513
|
+
let crashes = 0;
|
|
35514
|
+
let timeouts = 0;
|
|
35515
|
+
let errors = 0;
|
|
35516
|
+
for (const detail of details) {
|
|
35517
|
+
const actual = detail.actual ?? "";
|
|
35518
|
+
if (/^TIMEOUT\b/i.test(actual)) {
|
|
35519
|
+
timeouts++;
|
|
35520
|
+
continue;
|
|
35521
|
+
}
|
|
35522
|
+
if (/^ERROR:/i.test(actual)) {
|
|
35523
|
+
errors++;
|
|
35524
|
+
if (/model has crashed|has crashed without additional information|model crashed/i.test(actual)) {
|
|
35525
|
+
crashes++;
|
|
35526
|
+
}
|
|
35527
|
+
}
|
|
35528
|
+
}
|
|
35529
|
+
return { name, crashes, timeouts, errors };
|
|
35530
|
+
}
|
|
35326
35531
|
function printHardwareTable(hw) {
|
|
35327
35532
|
const table = new import_cli_table3.default({
|
|
35328
35533
|
head: [source_default.bold("Hardware"), source_default.bold("Value")],
|
|
@@ -35441,6 +35646,18 @@ function printQualityTable(quality, timePenalties) {
|
|
|
35441
35646
|
]);
|
|
35442
35647
|
}
|
|
35443
35648
|
console.log(table.toString());
|
|
35649
|
+
const issueSummaries = categories.map((cat) => summarizeCategoryIssues(cat.name, cat.result.details)).filter((summary) => summary.errors > 0 || summary.timeouts > 0);
|
|
35650
|
+
if (issueSummaries.length > 0) {
|
|
35651
|
+
console.log(source_default.yellow("Execution issues detected during quality benchmark:"));
|
|
35652
|
+
for (const summary of issueSummaries) {
|
|
35653
|
+
const parts = [];
|
|
35654
|
+
if (summary.crashes > 0) parts.push(`${summary.crashes} crash${summary.crashes > 1 ? "es" : ""}`);
|
|
35655
|
+
const nonCrashErrors = summary.errors - summary.crashes;
|
|
35656
|
+
if (nonCrashErrors > 0) parts.push(`${nonCrashErrors} error${nonCrashErrors > 1 ? "s" : ""}`);
|
|
35657
|
+
if (summary.timeouts > 0) parts.push(`${summary.timeouts} timeout${summary.timeouts > 1 ? "s" : ""}`);
|
|
35658
|
+
console.log(source_default.yellow(` \u2022 ${summary.name}: ${parts.join(", ")} (scored as incorrect)`));
|
|
35659
|
+
}
|
|
35660
|
+
}
|
|
35444
35661
|
}
|
|
35445
35662
|
function printSummaryTable(results) {
|
|
35446
35663
|
const termWidth = process.stdout.columns || 80;
|
|
@@ -52270,6 +52487,11 @@ async function benchCommand(options) {
|
|
|
52270
52487
|
if (!silent && thinkEnabled) {
|
|
52271
52488
|
infoMsg("Thinking mode enabled \u2014 models that support it will use extended reasoning.");
|
|
52272
52489
|
}
|
|
52490
|
+
if (!silent) {
|
|
52491
|
+
infoMsg(
|
|
52492
|
+
`Benchmark profile ${BENCHMARK_PROFILE_VERSION}: temperature=0, top_p=1, seed=42, context=runtime default.`
|
|
52493
|
+
);
|
|
52494
|
+
}
|
|
52273
52495
|
try {
|
|
52274
52496
|
const results = [];
|
|
52275
52497
|
const failedModels = [];
|
|
@@ -52358,7 +52580,8 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
|
|
|
52358
52580
|
promptPackVersion: PROMPT_PACK_VERSION,
|
|
52359
52581
|
runtimeVersion,
|
|
52360
52582
|
runtimeBackend: getRuntimeName(),
|
|
52361
|
-
modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat()
|
|
52583
|
+
modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat(),
|
|
52584
|
+
benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
|
|
52362
52585
|
}
|
|
52363
52586
|
};
|
|
52364
52587
|
const rawLogHash = createHash("sha256").update(JSON.stringify(partialResult)).digest("hex");
|
|
@@ -52533,7 +52756,8 @@ var init_bench = __esm({
|
|
|
52533
52756
|
init_telemetry();
|
|
52534
52757
|
init_terminal();
|
|
52535
52758
|
init_thinking_prompt();
|
|
52536
|
-
|
|
52759
|
+
init_profile();
|
|
52760
|
+
BENCHMARK_SPEC_VERSION = "0.2.1";
|
|
52537
52761
|
PROMPT_PACK_VERSION = "0.1.0";
|
|
52538
52762
|
}
|
|
52539
52763
|
});
|
|
@@ -53794,7 +54018,7 @@ var init_cli_main = __esm({
|
|
|
53794
54018
|
program2 = new Command();
|
|
53795
54019
|
program2.name("metrillm").description(
|
|
53796
54020
|
"Benchmark local LLMs for hardware fit and task quality, then compute a global verdict"
|
|
53797
|
-
).version("0.1
|
|
54021
|
+
).version("0.2.1").hook("preAction", (_thisCommand, actionCommand) => {
|
|
53798
54022
|
if (!actionCommand.opts()?.json) printBanner();
|
|
53799
54023
|
});
|
|
53800
54024
|
program2.option(
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "metrillm",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "Benchmark your local LLM models — speed, quality & hardware fitness verdict",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -49,7 +49,7 @@
|
|
|
49
49
|
"ollama": "^0.5.12",
|
|
50
50
|
"ora": "^8.1.1",
|
|
51
51
|
"posthog-node": "^5.26.0",
|
|
52
|
-
"systeminformation": "^5.
|
|
52
|
+
"systeminformation": "^5.31.2"
|
|
53
53
|
},
|
|
54
54
|
"devDependencies": {
|
|
55
55
|
"@types/node": "^22.10.0",
|