metrillm-mcp 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +228 -49
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -110,7 +110,8 @@ function stripThinkTags(text) {
|
|
|
110
110
|
}
|
|
111
111
|
/**
 * Reports whether a model reply carries "thinking" output.
 *
 * A reply counts as containing thinking content when any of these hold:
 *   - the separate thinking field is a non-blank string,
 *   - the response text contains an opening `<think` / `<thinking` tag,
 *   - the response text starts with a "Thinking process:" or
 *     "Thought process:" header (case-insensitive).
 *
 * @param {string} response - Visible response text.
 * @param {string} [thinkingField] - Separate thinking text, if any.
 * @returns {boolean} True when thinking content is detected.
 */
function hasThinkingContent(response, thinkingField) {
  const fieldHasText = Boolean(thinkingField) && thinkingField.trim().length > 0;
  if (fieldHasText) {
    return true;
  }

  const thinkTagPattern = /<think(?:ing)?[\s>]/i;
  const processHeaderPattern = /^\s*(?:thinking|thought)\s+process\s*:/i;
  return thinkTagPattern.test(response) || processHeaderPattern.test(response);
}
|
|
115
116
|
function estimateTokenCount(text) {
|
|
116
117
|
if (!text) return 0;
|
|
@@ -564,25 +565,50 @@ var defaultKeepAlive;
|
|
|
564
565
|
function setDefaultKeepAlive(keepAlive) {
|
|
565
566
|
defaultKeepAlive = keepAlive;
|
|
566
567
|
}
|
|
568
|
+
/**
 * Tells whether the caller supplied any sampling override.
 *
 * @param {{top_p?: number, seed?: number}} [options] - Generation options.
 * @returns {boolean} True when `top_p` or `seed` is anything other than
 *   `undefined` (note that `null` counts as supplied).
 */
function hasSamplingOverrides(options) {
  return [options?.top_p, options?.seed].some((value) => value !== undefined);
}
|
|
571
|
+
/**
 * Heuristically decides whether an error means "the runtime rejected a
 * sampling option".
 *
 * The lower-cased message must both name a sampling option (`seed`, `top_p`
 * or `topp` as a whole word) and contain a rejection keyword.
 *
 * @param {unknown} err - Thrown value; non-Error values are stringified.
 * @returns {boolean} True when the message matches both patterns.
 */
function isUnsupportedSamplingOptionError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  const samplingOptionPattern = /\b(seed|top_p|topp)\b/;
  const rejectionPattern = /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/;
  return samplingOptionPattern.test(text) && rejectionPattern.test(text);
}
|
|
578
|
+
/**
 * Assembles the streaming request object handed to `client.generate`.
 *
 * @param {string} model - Model identifier.
 * @param {string} prompt - Prompt text.
 * @param {object} [options] - Optional overrides: `keep_alive`, `think`,
 *   `temperature`, `top_p`, `seed`, `num_predict`.
 * @param {boolean} includeSampling - When falsy, `top_p` and `seed` are left
 *   out even if the caller supplied them.
 * @returns {object} Request with `stream: true`; `keep_alive` falls back to
 *   the module-level default, `temperature` to 0 and `num_predict` to 512.
 */
function buildGenerateRequest(model, prompt, options, includeSampling) {
  // Keys are added in a fixed order so the serialized request is stable.
  const runtimeOptions = { temperature: options?.temperature ?? 0 };
  if (includeSampling && options?.top_p !== undefined) {
    runtimeOptions.top_p = options.top_p;
  }
  if (includeSampling && options?.seed !== undefined) {
    runtimeOptions.seed = options.seed;
  }
  runtimeOptions.num_predict = options?.num_predict ?? 512;

  const request = {
    model,
    prompt,
    stream: true,
    keep_alive: options?.keep_alive ?? defaultKeepAlive,
  };
  // `think` is only sent when the caller explicitly set it.
  if (options?.think !== undefined) {
    request.think = options.think;
  }
  request.options = runtimeOptions;
  return request;
}
|
|
567
593
|
/**
 * Runs a generation without stream callbacks by delegating to
 * `generateStream`.
 *
 * @param {string} model - Model identifier.
 * @param {string} prompt - Prompt text.
 * @param {object} [options] - Generation options, forwarded unchanged.
 * @returns {Promise<*>} Whatever `generateStream` resolves to.
 */
async function generate(model, prompt, options) {
  const noCallbacks = undefined;
  return generateStream(model, prompt, noCallbacks, options);
}
|
|
570
596
|
async function generateStream(model, prompt, callbacks, options) {
|
|
571
|
-
const
|
|
572
|
-
client.generate(
|
|
573
|
-
model,
|
|
574
|
-
prompt,
|
|
575
|
-
stream: true,
|
|
576
|
-
keep_alive: options?.keep_alive ?? defaultKeepAlive,
|
|
577
|
-
...options?.think !== void 0 ? { think: options.think } : {},
|
|
578
|
-
options: {
|
|
579
|
-
temperature: options?.temperature ?? 0,
|
|
580
|
-
num_predict: options?.num_predict ?? 512
|
|
581
|
-
}
|
|
582
|
-
}),
|
|
597
|
+
const initializeStream = (includeSampling) => withTimeout(
|
|
598
|
+
client.generate(buildGenerateRequest(model, prompt, options, includeSampling)),
|
|
583
599
|
OLLAMA_INIT_TIMEOUT_MS,
|
|
584
600
|
"Ollama generate initialization"
|
|
585
601
|
);
|
|
602
|
+
let stream;
|
|
603
|
+
try {
|
|
604
|
+
stream = await initializeStream(true);
|
|
605
|
+
} catch (err) {
|
|
606
|
+
if (hasSamplingOverrides(options) && isUnsupportedSamplingOptionError(err)) {
|
|
607
|
+
stream = await initializeStream(false);
|
|
608
|
+
} else {
|
|
609
|
+
throw err;
|
|
610
|
+
}
|
|
611
|
+
}
|
|
586
612
|
let fullResponse = "";
|
|
587
613
|
let fullThinking = "";
|
|
588
614
|
let result = null;
|
|
@@ -668,6 +694,29 @@ var defaultKeepAlive2;
|
|
|
668
694
|
var activeAbortControllers = /* @__PURE__ */ new Set();
|
|
669
695
|
var directorySizeCache = /* @__PURE__ */ new Map();
|
|
670
696
|
var modelDefinitionCache = /* @__PURE__ */ new Map();
|
|
697
|
+
// System prompt sent when thinking is explicitly disabled; the sentences are
// joined with single spaces into one line.
var NON_THINKING_SYSTEM_PROMPT =
  "You are in non-thinking mode for benchmark reproducibility." +
  " " +
  "Return only the final answer." +
  " " +
  "Do not output internal reasoning, chain-of-thought, or scratchpad." +
  " " +
  "Never output tags or sections like <think>, </think>, [THINK], [/THINK], or Thinking Process.";
|
|
703
|
+
/**
 * Detects plain-text signs of leaked reasoning in a response.
 *
 * Matches either a leading "Thinking process:" / "Thought process:" header
 * or a bracketed `[THINK]`, `[/THINK]`, `[THINKING]`, `[/THINKING]` marker
 * anywhere in the text (case-insensitive).
 *
 * @param {string} response - Response text to inspect.
 * @returns {boolean} True when one of the leak patterns matches.
 */
function hasThinkingLeakText(response) {
  const leakPatterns = [
    /^\s*(?:thinking|thought)\s+process\s*:/i,
    /\[(?:\/)?THINK(?:ING)?\]/i,
  ];
  return leakPatterns.some((pattern) => pattern.test(response));
}
|
|
706
|
+
/**
 * Throws when non-thinking mode was requested but the model still produced
 * thinking output.
 *
 * Nothing is checked unless `think` is exactly `false`. Otherwise the call
 * fails if the reasoning text is non-blank, the response contains a
 * `<think` / `<thinking` tag, or `hasThinkingLeakText` flags the response.
 *
 * @param {string} model - Model name, quoted in the error message.
 * @param {boolean|undefined} think - Requested thinking mode.
 * @param {string} response - Visible response text.
 * @param {string} reasoning - Separate reasoning text returned by the model.
 * @returns {void}
 * @throws {Error} With LM Studio remediation guidance when thinking leaked.
 */
function assertThinkingModeRespected(model, think, response, reasoning) {
  if (think !== false) {
    return;
  }

  const leaked =
    reasoning.trim().length > 0 ||
    /<think(?:ing)?[\s>]/i.test(response) ||
    hasThinkingLeakText(response);
  if (!leaked) {
    return;
  }

  const guidance = [
    `LM Studio model "${model}" still emitted thinking content while non-thinking mode is requested.`,
    "In LM Studio, add this at the top of the model chat template: {%- set enable_thinking = false %}.",
    "If this model does not expose a Prompt/Chat Template editor in LM Studio (e.g. some GPT-OSS builds), non-thinking mode cannot be enforced from the API.",
    "Use --thinking for this model, or benchmark a model/runtime that supports explicit non-thinking control.",
    "Then eject/reload the model and run the benchmark again.",
  ];
  throw new Error(guidance.join(" "));
}
|
|
671
720
|
function buildThinkingConfig(think) {
|
|
672
721
|
if (think === void 0) return {};
|
|
673
722
|
const effort = think ? "high" : "low";
|
|
@@ -677,6 +726,65 @@ function buildThinkingConfig(think) {
|
|
|
677
726
|
reasoning: { effort }
|
|
678
727
|
};
|
|
679
728
|
}
|
|
729
|
+
/**
 * Tells whether the caller supplied `top_p` or `seed`.
 *
 * @param {{top_p?: number, seed?: number}} [options] - Generation options.
 * @returns {boolean} False only when both values are `undefined`.
 */
function hasSamplingOverrides2(options) {
  const topP = options?.top_p;
  const seed = options?.seed;
  return !(topP === undefined && seed === undefined);
}
|
|
732
|
+
/**
 * Heuristically decides whether an HTTP error response means a sampling
 * option was rejected.
 *
 * Only 400 and 422 responses are considered. The lower-cased body must name
 * a sampling option (`seed`, `top_p` or `topp` as a whole word) and contain
 * a rejection keyword.
 *
 * @param {number} status - HTTP status code of the failed response.
 * @param {string} text - Response body text.
 * @returns {boolean} True when the response looks like a sampling rejection.
 */
function isUnsupportedSamplingMessage(status, text) {
  const isValidationStatus = status === 400 || status === 422;
  if (!isValidationStatus) {
    return false;
  }

  const normalized = text.toLowerCase();
  const namesSamplingOption = /\b(seed|top_p|topp)\b/.test(normalized);
  return (
    namesSamplingOption &&
    /unrecognized|unknown|not support|unsupported|invalid|unexpected|additional|extra/.test(normalized)
  );
}
|
|
739
|
+
/**
 * Pulls a human-readable error message out of an error response body.
 *
 * Recognized JSON shapes:
 *   - `{"error": {"message": "..."}}`
 *   - `{"error": "..."}` (error given directly as a string)
 *
 * Anything else (non-JSON text, JSON without a usable message) is returned
 * as the trimmed raw body so no backend detail is lost.
 *
 * @param {string} body - Raw response body text.
 * @returns {string} Extracted message, the trimmed body, or "" when blank.
 */
function extractLMStudioErrorMessage(body) {
  const trimmed = body.trim();
  if (!trimmed) return "";
  try {
    const parsed = JSON.parse(trimmed);
    const error = parsed?.error;
    // Accept both the object form and the bare-string form of `error`.
    const message = typeof error === "string" ? error : error?.message;
    if (typeof message === "string" && message.trim().length > 0) {
      return message.trim();
    }
  } catch {
    // Body is not valid JSON; fall through and return the raw text.
  }
  return trimmed;
}
|
|
752
|
+
/**
 * Recognizes a model-load failure caused by resource guardrails.
 *
 * The message (case-insensitive) must contain "failed to load model" plus
 * at least one of the known resource/guardrail phrases.
 *
 * @param {string} message - Backend error message.
 * @returns {boolean} True when the message matches that failure.
 */
function isModelLoadGuardrailError(message) {
  const normalized = message.toLowerCase();
  const guardrailHints = [
    "insufficient system resources",
    "overload your system",
    "loading guardrails",
  ];
  return (
    normalized.includes("failed to load model") &&
    guardrailHints.some((hint) => normalized.includes(hint))
  );
}
|
|
757
|
+
/**
 * Builds the Error reported for a failed LM Studio HTTP request.
 *
 * Resource-guardrail load failures get a dedicated message with remediation
 * steps. Every other failure gets a generic "<kind> failed (<status>)" message
 * followed by the backend message when one is available.
 *
 * @param {string} kind - Request label used in the generic message.
 * @param {string} model - Model name, quoted in the guardrail message.
 * @param {number} status - HTTP status code.
 * @param {string} statusText - HTTP status text.
 * @param {string} body - Raw response body text.
 * @returns {Error} The constructed (not thrown) error.
 */
function buildLMStudioRequestError(kind, model, status, statusText, body) {
  const backendMessage = extractLMStudioErrorMessage(body);

  if (!isModelLoadGuardrailError(backendMessage)) {
    const detail = backendMessage ? ` ${backendMessage}` : "";
    return new Error(`LM Studio ${kind} failed (${status} ${statusText})${detail}`.trim());
  }

  const explanation = [
    `LM Studio could not load model "${model}" due to insufficient system resources (model loading guardrails).`,
    "In LM Studio: unload other models, reduce loaded context length, or relax model loading guardrails in Settings.",
    `Backend error: ${backendMessage}`,
  ];
  return new Error(explanation.join(" "));
}
|
|
771
|
+
/**
 * Assembles the JSON body for a `/v1/chat/completions` request.
 *
 * @param {string} model - Model identifier.
 * @param {string} prompt - User prompt text.
 * @param {object} [options] - Optional overrides: `think`, `temperature`,
 *   `top_p`, `seed`, `num_predict`.
 * @param {boolean} stream - Whether to request a streamed response; when
 *   truthy, usage reporting is requested via `stream_options`.
 * @param {boolean} includeSampling - When falsy, `top_p` and `seed` are left
 *   out even if the caller supplied them.
 * @returns {object} Request body. When `think` is exactly `false`, a system
 *   message with the non-thinking prompt precedes the user message.
 */
function buildChatCompletionBody(model, prompt, options, stream, includeSampling) {
  const messages = [{ role: "user", content: prompt }];
  if (options?.think === false) {
    messages.unshift({ role: "system", content: NON_THINKING_SYSTEM_PROMPT });
  }

  // Keys are added in a fixed order so the serialized body is stable.
  const body = {
    model,
    messages,
    temperature: options?.temperature ?? 0,
  };
  if (includeSampling && options?.top_p !== undefined) {
    body.top_p = options.top_p;
  }
  if (includeSampling && options?.seed !== undefined) {
    body.seed = options.seed;
  }
  body.max_tokens = options?.num_predict ?? 512;
  body.stream = stream;
  if (stream) {
    body.stream_options = { include_usage: true };
  }
  // Thinking-related fields come last.
  Object.assign(body, buildThinkingConfig(options?.think));
  return body;
}
|
|
680
788
|
function parseNonNegativeInt(value) {
|
|
681
789
|
if (!/^\d+$/.test(value)) return null;
|
|
682
790
|
const parsed = Number.parseInt(value, 10);
|
|
@@ -761,7 +869,7 @@ async function pathIsDirectory(targetPath) {
|
|
|
761
869
|
try {
|
|
762
870
|
const stat = await fs.stat(targetPath);
|
|
763
871
|
return stat.isDirectory();
|
|
764
|
-
} catch {
|
|
872
|
+
} catch (_err) {
|
|
765
873
|
return false;
|
|
766
874
|
}
|
|
767
875
|
}
|
|
@@ -1165,27 +1273,30 @@ async function generate2(model, prompt, options) {
|
|
|
1165
1273
|
try {
|
|
1166
1274
|
const baseUrl = getLMStudioBaseUrl();
|
|
1167
1275
|
const url = new URL("/v1/chat/completions", baseUrl);
|
|
1168
|
-
const
|
|
1276
|
+
const doRequest = (includeSampling) => fetch(url, {
|
|
1169
1277
|
method: "POST",
|
|
1170
1278
|
headers: getLMStudioHeaders(),
|
|
1171
|
-
body: JSON.stringify(
|
|
1172
|
-
model,
|
|
1173
|
-
messages: [{ role: "user", content: prompt }],
|
|
1174
|
-
temperature: options?.temperature ?? 0,
|
|
1175
|
-
max_tokens: options?.num_predict ?? 512,
|
|
1176
|
-
stream: false,
|
|
1177
|
-
...buildThinkingConfig(options?.think)
|
|
1178
|
-
}),
|
|
1279
|
+
body: JSON.stringify(buildChatCompletionBody(model, prompt, options, false, includeSampling)),
|
|
1179
1280
|
signal: controller.signal
|
|
1180
1281
|
});
|
|
1282
|
+
let resp = await doRequest(true);
|
|
1181
1283
|
if (!resp.ok) {
|
|
1182
1284
|
const body = await resp.text().catch(() => "");
|
|
1183
|
-
|
|
1285
|
+
if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
|
|
1286
|
+
resp = await doRequest(false);
|
|
1287
|
+
} else {
|
|
1288
|
+
throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
|
|
1289
|
+
}
|
|
1290
|
+
}
|
|
1291
|
+
if (!resp.ok) {
|
|
1292
|
+
const body = await resp.text().catch(() => "");
|
|
1293
|
+
throw buildLMStudioRequestError("generate", model, resp.status, resp.statusText, body);
|
|
1184
1294
|
}
|
|
1185
1295
|
const payload = await resp.json();
|
|
1186
1296
|
const choice = extractChoice2(payload);
|
|
1187
1297
|
const response = extractContent(choice);
|
|
1188
1298
|
const reasoning = extractReasoning(choice);
|
|
1299
|
+
assertThinkingModeRespected(model, options?.think, response, reasoning);
|
|
1189
1300
|
const usage = extractUsage(payload);
|
|
1190
1301
|
const totalDuration = Math.max(0, Date.now() - start) * 1e6;
|
|
1191
1302
|
return {
|
|
@@ -1226,23 +1337,24 @@ async function generateStream2(model, prompt, callbacks, options) {
|
|
|
1226
1337
|
};
|
|
1227
1338
|
try {
|
|
1228
1339
|
resetStallTimer();
|
|
1229
|
-
const
|
|
1340
|
+
const doRequest = (includeSampling) => fetch(url, {
|
|
1230
1341
|
method: "POST",
|
|
1231
1342
|
headers: getLMStudioHeaders(),
|
|
1232
|
-
body: JSON.stringify(
|
|
1233
|
-
model,
|
|
1234
|
-
messages: [{ role: "user", content: prompt }],
|
|
1235
|
-
temperature: options?.temperature ?? 0,
|
|
1236
|
-
max_tokens: options?.num_predict ?? 512,
|
|
1237
|
-
stream: true,
|
|
1238
|
-
stream_options: { include_usage: true },
|
|
1239
|
-
...buildThinkingConfig(options?.think)
|
|
1240
|
-
}),
|
|
1343
|
+
body: JSON.stringify(buildChatCompletionBody(model, prompt, options, true, includeSampling)),
|
|
1241
1344
|
signal: controller.signal
|
|
1242
1345
|
});
|
|
1346
|
+
let resp = await doRequest(true);
|
|
1243
1347
|
if (!resp.ok) {
|
|
1244
1348
|
const body = await resp.text().catch(() => "");
|
|
1245
|
-
|
|
1349
|
+
if (hasSamplingOverrides2(options) && isUnsupportedSamplingMessage(resp.status, body)) {
|
|
1350
|
+
resp = await doRequest(false);
|
|
1351
|
+
} else {
|
|
1352
|
+
throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
|
|
1353
|
+
}
|
|
1354
|
+
}
|
|
1355
|
+
if (!resp.ok) {
|
|
1356
|
+
const body = await resp.text().catch(() => "");
|
|
1357
|
+
throw buildLMStudioRequestError("stream", model, resp.status, resp.statusText, body);
|
|
1246
1358
|
}
|
|
1247
1359
|
if (!resp.body) {
|
|
1248
1360
|
throw new Error("LM Studio stream response body is empty");
|
|
@@ -1322,6 +1434,7 @@ async function generateStream2(model, prompt, callbacks, options) {
|
|
|
1322
1434
|
evalCount: usage?.completion_tokens ?? 0,
|
|
1323
1435
|
evalDuration: Math.max(1, evalDurationMs) * 1e6
|
|
1324
1436
|
};
|
|
1437
|
+
assertThinkingModeRespected(model, options?.think, fullResponse, fullThinking);
|
|
1325
1438
|
callbacks?.onDone?.(result);
|
|
1326
1439
|
return result;
|
|
1327
1440
|
} catch (err) {
|
|
@@ -1816,6 +1929,33 @@ function errorMsg(text) {
|
|
|
1816
1929
|
console.log(chalk.red(` ${CROSS_MARK} ${text}`));
|
|
1817
1930
|
}
|
|
1818
1931
|
|
|
1932
|
+
// ../src/benchmarks/profile.ts
|
|
1933
|
+
// Fixed sampling profile applied to benchmark requests.
var BENCHMARK_PROFILE_VERSION = "v1";
var BENCHMARK_PROFILE_SEED = 42;
var BENCHMARK_PROFILE_TOP_P = 1;
var BENCHMARK_PROFILE_TEMPERATURE = 0;

/**
 * Layers caller options on top of the benchmark sampling defaults.
 *
 * @param {object} [opts={}] - Per-call options; any key given here overrides
 *   the profile default of the same name.
 * @returns {object} New object with `temperature`, `top_p` and `seed`
 *   defaults followed by the caller's options.
 */
function withBenchmarkProfile(opts = {}) {
  const profileDefaults = {
    temperature: BENCHMARK_PROFILE_TEMPERATURE,
    top_p: BENCHMARK_PROFILE_TOP_P,
    seed: BENCHMARK_PROFILE_SEED,
  };
  return Object.assign(profileDefaults, opts);
}

/**
 * Describes the benchmark profile for inclusion in result metadata.
 *
 * @param {boolean} thinkEnabled - Whether thinking mode was enabled.
 * @returns {object} Profile version, sampling values, thinking mode
 *   ("enabled" / "disabled") and context settings.
 */
function buildBenchmarkProfileMetadata(thinkEnabled) {
  const sampling = {
    temperature: BENCHMARK_PROFILE_TEMPERATURE,
    topP: BENCHMARK_PROFILE_TOP_P,
    seed: BENCHMARK_PROFILE_SEED,
  };

  let thinkingMode = "disabled";
  if (thinkEnabled) {
    thinkingMode = "enabled";
  }

  return {
    version: BENCHMARK_PROFILE_VERSION,
    sampling,
    thinkingMode,
    contextWindowTokens: null,
    contextPolicy: "runtime-default",
  };
}
|
|
1958
|
+
|
|
1819
1959
|
// ../src/benchmarks/performance.ts
|
|
1820
1960
|
var WARMUP_PROMPT = "Say hello in one word.";
|
|
1821
1961
|
var BENCH_PROMPTS = [
|
|
@@ -1857,9 +1997,11 @@ async function runPerformanceBench(model, options = {}) {
|
|
|
1857
1997
|
]);
|
|
1858
1998
|
const warmup = await withTimeout(
|
|
1859
1999
|
generateStream3(model, WARMUP_PROMPT, void 0, {
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
2000
|
+
...withBenchmarkProfile({
|
|
2001
|
+
num_predict: 32,
|
|
2002
|
+
think: options.think,
|
|
2003
|
+
stall_timeout_ms: options.streamStallTimeoutMs
|
|
2004
|
+
})
|
|
1863
2005
|
}),
|
|
1864
2006
|
warmupTimeoutMs,
|
|
1865
2007
|
"Model warmup",
|
|
@@ -1914,11 +2056,11 @@ async function runPerformanceBench(model, options = {}) {
|
|
|
1914
2056
|
}
|
|
1915
2057
|
}
|
|
1916
2058
|
},
|
|
1917
|
-
{
|
|
2059
|
+
withBenchmarkProfile({
|
|
1918
2060
|
num_predict: 256,
|
|
1919
2061
|
think: options.think,
|
|
1920
2062
|
stall_timeout_ms: options.streamStallTimeoutMs
|
|
1921
|
-
}
|
|
2063
|
+
})
|
|
1922
2064
|
),
|
|
1923
2065
|
promptTimeoutMs,
|
|
1924
2066
|
"Performance benchmark",
|
|
@@ -2401,7 +2543,7 @@ Answer:`;
|
|
|
2401
2543
|
const startTime = Date.now();
|
|
2402
2544
|
try {
|
|
2403
2545
|
const result = await withTimeout(
|
|
2404
|
-
generate3(model, prompt, {
|
|
2546
|
+
generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
2405
2547
|
timeoutMs,
|
|
2406
2548
|
"Reasoning question",
|
|
2407
2549
|
abortOngoingRequests3
|
|
@@ -2730,7 +2872,7 @@ Answer:`;
|
|
|
2730
2872
|
const startTime = Date.now();
|
|
2731
2873
|
try {
|
|
2732
2874
|
const result = await withTimeout(
|
|
2733
|
-
generate3(model, prompt, {
|
|
2875
|
+
generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
2734
2876
|
timeoutMs,
|
|
2735
2877
|
"Math problem",
|
|
2736
2878
|
abortOngoingRequests3
|
|
@@ -6613,7 +6755,7 @@ Reply with ONLY the function code, no explanation.`;
|
|
|
6613
6755
|
const startTime = Date.now();
|
|
6614
6756
|
try {
|
|
6615
6757
|
const result = await withTimeout(
|
|
6616
|
-
generate3(model, prompt, {
|
|
6758
|
+
generate3(model, prompt, withBenchmarkProfile({ num_predict: 2048, think: opts?.think })),
|
|
6617
6759
|
timeoutMs,
|
|
6618
6760
|
"Coding task",
|
|
6619
6761
|
abortOngoingRequests3
|
|
@@ -6968,7 +7110,7 @@ async function runInstructionFollowingBench(model, opts) {
|
|
|
6968
7110
|
const startTime = Date.now();
|
|
6969
7111
|
try {
|
|
6970
7112
|
const result = await withTimeout(
|
|
6971
|
-
generate3(model, prompt, {
|
|
7113
|
+
generate3(model, prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
6972
7114
|
timeoutMs,
|
|
6973
7115
|
"Instruction following task",
|
|
6974
7116
|
abortOngoingRequests3
|
|
@@ -7354,7 +7496,7 @@ async function runStructuredOutputBench(model, opts) {
|
|
|
7354
7496
|
const startTime = Date.now();
|
|
7355
7497
|
try {
|
|
7356
7498
|
const result = await withTimeout(
|
|
7357
|
-
generate3(model, q.prompt, {
|
|
7499
|
+
generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
7358
7500
|
timeoutMs,
|
|
7359
7501
|
"Structured output task",
|
|
7360
7502
|
abortOngoingRequests3
|
|
@@ -7613,7 +7755,7 @@ async function runMultilingualBench(model, opts) {
|
|
|
7613
7755
|
const startTime = Date.now();
|
|
7614
7756
|
try {
|
|
7615
7757
|
const result = await withTimeout(
|
|
7616
|
-
generate3(model, q.prompt, {
|
|
7758
|
+
generate3(model, q.prompt, withBenchmarkProfile({ num_predict: 1024, think: opts?.think })),
|
|
7617
7759
|
timeoutMs,
|
|
7618
7760
|
"Multilingual task",
|
|
7619
7761
|
abortOngoingRequests3
|
|
@@ -8015,6 +8157,25 @@ function getLevel(score) {
|
|
|
8015
8157
|
if (score >= 25) return "Weak";
|
|
8016
8158
|
return "Poor";
|
|
8017
8159
|
}
|
|
8160
|
+
/**
 * Counts execution problems recorded in a category's result details.
 *
 * Each detail's `actual` text is classified by its prefix:
 *   - starts with the word "TIMEOUT"  -> timeout
 *   - starts with "ERROR:"            -> error, and additionally a crash
 *     when the text mentions a model crash
 * Crashes are therefore a subset of errors. Matching is case-insensitive.
 *
 * @param {string} name - Category name, echoed in the result.
 * @param {Iterable<{actual?: string}>} details - Per-question details.
 * @returns {{name: string, crashes: number, timeouts: number, errors: number}}
 */
function summarizeCategoryIssues(name, details) {
  const timeoutPattern = /^TIMEOUT\b/i;
  const errorPattern = /^ERROR:/i;
  const crashPattern = /model has crashed|has crashed without additional information|model crashed/i;

  let crashes = 0;
  let timeouts = 0;
  let errors = 0;

  for (const detail of details) {
    const outcome = detail.actual ?? "";
    if (timeoutPattern.test(outcome)) {
      timeouts += 1;
    } else if (errorPattern.test(outcome)) {
      errors += 1;
      if (crashPattern.test(outcome)) {
        crashes += 1;
      }
    }
  }

  return { name, crashes, timeouts, errors };
}
|
|
8018
8179
|
function printHardwareTable(hw) {
|
|
8019
8180
|
const table = new Table({
|
|
8020
8181
|
head: [chalk3.bold("Hardware"), chalk3.bold("Value")],
|
|
@@ -8133,6 +8294,18 @@ function printQualityTable(quality, timePenalties) {
|
|
|
8133
8294
|
]);
|
|
8134
8295
|
}
|
|
8135
8296
|
console.log(table.toString());
|
|
8297
|
+
const issueSummaries = categories.map((cat) => summarizeCategoryIssues(cat.name, cat.result.details)).filter((summary) => summary.errors > 0 || summary.timeouts > 0);
|
|
8298
|
+
if (issueSummaries.length > 0) {
|
|
8299
|
+
console.log(chalk3.yellow("Execution issues detected during quality benchmark:"));
|
|
8300
|
+
for (const summary of issueSummaries) {
|
|
8301
|
+
const parts = [];
|
|
8302
|
+
if (summary.crashes > 0) parts.push(`${summary.crashes} crash${summary.crashes > 1 ? "es" : ""}`);
|
|
8303
|
+
const nonCrashErrors = summary.errors - summary.crashes;
|
|
8304
|
+
if (nonCrashErrors > 0) parts.push(`${nonCrashErrors} error${nonCrashErrors > 1 ? "s" : ""}`);
|
|
8305
|
+
if (summary.timeouts > 0) parts.push(`${summary.timeouts} timeout${summary.timeouts > 1 ? "s" : ""}`);
|
|
8306
|
+
console.log(chalk3.yellow(` \u2022 ${summary.name}: ${parts.join(", ")} (scored as incorrect)`));
|
|
8307
|
+
}
|
|
8308
|
+
}
|
|
8136
8309
|
}
|
|
8137
8310
|
function printSummaryTable(results) {
|
|
8138
8311
|
const termWidth = process.stdout.columns || 80;
|
|
@@ -9008,7 +9181,7 @@ async function promptThinkingMode() {
|
|
|
9008
9181
|
}
|
|
9009
9182
|
|
|
9010
9183
|
// ../src/commands/bench.ts
|
|
9011
|
-
var BENCHMARK_SPEC_VERSION = "0.2.
|
|
9184
|
+
var BENCHMARK_SPEC_VERSION = "0.2.1";
|
|
9012
9185
|
var PROMPT_PACK_VERSION = "0.1.0";
|
|
9013
9186
|
async function benchCommand(options) {
|
|
9014
9187
|
if (options.backend !== void 0) {
|
|
@@ -9101,6 +9274,11 @@ async function benchCommand(options) {
|
|
|
9101
9274
|
if (!silent && thinkEnabled) {
|
|
9102
9275
|
infoMsg("Thinking mode enabled \u2014 models that support it will use extended reasoning.");
|
|
9103
9276
|
}
|
|
9277
|
+
if (!silent) {
|
|
9278
|
+
infoMsg(
|
|
9279
|
+
`Benchmark profile ${BENCHMARK_PROFILE_VERSION}: temperature=0, top_p=1, seed=42, context=runtime default.`
|
|
9280
|
+
);
|
|
9281
|
+
}
|
|
9104
9282
|
try {
|
|
9105
9283
|
const results = [];
|
|
9106
9284
|
const failedModels = [];
|
|
@@ -9189,7 +9367,8 @@ ${tl}${h.repeat(innerWidth)}${tr}`));
|
|
|
9189
9367
|
promptPackVersion: PROMPT_PACK_VERSION,
|
|
9190
9368
|
runtimeVersion,
|
|
9191
9369
|
runtimeBackend: getRuntimeName(),
|
|
9192
|
-
modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat()
|
|
9370
|
+
modelFormat: matchedModel?.modelFormat ?? getRuntimeModelFormat(),
|
|
9371
|
+
benchmarkProfile: buildBenchmarkProfileMetadata(thinkEnabled)
|
|
9193
9372
|
}
|
|
9194
9373
|
};
|
|
9195
9374
|
const rawLogHash = createHash3("sha256").update(JSON.stringify(partialResult)).digest("hex");
|
|
@@ -9550,7 +9729,7 @@ async function handleShareResult(args) {
|
|
|
9550
9729
|
// src/index.ts
|
|
9551
9730
|
var server = new McpServer({
|
|
9552
9731
|
name: "metrillm",
|
|
9553
|
-
version: "0.1
|
|
9732
|
+
version: "0.2.1"
|
|
9554
9733
|
});
|
|
9555
9734
|
for (const def of toolDefinitions) {
|
|
9556
9735
|
switch (def.name) {
|