llm-checker 3.5.15 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -8
- package/analyzer/compatibility.js +5 -0
- package/analyzer/performance.js +5 -4
- package/bin/cli.js +5 -39
- package/bin/enhanced_cli.js +449 -24
- package/bin/mcp-server.mjs +266 -101
- package/package.json +13 -8
- package/src/ai/multi-objective-selector.js +118 -11
- package/src/calibration/calibration-manager.js +4 -1
- package/src/data/model-database.js +489 -5
- package/src/data/registry-ingestors.js +751 -0
- package/src/data/registry-recommender.js +514 -0
- package/src/data/seed/README.md +11 -3
- package/src/data/seed/models.db +0 -0
- package/src/data/sync-manager.js +32 -18
- package/src/hardware/backends/apple-silicon.js +5 -1
- package/src/hardware/backends/cuda-detector.js +47 -19
- package/src/hardware/backends/intel-detector.js +6 -2
- package/src/hardware/backends/rocm-detector.js +6 -2
- package/src/hardware/detector.js +57 -30
- package/src/hardware/unified-detector.js +129 -25
- package/src/index.js +68 -4
- package/src/models/ai-check-selector.js +36 -5
- package/src/models/deterministic-selector.js +179 -18
- package/src/models/expanded_database.js +9 -5
- package/src/models/intelligent-selector.js +87 -1
- package/src/models/moe-assumptions.js +11 -0
- package/src/models/requirements.js +16 -11
- package/src/models/scoring-core.js +341 -0
- package/src/models/scoring-engine.js +9 -2
- package/src/ollama/capacity-planner.js +15 -2
- package/src/ollama/client.js +70 -30
- package/src/ollama/enhanced-client.js +20 -2
- package/src/ollama/manager.js +14 -2
- package/src/policy/cli-policy.js +8 -2
- package/src/policy/policy-engine.js +2 -1
- package/src/provenance/model-provenance.js +4 -1
- package/src/ui/cli-theme.js +47 -7
- package/src/ui/interactive-panel.js +162 -24
package/bin/mcp-server.mjs
CHANGED
|
@@ -20,6 +20,7 @@ import { promisify } from "util";
|
|
|
20
20
|
import { fileURLToPath } from "url";
|
|
21
21
|
import { dirname, join } from "path";
|
|
22
22
|
import { readdir, stat } from "fs/promises";
|
|
23
|
+
import { readFileSync, realpathSync } from "fs";
|
|
23
24
|
import http from "http";
|
|
24
25
|
import os from "os";
|
|
25
26
|
|
|
@@ -28,8 +29,20 @@ const __filename = fileURLToPath(import.meta.url);
|
|
|
28
29
|
const __dirname = dirname(__filename);
|
|
29
30
|
|
|
30
31
|
const CLI_PATH = join(__dirname, "enhanced_cli.js");
|
|
32
|
+
const PACKAGE_JSON_PATH = join(__dirname, "..", "package.json");
|
|
31
33
|
const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://localhost:11434";
|
|
32
34
|
|
|
35
|
+
// Read the package version dynamically so the advertised MCP server version
|
|
36
|
+
// never drifts from package.json. Falls back to "0.0.0" if unreadable.
|
|
37
|
+
/**
 * Read the `version` field from a package.json file.
 *
 * @param {string} [packagePath=PACKAGE_JSON_PATH] - Path of the JSON manifest to read.
 * @returns {string} The manifest's `version` when it is a non-empty string;
 *   otherwise "0.0.0" (file unreadable, invalid JSON, or missing/blank version).
 */
function readPackageVersion(packagePath = PACKAGE_JSON_PATH) {
  try {
    // Destructuring stays inside the try so a manifest that parses to `null`
    // is handled by the same fallback as an unreadable file.
    const { version } = JSON.parse(readFileSync(packagePath, "utf8"));
    if (typeof version === "string" && version) {
      return version;
    }
  } catch {
    // Deliberate best-effort: any read/parse failure falls through to the default.
  }
  return "0.0.0";
}
|
|
45
|
+
|
|
33
46
|
// ============================================================================
|
|
34
47
|
// HELPERS
|
|
35
48
|
// ============================================================================
|
|
@@ -50,8 +63,13 @@ async function run(args, timeout = 120000) {
|
|
|
50
63
|
});
|
|
51
64
|
return clean(stdout || stderr);
|
|
52
65
|
} catch (err) {
|
|
53
|
-
|
|
54
|
-
|
|
66
|
+
// M8: the CLI exited non-zero. Do NOT silently return captured stdout as if
|
|
67
|
+
// it succeeded — that masks failures from the caller (no error signal).
|
|
68
|
+
// Throw so the tool handler's catch surfaces isError, while preserving the
|
|
69
|
+
// captured output in the error message for diagnostics.
|
|
70
|
+
const captured = clean(err.stdout || err.stderr || "");
|
|
71
|
+
const detail = captured ? `${err.message}\n${captured}` : err.message;
|
|
72
|
+
throw new Error(`llm-checker failed: ${detail}`);
|
|
55
73
|
}
|
|
56
74
|
}
|
|
57
75
|
|
|
@@ -109,6 +127,100 @@ function tryParseJSON(text) {
|
|
|
109
127
|
}
|
|
110
128
|
}
|
|
111
129
|
|
|
130
|
+
// ----------------------------------------------------------------------------
|
|
131
|
+
// Pure helpers (exported for unit testing)
|
|
132
|
+
// ----------------------------------------------------------------------------
|
|
133
|
+
|
|
134
|
+
// M2: Compute generation speed in tokens/sec from an Ollama /api/generate
|
|
135
|
+
// response. Ollama reports durations in nanoseconds. Only compute a finite
|
|
136
|
+
// value when BOTH eval_count and eval_duration are positive; otherwise return
|
|
137
|
+
// null so callers can render "n/a" instead of dividing by a bogus 1ns fallback
|
|
138
|
+
// (which produced absurd numbers like billions of tok/s).
|
|
139
|
+
/**
 * Convert an eval token count and an eval duration in nanoseconds into a
 * tokens-per-second rate.
 *
 * @param {*} evalCount - Number of generated tokens (coerced with Number()).
 * @param {*} evalDurationNs - Generation time in nanoseconds (coerced with Number()).
 * @returns {number|null} The rate, or null unless both inputs are finite and
 *   strictly positive.
 */
function tokensPerSecond(evalCount, evalDurationNs) {
  const [tokens, nanos] = [evalCount, evalDurationNs].map((raw) => Number(raw));
  const measurable =
    Number.isFinite(tokens) && Number.isFinite(nanos) && tokens > 0 && nanos > 0;
  // Scale tokens-per-nanosecond up to tokens-per-second.
  return measurable ? (tokens / nanos) * 1e9 : null;
}
|
|
146
|
+
|
|
147
|
+
// Format a tokens/sec value (possibly null) for display. Renders "n/a" when
|
|
148
|
+
// the value is unavailable, otherwise a fixed-precision number.
|
|
149
|
+
/**
 * Render a tokens/sec figure for display.
 *
 * @param {number|null|undefined} value - Rate to format.
 * @returns {string} The value with one decimal place, or "n/a" when it is not
 *   a finite number.
 */
function formatTokPerSec(value) {
  // Number.isFinite does not coerce, so null, undefined, strings, NaN and
  // +/-Infinity are all rejected by this single check.
  if (!Number.isFinite(value)) {
    return "n/a";
  }
  return value.toFixed(1);
}
|
|
153
|
+
|
|
154
|
+
// M1: Map the structured `hw-detect --json` object to the small set of facts
|
|
155
|
+
// the optimizer/cleanup tools need, sourced from typed fields instead of
|
|
156
|
+
// regex-scraping human-readable CLI text.
|
|
157
|
+
// - tier: summary.hardwareTier, upper-cased (e.g. "MEDIUM_HIGH"); "UNKNOWN" if absent.
|
|
158
|
+
// - vramGB: summary.totalVRAM (unified/dedicated GPU memory budget).
|
|
159
|
+
// - maxGB: largest model size that fits. Mirrors the detector's
|
|
160
|
+
// getMaxModelSize(): effectiveMemory - 2GB headroom. Falls back to
|
|
161
|
+
// totalVRAM/systemRAM derivations, then a sane 15GB default.
|
|
162
|
+
/**
 * Map a parsed `hw-detect --json` object to the hardware facts used by the
 * optimizer/cleanup tools.
 *
 * @param {*} json - Parsed JSON; anything without an object `summary` is
 *   treated as having no fields.
 * @returns {{tier: string, vramGB: (number|null), maxGB: number}}
 *   - tier: `summary.hardwareTier` trimmed and upper-cased, or "UNKNOWN".
 *   - vramGB: `summary.totalVRAM` as a number, or null when absent/non-numeric.
 *   - maxGB: first positive value among effectiveMemory, totalVRAM and
 *     systemRAM, minus 2 (headroom), rounded and clamped at 0; 15 when none
 *     of them is a positive number.
 */
function mapHardwareJson(json) {
  const summary = (json && typeof json === "object" && json.summary) || {};

  // Only real numbers and non-blank numeric strings count as measurements.
  // A bare Number() would coerce null, "", false and [] to 0 and make a
  // missing field look like a valid "0 GB" reading.
  const toMeasurement = (value) => {
    if (typeof value === "number") {
      return Number.isFinite(value) ? value : null;
    }
    if (typeof value === "string" && value.trim() !== "") {
      const parsed = Number(value);
      return Number.isFinite(parsed) ? parsed : null;
    }
    return null;
  };

  const rawTier = summary.hardwareTier;
  const tier = typeof rawTier === "string" && rawTier.trim()
    ? rawTier.trim().toUpperCase()
    : "UNKNOWN";

  // 0 is kept as a value here (a numeric zero was reported); only
  // missing/invalid input maps to null.
  const vramGB = toMeasurement(summary.totalVRAM);

  // Every candidate must be strictly positive to be used, so an absent or
  // zero effectiveMemory falls through to the next source instead of
  // producing maxGB = 0.
  const memoryBudget = [summary.effectiveMemory, summary.totalVRAM, summary.systemRAM]
    .map(toMeasurement)
    .find((candidate) => candidate !== null && candidate > 0);

  const maxGB = memoryBudget === undefined
    ? 15 // fallback when the JSON has no usable memory field
    : Math.max(0, Math.round(memoryBudget - 2));

  return { tier, vramGB, maxGB };
}
|
|
185
|
+
|
|
186
|
+
// M7: Map of framework/marker filenames AND directories to their labels.
|
|
187
|
+
// Some markers are directories (e.g. ".github" -> "GitHub Actions") whose
|
|
188
|
+
// names start with a dot; the directory scan skips dotfiles, so these must be
|
|
189
|
+
// detected explicitly before the dotfile skip.
|
|
190
|
+
/**
 * Entry names (files and directories) that identify a framework or tool,
 * mapped to the label reported for it.
 *
 * Frozen because this table is exported and shared: an importer must not be
 * able to add, remove or relabel markers for the rest of the process.
 */
const FRAMEWORK_MARKERS = Object.freeze({
  "package.json": "Node.js",
  "Cargo.toml": "Rust/Cargo",
  "go.mod": "Go Modules",
  "requirements.txt": "Python/pip",
  "pyproject.toml": "Python",
  "Gemfile": "Ruby/Bundler",
  "pom.xml": "Java/Maven",
  "build.gradle": "Java/Gradle",
  "composer.json": "PHP/Composer",
  "Anchor.toml": "Solana/Anchor",
  "hardhat.config.js": "Ethereum/Hardhat",
  "foundry.toml": "Ethereum/Foundry",
  "CMakeLists.txt": "CMake",
  "Makefile": "Make",
  "Dockerfile": "Docker",
  "docker-compose.yml": "Docker Compose",
  ".github": "GitHub Actions", // dot-directory marker
  "next.config.js": "Next.js",
  "next.config.mjs": "Next.js",
  "vite.config.ts": "Vite",
  "tailwind.config.js": "Tailwind CSS",
  "tsconfig.json": "TypeScript",
});

/**
 * Look up the framework label for a directory/file entry name.
 *
 * @param {string} name - Entry name, matched exactly (case-sensitive).
 * @returns {string|null} The label, or null when the name is not a marker.
 */
function detectFrameworkMarker(name) {
  // Own-property check so inherited keys such as "toString" or "constructor"
  // are never reported as frameworks.
  if (!Object.prototype.hasOwnProperty.call(FRAMEWORK_MARKERS, name)) {
    return null;
  }
  return FRAMEWORK_MARKERS[name];
}
|
|
223
|
+
|
|
112
224
|
function formatExportBlock(envObject) {
|
|
113
225
|
if (!envObject || typeof envObject !== "object") return "";
|
|
114
226
|
const entries = Object.entries(envObject).filter(([, value]) => value !== undefined && value !== null);
|
|
@@ -185,9 +297,11 @@ const ALLOWED_CLI_COMMANDS = new Set([
|
|
|
185
297
|
// MCP SERVER
|
|
186
298
|
// ============================================================================
|
|
187
299
|
|
|
300
|
+
const SERVER_VERSION = readPackageVersion();
|
|
301
|
+
|
|
188
302
|
const server = new McpServer({
|
|
189
303
|
name: "llm-checker",
|
|
190
|
-
version:
|
|
304
|
+
version: SERVER_VERSION,
|
|
191
305
|
});
|
|
192
306
|
|
|
193
307
|
// ============================================================================
|
|
@@ -733,9 +847,7 @@ server.tool(
|
|
|
733
847
|
async ({ model, prompt }) => {
|
|
734
848
|
try {
|
|
735
849
|
const data = await ollamaAPI("/api/generate", { model, prompt, stream: false }, 300000);
|
|
736
|
-
const tokPerSec = data.eval_count
|
|
737
|
-
? ((data.eval_count / data.eval_duration) * 1e9).toFixed(1)
|
|
738
|
-
: "?";
|
|
850
|
+
const tokPerSec = formatTokPerSec(tokensPerSecond(data.eval_count, data.eval_duration));
|
|
739
851
|
const result = [
|
|
740
852
|
`MODEL: ${model}`,
|
|
741
853
|
`RESPONSE: ${data.response}`,
|
|
@@ -776,7 +888,12 @@ server.tool(
|
|
|
776
888
|
{},
|
|
777
889
|
async () => {
|
|
778
890
|
try {
|
|
779
|
-
|
|
891
|
+
// M1: source hardware facts from structured `hw-detect --json` instead of
|
|
892
|
+
// regex-scraping human-readable CLI text.
|
|
893
|
+
const hwJsonText = await run(["hw-detect", "--json"]);
|
|
894
|
+
const hwJson = tryParseJSON(hwJsonText);
|
|
895
|
+
const { tier, vramGB } = mapHardwareJson(hwJson || {});
|
|
896
|
+
|
|
780
897
|
const totalMem = os.totalmem();
|
|
781
898
|
const freeMem = os.freemem();
|
|
782
899
|
const cpuCount = os.cpus().length;
|
|
@@ -784,10 +901,6 @@ server.tool(
|
|
|
784
901
|
const freeGB = Math.round(freeMem / 1e9);
|
|
785
902
|
const platform = os.platform();
|
|
786
903
|
|
|
787
|
-
// Parse tier from hw-detect output
|
|
788
|
-
const tierMatch = hwResult.match(/Tier:\s*(\w[\w\s]*)/i);
|
|
789
|
-
const tier = tierMatch ? tierMatch[1].trim().toUpperCase() : "UNKNOWN";
|
|
790
|
-
|
|
791
904
|
// Determine GPU layers
|
|
792
905
|
const isApple = platform === "darwin";
|
|
793
906
|
let numGPU = 999; // Apple Silicon = all layers on GPU
|
|
@@ -845,7 +958,7 @@ server.tool(
|
|
|
845
958
|
`OLLAMA OPTIMIZATION FOR YOUR SYSTEM`,
|
|
846
959
|
`====================================`,
|
|
847
960
|
`Hardware: ${cpuCount} cores, ${totalGB}GB total RAM, ${freeGB}GB free`,
|
|
848
|
-
`Platform: ${platform} | Tier: ${tier}`,
|
|
961
|
+
`Platform: ${platform} | Tier: ${tier}${vramGB !== null ? ` | VRAM: ${vramGB}GB` : ""}`,
|
|
849
962
|
``,
|
|
850
963
|
`RECOMMENDED ENVIRONMENT VARIABLES:`,
|
|
851
964
|
`----------------------------------`,
|
|
@@ -883,42 +996,43 @@ server.tool(
|
|
|
883
996
|
|
|
884
997
|
server.tool(
|
|
885
998
|
"benchmark",
|
|
886
|
-
"Benchmark a local Ollama model: measure tokens/sec, load time, and generation speed
|
|
999
|
+
"Benchmark a local Ollama model: measure tokens/sec, load time, and generation speed. With no custom prompt it runs the SAME standardized prompt 3 times (true warm-up iterations); the first (cold) run is excluded from the tok/s average and load time is reported from that first run. A custom prompt is run once.",
|
|
887
1000
|
{
|
|
888
1001
|
model: z.string().describe("Model name to benchmark (e.g. 'qwen2.5-coder:7b')"),
|
|
889
1002
|
prompt: z
|
|
890
1003
|
.string()
|
|
891
1004
|
.optional()
|
|
892
|
-
.describe("Custom benchmark prompt (default: standardized coding
|
|
1005
|
+
.describe("Custom benchmark prompt (default: standardized coding prompt run 3x)"),
|
|
893
1006
|
},
|
|
894
1007
|
async ({ model, prompt }) => {
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
1008
|
+
// M4: when no custom prompt is given, run the SAME prompt N times so these
|
|
1009
|
+
// are real iterations of one workload (warm-up effects, not different
|
|
1010
|
+
// tasks). The first run is cold (includes model load) and is excluded from
|
|
1011
|
+
// the tok/s average; its load time is reported separately.
|
|
1012
|
+
const STANDARD_PROMPT =
|
|
1013
|
+
"Write a Python function to find the nth Fibonacci number using memoization. Include type hints.";
|
|
1014
|
+
const usingCustom = Boolean(prompt);
|
|
1015
|
+
const benchPrompt = usingCustom ? prompt : STANDARD_PROMPT;
|
|
1016
|
+
const iterations = usingCustom ? 1 : 3;
|
|
902
1017
|
|
|
903
1018
|
try {
|
|
904
1019
|
const results = [];
|
|
905
|
-
for (let i = 0; i <
|
|
1020
|
+
for (let i = 0; i < iterations; i++) {
|
|
906
1021
|
const data = await ollamaAPI(
|
|
907
1022
|
"/api/generate",
|
|
908
|
-
{ model, prompt:
|
|
1023
|
+
{ model, prompt: benchPrompt, stream: false },
|
|
909
1024
|
300000
|
|
910
1025
|
);
|
|
911
1026
|
|
|
912
1027
|
const evalTokens = data.eval_count || 0;
|
|
913
|
-
|
|
914
|
-
const tokPerSec = (
|
|
1028
|
+
// M2: only a finite value when both eval_count and eval_duration are > 0.
|
|
1029
|
+
const tokPerSec = tokensPerSecond(data.eval_count, data.eval_duration);
|
|
915
1030
|
const totalSec = data.total_duration ? data.total_duration / 1e9 : 0;
|
|
916
1031
|
const loadMs = data.load_duration ? data.load_duration / 1e6 : 0;
|
|
917
1032
|
const promptTokens = data.prompt_eval_count || 0;
|
|
918
1033
|
const promptMs = data.prompt_eval_duration ? data.prompt_eval_duration / 1e6 : 0;
|
|
919
1034
|
|
|
920
1035
|
results.push({
|
|
921
|
-
prompt: benchPrompts[i].slice(0, 60) + (benchPrompts[i].length > 60 ? "..." : ""),
|
|
922
1036
|
evalTokens,
|
|
923
1037
|
tokPerSec,
|
|
924
1038
|
totalSec,
|
|
@@ -929,30 +1043,40 @@ server.tool(
|
|
|
929
1043
|
});
|
|
930
1044
|
}
|
|
931
1045
|
|
|
932
|
-
//
|
|
933
|
-
|
|
934
|
-
const
|
|
935
|
-
const
|
|
1046
|
+
// tok/s average over WARM runs only (exclude the cold first run when we
|
|
1047
|
+
// have more than one iteration). Drop unavailable (null) measurements.
|
|
1048
|
+
const warmRuns = iterations > 1 ? results.slice(1) : results;
|
|
1049
|
+
const warmTokRates = warmRuns.map((r) => r.tokPerSec).filter((v) => v !== null);
|
|
1050
|
+
const avgTokPerSec = warmTokRates.length > 0
|
|
1051
|
+
? warmTokRates.reduce((s, v) => s + v, 0) / warmTokRates.length
|
|
1052
|
+
: null;
|
|
1053
|
+
|
|
1054
|
+
const warmTotals = warmRuns.map((r) => r.totalSec);
|
|
1055
|
+
const avgTotalSec = warmTotals.reduce((s, v) => s + v, 0) / warmTotals.length;
|
|
1056
|
+
// Load time is a cold-start cost: report it from the first run only.
|
|
1057
|
+
const coldLoadMs = results[0]?.loadMs ?? 0;
|
|
936
1058
|
const totalTokens = results.reduce((s, r) => s + r.evalTokens, 0);
|
|
937
1059
|
|
|
1060
|
+
const promptPreview = benchPrompt.slice(0, 60) + (benchPrompt.length > 60 ? "..." : "");
|
|
1061
|
+
|
|
938
1062
|
const output = [
|
|
939
1063
|
`BENCHMARK: ${model}`,
|
|
940
1064
|
`${"=".repeat(60)}`,
|
|
941
|
-
`
|
|
1065
|
+
`Prompt: "${promptPreview}"`,
|
|
1066
|
+
`Iterations: ${results.length}${iterations > 1 ? " (run 1 = cold, excluded from speed average)" : ""}`,
|
|
942
1067
|
``,
|
|
943
1068
|
...results.map((r, i) => [
|
|
944
|
-
`--- Run ${i + 1} ---`,
|
|
945
|
-
`
|
|
946
|
-
`Generated: ${r.evalTokens} tokens at ${r.tokPerSec.toFixed(1)} tok/s`,
|
|
1069
|
+
`--- Run ${i + 1}${iterations > 1 && i === 0 ? " (cold)" : ""} ---`,
|
|
1070
|
+
`Generated: ${r.evalTokens} tokens at ${formatTokPerSec(r.tokPerSec)} tok/s`,
|
|
947
1071
|
`Total: ${r.totalSec.toFixed(2)}s | Load: ${r.loadMs.toFixed(0)}ms | Prompt eval: ${r.promptMs.toFixed(0)}ms (${r.promptTokens} tokens)`,
|
|
948
1072
|
`Response: "${r.responsePreview}..."`,
|
|
949
1073
|
``,
|
|
950
1074
|
]).flat(),
|
|
951
1075
|
`${"=".repeat(60)}`,
|
|
952
|
-
`AVERAGES:`,
|
|
953
|
-
` Generation speed: ${avgTokPerSec
|
|
1076
|
+
iterations > 1 ? `WARM AVERAGES (excludes cold run 1):` : `RESULTS:`,
|
|
1077
|
+
` Generation speed: ${formatTokPerSec(avgTokPerSec)} tok/s`,
|
|
954
1078
|
` Total time: ${avgTotalSec.toFixed(2)}s`,
|
|
955
|
-
`
|
|
1079
|
+
` Cold load time: ${coldLoadMs.toFixed(0)}ms`,
|
|
956
1080
|
` Total tokens generated: ${totalTokens}`,
|
|
957
1081
|
].join("\n");
|
|
958
1082
|
|
|
@@ -969,7 +1093,7 @@ server.tool(
|
|
|
969
1093
|
|
|
970
1094
|
server.tool(
|
|
971
1095
|
"compare_models",
|
|
972
|
-
"Compare two local Ollama models head-to-head
|
|
1096
|
+
"Compare two local Ollama models head-to-head on the same prompt. Models are run SEQUENTIALLY (model A fully, then model B) so each tok/s measurement is uncontended — running them in parallel would make them fight over GPU/RAM and invalidate the speed comparison.",
|
|
973
1097
|
{
|
|
974
1098
|
model_a: z.string().describe("First model (e.g. 'qwen2.5-coder:7b')"),
|
|
975
1099
|
model_b: z.string().describe("Second model (e.g. 'codellama:7b')"),
|
|
@@ -982,16 +1106,15 @@ server.tool(
|
|
|
982
1106
|
const testPrompt = prompt || "Write a Python function that checks if a string is a valid IPv4 address. Include edge cases.";
|
|
983
1107
|
|
|
984
1108
|
try {
|
|
985
|
-
//
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
]);
|
|
1109
|
+
// M3: run sequentially so the two models do not contend for GPU/RAM.
|
|
1110
|
+
// Each measurement is taken while the other model is not executing.
|
|
1111
|
+
const resultA = await ollamaAPI("/api/generate", { model: model_a, prompt: testPrompt, stream: false }, 300000);
|
|
1112
|
+
const resultB = await ollamaAPI("/api/generate", { model: model_b, prompt: testPrompt, stream: false }, 300000);
|
|
990
1113
|
|
|
991
1114
|
function metrics(data) {
|
|
992
1115
|
const evalTokens = data.eval_count || 0;
|
|
993
|
-
|
|
994
|
-
const tokPerSec = (
|
|
1116
|
+
// M2: null when eval_count/eval_duration are not both positive.
|
|
1117
|
+
const tokPerSec = tokensPerSecond(data.eval_count, data.eval_duration);
|
|
995
1118
|
const totalSec = data.total_duration ? data.total_duration / 1e9 : 0;
|
|
996
1119
|
const loadSec = data.load_duration ? data.load_duration / 1e9 : 0;
|
|
997
1120
|
return { evalTokens, tokPerSec, totalSec, loadSec, response: data.response || "" };
|
|
@@ -1000,17 +1123,23 @@ server.tool(
|
|
|
1000
1123
|
const a = metrics(resultA);
|
|
1001
1124
|
const b = metrics(resultB);
|
|
1002
1125
|
|
|
1003
|
-
|
|
1004
|
-
|
|
1126
|
+
// Only declare a speed winner when both rates are known.
|
|
1127
|
+
let speedWinner;
|
|
1128
|
+
if (a.tokPerSec === null && b.tokPerSec === null) speedWinner = "n/a (no timing data)";
|
|
1129
|
+
else if (a.tokPerSec === null) speedWinner = model_b;
|
|
1130
|
+
else if (b.tokPerSec === null) speedWinner = model_a;
|
|
1131
|
+
else speedWinner = a.tokPerSec >= b.tokPerSec ? model_a : model_b;
|
|
1132
|
+
|
|
1133
|
+
const verbosityWinner = a.evalTokens >= b.evalTokens ? model_a : model_b;
|
|
1005
1134
|
|
|
1006
1135
|
const output = [
|
|
1007
|
-
`HEAD-TO-HEAD COMPARISON`,
|
|
1136
|
+
`HEAD-TO-HEAD COMPARISON (sequential runs, uncontended)`,
|
|
1008
1137
|
`${"=".repeat(70)}`,
|
|
1009
1138
|
`Prompt: "${testPrompt.slice(0, 80)}${testPrompt.length > 80 ? "..." : ""}"`,
|
|
1010
1139
|
``,
|
|
1011
1140
|
`METRIC ${model_a.padEnd(25)} ${model_b.padEnd(25)}`,
|
|
1012
1141
|
`-`.repeat(70),
|
|
1013
|
-
`Speed (tok/s) ${a.tokPerSec
|
|
1142
|
+
`Speed (tok/s) ${formatTokPerSec(a.tokPerSec).padEnd(25)} ${formatTokPerSec(b.tokPerSec).padEnd(25)}`,
|
|
1014
1143
|
`Tokens generated ${String(a.evalTokens).padEnd(25)} ${String(b.evalTokens).padEnd(25)}`,
|
|
1015
1144
|
`Total time ${(a.totalSec.toFixed(2) + "s").padEnd(25)} ${(b.totalSec.toFixed(2) + "s").padEnd(25)}`,
|
|
1016
1145
|
`Load time ${(a.loadSec.toFixed(2) + "s").padEnd(25)} ${(b.loadSec.toFixed(2) + "s").padEnd(25)}`,
|
|
@@ -1042,9 +1171,9 @@ server.tool(
|
|
|
1042
1171
|
{},
|
|
1043
1172
|
async () => {
|
|
1044
1173
|
try {
|
|
1045
|
-
const [tagsData,
|
|
1174
|
+
const [tagsData, hwJsonText] = await Promise.all([
|
|
1046
1175
|
ollamaAPI("/api/tags", null, 10000),
|
|
1047
|
-
run(["hw-detect"]),
|
|
1176
|
+
run(["hw-detect", "--json"]),
|
|
1048
1177
|
]);
|
|
1049
1178
|
|
|
1050
1179
|
if (!tagsData.models || tagsData.models.length === 0) {
|
|
@@ -1054,11 +1183,10 @@ server.tool(
|
|
|
1054
1183
|
const models = tagsData.models;
|
|
1055
1184
|
const totalSize = models.reduce((s, m) => s + (m.size || 0), 0);
|
|
1056
1185
|
|
|
1057
|
-
//
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
const
|
|
1061
|
-
const maxGB = maxSizeMatch ? parseInt(maxSizeMatch[1]) : 15;
|
|
1186
|
+
// M1: source hardware tier + max model size from structured JSON instead
|
|
1187
|
+
// of regex-scraping CLI text. mapHardwareJson() falls back to a sane 15GB
|
|
1188
|
+
// default if the JSON could not be parsed (tryParseJSON -> null).
|
|
1189
|
+
const { maxGB } = mapHardwareJson(tryParseJSON(hwJsonText) || {});
|
|
1062
1190
|
|
|
1063
1191
|
// Analyze each model
|
|
1064
1192
|
const analysis = models.map((m) => {
|
|
@@ -1172,46 +1300,23 @@ server.tool(
|
|
|
1172
1300
|
".sh": "Shell", ".bash": "Shell", ".zsh": "Shell",
|
|
1173
1301
|
};
|
|
1174
1302
|
|
|
1175
|
-
const frameworkFiles = {
|
|
1176
|
-
"package.json": "Node.js",
|
|
1177
|
-
"Cargo.toml": "Rust/Cargo",
|
|
1178
|
-
"go.mod": "Go Modules",
|
|
1179
|
-
"requirements.txt": "Python/pip",
|
|
1180
|
-
"pyproject.toml": "Python",
|
|
1181
|
-
"Gemfile": "Ruby/Bundler",
|
|
1182
|
-
"pom.xml": "Java/Maven",
|
|
1183
|
-
"build.gradle": "Java/Gradle",
|
|
1184
|
-
"composer.json": "PHP/Composer",
|
|
1185
|
-
"Anchor.toml": "Solana/Anchor",
|
|
1186
|
-
"hardhat.config.js": "Ethereum/Hardhat",
|
|
1187
|
-
"foundry.toml": "Ethereum/Foundry",
|
|
1188
|
-
"CMakeLists.txt": "CMake",
|
|
1189
|
-
"Makefile": "Make",
|
|
1190
|
-
"Dockerfile": "Docker",
|
|
1191
|
-
"docker-compose.yml": "Docker Compose",
|
|
1192
|
-
".github": "GitHub Actions",
|
|
1193
|
-
"next.config.js": "Next.js",
|
|
1194
|
-
"next.config.mjs": "Next.js",
|
|
1195
|
-
"vite.config.ts": "Vite",
|
|
1196
|
-
"tailwind.config.js": "Tailwind CSS",
|
|
1197
|
-
"tsconfig.json": "TypeScript",
|
|
1198
|
-
};
|
|
1199
|
-
|
|
1200
1303
|
async function scanDir(dir, depth = 0) {
|
|
1201
1304
|
if (depth > 4) return; // Max depth
|
|
1202
1305
|
try {
|
|
1203
1306
|
const entries = await readdir(dir, { withFileTypes: true });
|
|
1204
1307
|
for (const entry of entries) {
|
|
1308
|
+
// M7: detect known framework markers (incl. dot-directories like
|
|
1309
|
+
// ".github" -> "GitHub Actions") BEFORE skipping dotfiles, otherwise
|
|
1310
|
+
// the dotfile skip below means ".github" can never be matched.
|
|
1311
|
+
const marker = detectFrameworkMarker(entry.name);
|
|
1312
|
+
if (marker) frameworks.add(marker);
|
|
1313
|
+
|
|
1205
1314
|
if (entry.name.startsWith(".") || entry.name === "node_modules" || entry.name === "target" || entry.name === "__pycache__" || entry.name === "dist" || entry.name === "build" || entry.name === "vendor") continue;
|
|
1206
1315
|
|
|
1207
1316
|
const fullPath = join(dir, entry.name);
|
|
1208
1317
|
if (entry.isDirectory()) {
|
|
1209
|
-
if (frameworkFiles[entry.name]) frameworks.add(frameworkFiles[entry.name]);
|
|
1210
1318
|
await scanDir(fullPath, depth + 1);
|
|
1211
1319
|
} else {
|
|
1212
|
-
// Check framework files
|
|
1213
|
-
if (frameworkFiles[entry.name]) frameworks.add(frameworkFiles[entry.name]);
|
|
1214
|
-
|
|
1215
1320
|
// Count by extension
|
|
1216
1321
|
const ext = entry.name.includes(".") ? "." + entry.name.split(".").pop().toLowerCase() : "";
|
|
1217
1322
|
if (extMap[ext]) {
|
|
@@ -1302,8 +1407,14 @@ server.tool(
|
|
|
1302
1407
|
const cpus = os.cpus();
|
|
1303
1408
|
const loadAvg = os.loadavg();
|
|
1304
1409
|
|
|
1305
|
-
// CPU
|
|
1306
|
-
|
|
1410
|
+
// M5: os.loadavg() is always [0,0,0] on Windows, so deriving CPU% from it
|
|
1411
|
+
// reports a misleading 0%. Detect that case (Windows, or an all-zero
|
|
1412
|
+
// loadavg) and show "n/a" instead.
|
|
1413
|
+
const loadAvgUnavailable =
|
|
1414
|
+
os.platform() === "win32" || (loadAvg[0] === 0 && loadAvg[1] === 0 && loadAvg[2] === 0);
|
|
1415
|
+
const cpuPercentLabel = loadAvgUnavailable
|
|
1416
|
+
? "n/a (load average not reported on this platform)"
|
|
1417
|
+
: `${((loadAvg[0] / cpus.length) * 100).toFixed(1)}% (load: ${loadAvg[0].toFixed(2)})`;
|
|
1307
1418
|
|
|
1308
1419
|
// Installed models total size
|
|
1309
1420
|
const installedModels = tagsData.models || [];
|
|
@@ -1317,40 +1428,55 @@ server.tool(
|
|
|
1317
1428
|
`${"=".repeat(60)}`,
|
|
1318
1429
|
``,
|
|
1319
1430
|
`SYSTEM RESOURCES:`,
|
|
1320
|
-
` RAM: ${formatBytes(usedMem)} / ${formatBytes(totalMem)} (${memPercent}% used)`,
|
|
1431
|
+
` System RAM: ${formatBytes(usedMem)} / ${formatBytes(totalMem)} (${memPercent}% used)`,
|
|
1321
1432
|
` Free: ${formatBytes(freeMem)}`,
|
|
1322
|
-
` CPU: ${
|
|
1433
|
+
` CPU: ${cpuPercentLabel} (${cpus.length} cores)`,
|
|
1323
1434
|
``,
|
|
1324
1435
|
`OLLAMA STATUS:`,
|
|
1325
1436
|
` Installed models: ${installedModels.length} (${formatBytes(totalModelSize)} on disk)`,
|
|
1326
1437
|
` Running models: ${runningModels.length}`,
|
|
1327
1438
|
];
|
|
1328
1439
|
|
|
1440
|
+
// M5: surface each running model's VRAM residency from /api/ps.
|
|
1441
|
+
let anyOnGpu = false;
|
|
1329
1442
|
if (runningModels.length > 0) {
|
|
1330
1443
|
lines.push(``, ` LOADED IN MEMORY:`);
|
|
1331
1444
|
for (const m of runningModels) {
|
|
1332
|
-
const
|
|
1333
|
-
const
|
|
1445
|
+
const sizeVram = Number(m.size_vram) || 0;
|
|
1446
|
+
const totalLoaded = Number(m.size) || 0;
|
|
1447
|
+
if (sizeVram > 0) anyOnGpu = true;
|
|
1448
|
+
const vram = sizeVram > 0 ? formatBytes(sizeVram) : "0 B";
|
|
1449
|
+
const total = totalLoaded > 0 ? formatBytes(totalLoaded) : "?";
|
|
1450
|
+
// Portion resident in system RAM (CPU offload) = total - VRAM.
|
|
1451
|
+
const ramPortion = totalLoaded > sizeVram ? formatBytes(totalLoaded - sizeVram) : "0 B";
|
|
1334
1452
|
const expires = m.expires_at ? new Date(m.expires_at).toLocaleTimeString() : "?";
|
|
1335
|
-
lines.push(
|
|
1453
|
+
lines.push(
|
|
1454
|
+
` ${m.name.padEnd(25)} total ${total.padEnd(10)} VRAM ${vram.padEnd(10)} sysRAM ${ramPortion.padEnd(10)} Expires: ${expires}`
|
|
1455
|
+
);
|
|
1336
1456
|
}
|
|
1337
1457
|
} else {
|
|
1338
1458
|
lines.push(``, ` No models currently loaded in memory.`);
|
|
1339
1459
|
}
|
|
1340
1460
|
|
|
1341
|
-
//
|
|
1461
|
+
// M5: the system-RAM free figure is NOT the constraint on GPU boxes —
|
|
1462
|
+
// model residency is bounded by VRAM there. Clarify that this headroom
|
|
1463
|
+
// note reflects system RAM, and call out that VRAM is the real limit when
|
|
1464
|
+
// models are running on the GPU.
|
|
1342
1465
|
const freeGB = freeMem / 1e9;
|
|
1343
1466
|
lines.push(
|
|
1344
1467
|
``,
|
|
1345
|
-
`MEMORY HEADROOM:`,
|
|
1346
|
-
`
|
|
1468
|
+
`MEMORY HEADROOM (system RAM):`,
|
|
1469
|
+
` Free system RAM: ~${freeGB.toFixed(1)}GB`,
|
|
1347
1470
|
freeGB > 12
|
|
1348
|
-
? ` Status: PLENTY — can load 14B+ models comfortably`
|
|
1471
|
+
? ` Status: PLENTY — can load 14B+ models comfortably (system-RAM view)`
|
|
1349
1472
|
: freeGB > 6
|
|
1350
|
-
? ` Status: OK — can load 7B models, 14B might be tight`
|
|
1473
|
+
? ` Status: OK — can load 7B models, 14B might be tight (system-RAM view)`
|
|
1351
1474
|
: freeGB > 3
|
|
1352
|
-
? ` Status: LOW — stick to 3B-7B models`
|
|
1353
|
-
: ` Status: CRITICAL — close other apps before running models
|
|
1475
|
+
? ` Status: LOW — stick to 3B-7B models (system-RAM view)`
|
|
1476
|
+
: ` Status: CRITICAL — close other apps before running models (system-RAM view)`,
|
|
1477
|
+
anyOnGpu
|
|
1478
|
+
? ` NOTE: models are loaded into VRAM on this box — GPU VRAM (see per-model VRAM above), not system RAM, is the real loading constraint.`
|
|
1479
|
+
: ` NOTE: on dedicated-GPU systems, GPU VRAM (not system RAM) is the real constraint for loading models.`
|
|
1354
1480
|
);
|
|
1355
1481
|
|
|
1356
1482
|
return { content: [{ type: "text", text: lines.join("\n") }] };
|
|
@@ -1364,5 +1490,44 @@ server.tool(
|
|
|
1364
1490
|
// START
|
|
1365
1491
|
// ============================================================================
|
|
1366
1492
|
|
|
1367
|
-
|
|
1368
|
-
|
|
1493
|
+
// Connect the stdio transport and start serving. Guarded behind main() so that
|
|
1494
|
+
// importing this module (e.g. from tests) does NOT start the server — only
|
|
1495
|
+
// running the file directly does.
|
|
1496
|
+
/**
 * Attach a stdio transport to the MCP server and wait for the connection to
 * be established.
 *
 * @returns {Promise<void>} Resolves once `server.connect` resolves.
 */
async function main() {
  await server.connect(new StdioServerTransport());
}
|
|
1500
|
+
|
|
1501
|
+
// Detect whether this module is the process entry point. When invoked as
|
|
1502
|
+
// `node bin/mcp-server.mjs`, process.argv[1] resolves to this file's path; when
|
|
1503
|
+
// merely imported (e.g. from a test), it points at the importer instead, so the
|
|
1504
|
+
// server is not started. fileURLToPath(import.meta.url) gives this file's
|
|
1505
|
+
// absolute path; argv[1] is the absolute path Node was launched with. We also
|
|
1506
|
+
// resolve argv[1] through fileURLToPath when it is a file:// URL.
|
|
1507
|
+
/**
 * Report whether this module is the script Node was launched with, as opposed
 * to being imported by another module.
 *
 * Both paths are canonicalized with realpathSync before comparing.
 * process.argv[1] keeps the path as invoked, so a launch through a symlink
 * (for example an npm `.bin` link) carries the link path, while
 * import.meta.url refers to the resolved file. A raw string comparison would
 * then report "not the entry point" and the server would never start.
 *
 * @returns {boolean} true when process.argv[1] resolves to this file; false
 *   when argv[1] is missing, points elsewhere, or cannot be resolved.
 */
function runningAsEntry() {
  const entry = process.argv[1];
  if (!entry) return false;
  try {
    const thisPath = realpathSync(fileURLToPath(import.meta.url));
    // argv[1] is normally a filesystem path; a file:// URL is accepted too.
    const entryPath = realpathSync(
      entry.startsWith("file://") ? fileURLToPath(entry) : entry
    );
    return entryPath === thisPath;
  } catch {
    // Unresolvable entry (nonexistent path, malformed URL): not this module.
    return false;
  }
}
|
|
1518
|
+
|
|
1519
|
+
if (runningAsEntry()) {
|
|
1520
|
+
await main();
|
|
1521
|
+
}
|
|
1522
|
+
|
|
1523
|
+
// Exported for unit testing. Importing this module must NOT start the server
|
|
1524
|
+
// (see runningAsEntry guard above).
|
|
1525
|
+
export {
|
|
1526
|
+
SERVER_VERSION,
|
|
1527
|
+
readPackageVersion,
|
|
1528
|
+
tokensPerSecond,
|
|
1529
|
+
formatTokPerSec,
|
|
1530
|
+
mapHardwareJson,
|
|
1531
|
+
detectFrameworkMarker,
|
|
1532
|
+
FRAMEWORK_MARKERS,
|
|
1533
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "llm-checker",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.7.0",
|
|
4
4
|
"description": "Intelligent CLI tool with AI-powered model selection that analyzes your hardware and recommends optimal LLM models for your system",
|
|
5
5
|
"bin": {
|
|
6
6
|
"llm-checker": "bin/cli.js",
|
|
@@ -16,6 +16,10 @@
|
|
|
16
16
|
"test:ui": "node tests/ui-cli-smoke.test.js",
|
|
17
17
|
"test:runtime": "node tests/runtime-specdec-tests.js",
|
|
18
18
|
"test:deterministic-pool": "node tests/deterministic-model-pool-check.js",
|
|
19
|
+
"test:registry": "node tests/model-registry-ingestors.test.js",
|
|
20
|
+
"test:registry-main": "node tests/model-registry-main-flow.test.js",
|
|
21
|
+
"test:registry-recommender": "node tests/model-registry-recommender.test.js",
|
|
22
|
+
"test:registry-seed": "node tests/model-registry-seed.test.js",
|
|
19
23
|
"test:policy": "node tests/policy-commands.test.js",
|
|
20
24
|
"test:policy-cli": "node tests/policy-cli-enforcement.js",
|
|
21
25
|
"test:policy-engine": "node tests/policy-engine.test.js",
|
|
@@ -36,25 +40,26 @@
|
|
|
36
40
|
"list-models": "node bin/enhanced_cli.js list-models",
|
|
37
41
|
"ai-check": "node bin/enhanced_cli.js ai-check",
|
|
38
42
|
"ai-run": "node bin/enhanced_cli.js ai-run",
|
|
39
|
-
"sync:seed": "node bin/enhanced_cli.js sync --force --quiet && node scripts/update-seed-db.js",
|
|
43
|
+
"sync:seed": "node bin/enhanced_cli.js sync --force --quiet && node scripts/update-seed-db.js && node scripts/update-registry-seed.js",
|
|
44
|
+
"sync:registry-seed": "node scripts/update-registry-seed.js",
|
|
40
45
|
"benchmark": "cd ml-model && python python/benchmark_collector.py",
|
|
41
46
|
"train-ai": "cd ml-model && python python/train_model.py",
|
|
42
47
|
"postinstall": "echo 'LLM Checker installed. Run: llm-checker hw-detect'"
|
|
43
48
|
},
|
|
44
49
|
"dependencies": {
|
|
45
|
-
"@modelcontextprotocol/sdk": "^1.
|
|
50
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
46
51
|
"chalk": "^4.1.2",
|
|
47
52
|
"commander": "^11.1.0",
|
|
48
53
|
"inquirer": "^8.2.6",
|
|
49
54
|
"node-fetch": "^2.7.0",
|
|
50
55
|
"ora": "^5.4.1",
|
|
51
|
-
"systeminformation": "^5.31.
|
|
56
|
+
"systeminformation": "^5.31.6",
|
|
52
57
|
"table": "^6.8.1",
|
|
53
|
-
"yaml": "^2.
|
|
58
|
+
"yaml": "^2.9.0",
|
|
54
59
|
"zod": "^3.23.0"
|
|
55
60
|
},
|
|
56
61
|
"optionalDependencies": {
|
|
57
|
-
"sql.js": "^1.14.
|
|
62
|
+
"sql.js": "^1.14.1"
|
|
58
63
|
},
|
|
59
64
|
"overrides": {
|
|
60
65
|
"ajv": "^8.18.0",
|
|
@@ -64,8 +69,8 @@
|
|
|
64
69
|
"test-exclude": "^7.0.1"
|
|
65
70
|
},
|
|
66
71
|
"devDependencies": {
|
|
67
|
-
"@types/node": "^20.
|
|
68
|
-
"jest": "^30.2
|
|
72
|
+
"@types/node": "^20.19.41",
|
|
73
|
+
"jest": "^30.4.2"
|
|
69
74
|
},
|
|
70
75
|
"keywords": [
|
|
71
76
|
"llm",
|