llm-checker 3.5.15 → 3.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +28 -8
  2. package/analyzer/compatibility.js +5 -0
  3. package/analyzer/performance.js +5 -4
  4. package/bin/cli.js +5 -39
  5. package/bin/enhanced_cli.js +449 -24
  6. package/bin/mcp-server.mjs +266 -101
  7. package/package.json +13 -8
  8. package/src/ai/multi-objective-selector.js +118 -11
  9. package/src/calibration/calibration-manager.js +4 -1
  10. package/src/data/model-database.js +489 -5
  11. package/src/data/registry-ingestors.js +751 -0
  12. package/src/data/registry-recommender.js +514 -0
  13. package/src/data/seed/README.md +11 -3
  14. package/src/data/seed/models.db +0 -0
  15. package/src/data/sync-manager.js +32 -18
  16. package/src/hardware/backends/apple-silicon.js +5 -1
  17. package/src/hardware/backends/cuda-detector.js +47 -19
  18. package/src/hardware/backends/intel-detector.js +6 -2
  19. package/src/hardware/backends/rocm-detector.js +6 -2
  20. package/src/hardware/detector.js +57 -30
  21. package/src/hardware/unified-detector.js +129 -25
  22. package/src/index.js +68 -4
  23. package/src/models/ai-check-selector.js +36 -5
  24. package/src/models/deterministic-selector.js +179 -18
  25. package/src/models/expanded_database.js +9 -5
  26. package/src/models/intelligent-selector.js +87 -1
  27. package/src/models/moe-assumptions.js +11 -0
  28. package/src/models/requirements.js +16 -11
  29. package/src/models/scoring-core.js +341 -0
  30. package/src/models/scoring-engine.js +9 -2
  31. package/src/ollama/capacity-planner.js +15 -2
  32. package/src/ollama/client.js +70 -30
  33. package/src/ollama/enhanced-client.js +20 -2
  34. package/src/ollama/manager.js +14 -2
  35. package/src/policy/cli-policy.js +8 -2
  36. package/src/policy/policy-engine.js +2 -1
  37. package/src/provenance/model-provenance.js +4 -1
  38. package/src/ui/cli-theme.js +47 -7
  39. package/src/ui/interactive-panel.js +162 -24
@@ -20,6 +20,7 @@ import { promisify } from "util";
20
20
import { fileURLToPath } from "url";
import { dirname, join } from "path";
import { readdir, stat } from "fs/promises";
import { readFileSync, realpathSync } from "fs";
import http from "http";
import os from "os";
25
26
 
@@ -28,8 +29,20 @@ const __filename = fileURLToPath(import.meta.url);
28
29
  const __dirname = dirname(__filename);
29
30
 
30
31
  const CLI_PATH = join(__dirname, "enhanced_cli.js");
32
+ const PACKAGE_JSON_PATH = join(__dirname, "..", "package.json");
31
33
  const OLLAMA_HOST = process.env.OLLAMA_HOST || "http://localhost:11434";
32
34
 
35
/**
 * Resolve the version string the MCP server advertises by reading it from
 * package.json, so the advertised version cannot drift from the package.
 *
 * @param {string} [packagePath=PACKAGE_JSON_PATH] Path of the package.json to read.
 * @returns {string} The manifest's non-empty `version` string, or "0.0.0" when
 *   the file cannot be read or parsed, or when `version` is missing, empty, or
 *   not a string.
 */
function readPackageVersion(packagePath = PACKAGE_JSON_PATH) {
  const FALLBACK_VERSION = "0.0.0";

  let manifest;
  try {
    manifest = JSON.parse(readFileSync(packagePath, "utf8"));
  } catch {
    // Unreadable file or invalid JSON: best-effort fallback, never throw.
    return FALLBACK_VERSION;
  }

  // `?.` covers a manifest that parsed to null (valid JSON, but not an object).
  const version = manifest?.version;
  if (typeof version !== "string" || version === "") {
    return FALLBACK_VERSION;
  }
  return version;
}
45
+
33
46
  // ============================================================================
34
47
  // HELPERS
35
48
  // ============================================================================
@@ -50,8 +63,13 @@ async function run(args, timeout = 120000) {
50
63
  });
51
64
  return clean(stdout || stderr);
52
65
  } catch (err) {
53
- if (err.stdout) return clean(err.stdout);
54
- throw new Error(`llm-checker failed: ${err.message}`);
66
+ // M8: the CLI exited non-zero. Do NOT silently return captured stdout as if
67
+ // it succeeded — that masks failures from the caller (no error signal).
68
+ // Throw so the tool handler's catch surfaces isError, while preserving the
69
+ // captured output in the error message for diagnostics.
70
+ const captured = clean(err.stdout || err.stderr || "");
71
+ const detail = captured ? `${err.message}\n${captured}` : err.message;
72
+ throw new Error(`llm-checker failed: ${detail}`);
55
73
  }
56
74
  }
57
75
 
@@ -109,6 +127,100 @@ function tryParseJSON(text) {
109
127
  }
110
128
  }
111
129
 
130
+ // ----------------------------------------------------------------------------
131
+ // Pure helpers (exported for unit testing)
132
+ // ----------------------------------------------------------------------------
133
+
134
/**
 * M2: Generation speed in tokens/sec derived from an Ollama /api/generate
 * response, whose durations are reported in nanoseconds.
 *
 * A rate is produced only when BOTH inputs coerce to finite, strictly positive
 * numbers. Anything else yields null so callers can render "n/a" rather than
 * dividing by a made-up 1ns duration (which produced billions of tok/s).
 *
 * @param {*} evalCount Number of generated tokens (`eval_count`).
 * @param {*} evalDurationNs Generation time in nanoseconds (`eval_duration`).
 * @returns {number|null} Tokens per second, or null when unavailable.
 */
function tokensPerSecond(evalCount, evalDurationNs) {
  const tokens = Number(evalCount);
  const nanos = Number(evalDurationNs);

  const usable = [tokens, nanos].every((n) => Number.isFinite(n) && n > 0);
  return usable ? (tokens / nanos) * 1e9 : null;
}
146
+
147
/**
 * Render a tokens/sec measurement for display.
 *
 * @param {number|null|undefined} value Rate to format; may be unavailable.
 * @returns {string} The rate with one decimal place, or "n/a" when the value
 *   is null, undefined, or not a finite number.
 */
function formatTokPerSec(value) {
  // Number.isFinite never coerces, so null, undefined, NaN, ±Infinity and
  // non-number inputs all fall through to the placeholder.
  return Number.isFinite(value) ? value.toFixed(1) : "n/a";
}
153
+
154
/**
 * M1: Reduce the structured `hw-detect --json` object to the few facts the
 * optimizer/cleanup tools need, read from typed fields instead of
 * regex-scraping human-readable CLI text.
 *
 * @param {*} json Parsed `hw-detect --json` output. Anything without an object
 *   `summary` is treated as having no hardware information.
 * @returns {{tier: string, vramGB: (number|null), maxGB: number}}
 *   - tier:   `summary.hardwareTier`, trimmed and upper-cased (e.g.
 *             "MEDIUM_HIGH"); "UNKNOWN" when absent or blank.
 *   - vramGB: `summary.totalVRAM` as a number (0 is preserved), or null when
 *             the field is missing or not numeric.
 *   - maxGB:  largest model size that fits: the first positive reading among
 *             effectiveMemory, totalVRAM and systemRAM, minus 2GB headroom,
 *             rounded and clamped at 0. Falls back to 15 when none is usable.
 */
function mapHardwareJson(json) {
  const HEADROOM_GB = 2;
  const DEFAULT_MAX_GB = 15; // sane fallback when JSON lacks memory fields

  // Accept real numbers and non-blank numeric strings only. Bare Number()
  // coercion turns null, "", [] and false into 0, which would masquerade as a
  // genuine 0GB reading and short-circuit the fallback chain below.
  const toFiniteNumber = (value) => {
    if (typeof value === "string" && value.trim() === "") return null;
    if (typeof value !== "number" && typeof value !== "string") return null;
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  };

  const summary = (json && typeof json === "object" && json.summary) || {};

  const rawTier = summary.hardwareTier;
  const tier = typeof rawTier === "string" && rawTier.trim()
    ? rawTier.trim().toUpperCase()
    : "UNKNOWN";

  const vramGB = toFiniteNumber(summary.totalVRAM);

  // Every candidate must be strictly positive to count; a zero or negative
  // memory figure carries no information and must not block the fallbacks.
  const budgetGB = [summary.effectiveMemory, summary.totalVRAM, summary.systemRAM]
    .map(toFiniteNumber)
    .find((gb) => gb !== null && gb > 0);

  const maxGB = budgetGB === undefined
    ? DEFAULT_MAX_GB
    : Math.max(0, Math.round(budgetGB - HEADROOM_GB));

  return { tier, vramGB, maxGB };
}
185
+
186
// M7: Lookup table from framework/marker entry names (files AND directories)
// to the label reported for them. Some markers are dot-directories (".github"
// -> "GitHub Actions"); the directory scan skips dotfiles, so callers must
// check this table before applying that skip.
const FRAMEWORK_MARKERS = {
  // Language ecosystems / package managers
  "package.json": "Node.js",
  "Cargo.toml": "Rust/Cargo",
  "go.mod": "Go Modules",
  "requirements.txt": "Python/pip",
  "pyproject.toml": "Python",
  "Gemfile": "Ruby/Bundler",
  "pom.xml": "Java/Maven",
  "build.gradle": "Java/Gradle",
  "composer.json": "PHP/Composer",

  // Blockchain toolchains
  "Anchor.toml": "Solana/Anchor",
  "hardhat.config.js": "Ethereum/Hardhat",
  "foundry.toml": "Ethereum/Foundry",

  // Build systems
  "CMakeLists.txt": "CMake",
  "Makefile": "Make",

  // Containers and CI
  "Dockerfile": "Docker",
  "docker-compose.yml": "Docker Compose",
  ".github": "GitHub Actions",

  // Frontend tooling
  "next.config.js": "Next.js",
  "next.config.mjs": "Next.js",
  "vite.config.ts": "Vite",
  "tailwind.config.js": "Tailwind CSS",
  "tsconfig.json": "TypeScript",
};

/**
 * Look up the framework label for a directory/file entry name.
 *
 * Pure and synchronous so it can be unit tested (regression for M7: ".github"
 * must resolve to "GitHub Actions").
 *
 * @param {string} name Entry name to classify (base name, not a path).
 * @returns {string|null} The marker's label, or null when `name` is not a
 *   recognized marker. Only own keys count, so inherited names such as
 *   "toString" never match.
 */
function detectFrameworkMarker(name) {
  if (!Object.hasOwn(FRAMEWORK_MARKERS, name)) {
    return null;
  }
  return FRAMEWORK_MARKERS[name];
}
223
+
112
224
  function formatExportBlock(envObject) {
113
225
  if (!envObject || typeof envObject !== "object") return "";
114
226
  const entries = Object.entries(envObject).filter(([, value]) => value !== undefined && value !== null);
@@ -185,9 +297,11 @@ const ALLOWED_CLI_COMMANDS = new Set([
185
297
  // MCP SERVER
186
298
  // ============================================================================
187
299
 
300
+ const SERVER_VERSION = readPackageVersion();
301
+
188
302
  const server = new McpServer({
189
303
  name: "llm-checker",
190
- version: "3.5.11",
304
+ version: SERVER_VERSION,
191
305
  });
192
306
 
193
307
  // ============================================================================
@@ -733,9 +847,7 @@ server.tool(
733
847
  async ({ model, prompt }) => {
734
848
  try {
735
849
  const data = await ollamaAPI("/api/generate", { model, prompt, stream: false }, 300000);
736
- const tokPerSec = data.eval_count && data.eval_duration
737
- ? ((data.eval_count / data.eval_duration) * 1e9).toFixed(1)
738
- : "?";
850
+ const tokPerSec = formatTokPerSec(tokensPerSecond(data.eval_count, data.eval_duration));
739
851
  const result = [
740
852
  `MODEL: ${model}`,
741
853
  `RESPONSE: ${data.response}`,
@@ -776,7 +888,12 @@ server.tool(
776
888
  {},
777
889
  async () => {
778
890
  try {
779
- const hwResult = await run(["hw-detect"]);
891
+ // M1: source hardware facts from structured `hw-detect --json` instead of
892
+ // regex-scraping human-readable CLI text.
893
+ const hwJsonText = await run(["hw-detect", "--json"]);
894
+ const hwJson = tryParseJSON(hwJsonText);
895
+ const { tier, vramGB } = mapHardwareJson(hwJson || {});
896
+
780
897
  const totalMem = os.totalmem();
781
898
  const freeMem = os.freemem();
782
899
  const cpuCount = os.cpus().length;
@@ -784,10 +901,6 @@ server.tool(
784
901
  const freeGB = Math.round(freeMem / 1e9);
785
902
  const platform = os.platform();
786
903
 
787
- // Parse tier from hw-detect output
788
- const tierMatch = hwResult.match(/Tier:\s*(\w[\w\s]*)/i);
789
- const tier = tierMatch ? tierMatch[1].trim().toUpperCase() : "UNKNOWN";
790
-
791
904
  // Determine GPU layers
792
905
  const isApple = platform === "darwin";
793
906
  let numGPU = 999; // Apple Silicon = all layers on GPU
@@ -845,7 +958,7 @@ server.tool(
845
958
  `OLLAMA OPTIMIZATION FOR YOUR SYSTEM`,
846
959
  `====================================`,
847
960
  `Hardware: ${cpuCount} cores, ${totalGB}GB total RAM, ${freeGB}GB free`,
848
- `Platform: ${platform} | Tier: ${tier}`,
961
+ `Platform: ${platform} | Tier: ${tier}${vramGB !== null ? ` | VRAM: ${vramGB}GB` : ""}`,
849
962
  ``,
850
963
  `RECOMMENDED ENVIRONMENT VARIABLES:`,
851
964
  `----------------------------------`,
@@ -883,42 +996,43 @@ server.tool(
883
996
 
884
997
  server.tool(
885
998
  "benchmark",
886
- "Benchmark a local Ollama model: measure tokens/sec, load time, and generation speed with a standardized prompt or custom prompt. Runs 3 iterations for reliable averages.",
999
+ "Benchmark a local Ollama model: measure tokens/sec, load time, and generation speed. With no custom prompt it runs the SAME standardized prompt 3 times (true warm-up iterations); the first (cold) run is excluded from the tok/s average and load time is reported from that first run. A custom prompt is run once.",
887
1000
  {
888
1001
  model: z.string().describe("Model name to benchmark (e.g. 'qwen2.5-coder:7b')"),
889
1002
  prompt: z
890
1003
  .string()
891
1004
  .optional()
892
- .describe("Custom benchmark prompt (default: standardized coding + reasoning prompt)"),
1005
+ .describe("Custom benchmark prompt (default: standardized coding prompt run 3x)"),
893
1006
  },
894
1007
  async ({ model, prompt }) => {
895
- const benchPrompts = prompt
896
- ? [prompt]
897
- : [
898
- "Write a Python function to find the nth Fibonacci number using memoization. Include type hints.",
899
- "Explain the difference between a mutex and a semaphore in 3 sentences.",
900
- "What is the time complexity of quicksort in the average and worst case? Answer briefly.",
901
- ];
1008
+ // M4: when no custom prompt is given, run the SAME prompt N times so these
1009
+ // are real iterations of one workload (warm-up effects, not different
1010
+ // tasks). The first run is cold (includes model load) and is excluded from
1011
+ // the tok/s average; its load time is reported separately.
1012
+ const STANDARD_PROMPT =
1013
+ "Write a Python function to find the nth Fibonacci number using memoization. Include type hints.";
1014
+ const usingCustom = Boolean(prompt);
1015
+ const benchPrompt = usingCustom ? prompt : STANDARD_PROMPT;
1016
+ const iterations = usingCustom ? 1 : 3;
902
1017
 
903
1018
  try {
904
1019
  const results = [];
905
- for (let i = 0; i < benchPrompts.length; i++) {
1020
+ for (let i = 0; i < iterations; i++) {
906
1021
  const data = await ollamaAPI(
907
1022
  "/api/generate",
908
- { model, prompt: benchPrompts[i], stream: false },
1023
+ { model, prompt: benchPrompt, stream: false },
909
1024
  300000
910
1025
  );
911
1026
 
912
1027
  const evalTokens = data.eval_count || 0;
913
- const evalDur = data.eval_duration || 1;
914
- const tokPerSec = (evalTokens / evalDur) * 1e9;
1028
+ // M2: only a finite value when both eval_count and eval_duration are > 0.
1029
+ const tokPerSec = tokensPerSecond(data.eval_count, data.eval_duration);
915
1030
  const totalSec = data.total_duration ? data.total_duration / 1e9 : 0;
916
1031
  const loadMs = data.load_duration ? data.load_duration / 1e6 : 0;
917
1032
  const promptTokens = data.prompt_eval_count || 0;
918
1033
  const promptMs = data.prompt_eval_duration ? data.prompt_eval_duration / 1e6 : 0;
919
1034
 
920
1035
  results.push({
921
- prompt: benchPrompts[i].slice(0, 60) + (benchPrompts[i].length > 60 ? "..." : ""),
922
1036
  evalTokens,
923
1037
  tokPerSec,
924
1038
  totalSec,
@@ -929,30 +1043,40 @@ server.tool(
929
1043
  });
930
1044
  }
931
1045
 
932
- // Compute averages
933
- const avgTokPerSec = results.reduce((s, r) => s + r.tokPerSec, 0) / results.length;
934
- const avgTotalSec = results.reduce((s, r) => s + r.totalSec, 0) / results.length;
935
- const avgLoadMs = results.reduce((s, r) => s + r.loadMs, 0) / results.length;
1046
+ // tok/s average over WARM runs only (exclude the cold first run when we
1047
+ // have more than one iteration). Drop unavailable (null) measurements.
1048
+ const warmRuns = iterations > 1 ? results.slice(1) : results;
1049
+ const warmTokRates = warmRuns.map((r) => r.tokPerSec).filter((v) => v !== null);
1050
+ const avgTokPerSec = warmTokRates.length > 0
1051
+ ? warmTokRates.reduce((s, v) => s + v, 0) / warmTokRates.length
1052
+ : null;
1053
+
1054
+ const warmTotals = warmRuns.map((r) => r.totalSec);
1055
+ const avgTotalSec = warmTotals.reduce((s, v) => s + v, 0) / warmTotals.length;
1056
+ // Load time is a cold-start cost: report it from the first run only.
1057
+ const coldLoadMs = results[0]?.loadMs ?? 0;
936
1058
  const totalTokens = results.reduce((s, r) => s + r.evalTokens, 0);
937
1059
 
1060
+ const promptPreview = benchPrompt.slice(0, 60) + (benchPrompt.length > 60 ? "..." : "");
1061
+
938
1062
  const output = [
939
1063
  `BENCHMARK: ${model}`,
940
1064
  `${"=".repeat(60)}`,
941
- `Iterations: ${results.length}`,
1065
+ `Prompt: "${promptPreview}"`,
1066
+ `Iterations: ${results.length}${iterations > 1 ? " (run 1 = cold, excluded from speed average)" : ""}`,
942
1067
  ``,
943
1068
  ...results.map((r, i) => [
944
- `--- Run ${i + 1} ---`,
945
- `Prompt: "${r.prompt}"`,
946
- `Generated: ${r.evalTokens} tokens at ${r.tokPerSec.toFixed(1)} tok/s`,
1069
+ `--- Run ${i + 1}${iterations > 1 && i === 0 ? " (cold)" : ""} ---`,
1070
+ `Generated: ${r.evalTokens} tokens at ${formatTokPerSec(r.tokPerSec)} tok/s`,
947
1071
  `Total: ${r.totalSec.toFixed(2)}s | Load: ${r.loadMs.toFixed(0)}ms | Prompt eval: ${r.promptMs.toFixed(0)}ms (${r.promptTokens} tokens)`,
948
1072
  `Response: "${r.responsePreview}..."`,
949
1073
  ``,
950
1074
  ]).flat(),
951
1075
  `${"=".repeat(60)}`,
952
- `AVERAGES:`,
953
- ` Generation speed: ${avgTokPerSec.toFixed(1)} tok/s`,
1076
+ iterations > 1 ? `WARM AVERAGES (excludes cold run 1):` : `RESULTS:`,
1077
+ ` Generation speed: ${formatTokPerSec(avgTokPerSec)} tok/s`,
954
1078
  ` Total time: ${avgTotalSec.toFixed(2)}s`,
955
- ` Load time: ${avgLoadMs.toFixed(0)}ms`,
1079
+ ` Cold load time: ${coldLoadMs.toFixed(0)}ms`,
956
1080
  ` Total tokens generated: ${totalTokens}`,
957
1081
  ].join("\n");
958
1082
 
@@ -969,7 +1093,7 @@ server.tool(
969
1093
 
970
1094
  server.tool(
971
1095
  "compare_models",
972
- "Compare two local Ollama models head-to-head: same prompt, measured speed, token count, and response quality side by side",
1096
+ "Compare two local Ollama models head-to-head on the same prompt. Models are run SEQUENTIALLY (model A fully, then model B) so each tok/s measurement is uncontended — running them in parallel would make them fight over GPU/RAM and invalidate the speed comparison.",
973
1097
  {
974
1098
  model_a: z.string().describe("First model (e.g. 'qwen2.5-coder:7b')"),
975
1099
  model_b: z.string().describe("Second model (e.g. 'codellama:7b')"),
@@ -982,16 +1106,15 @@ server.tool(
982
1106
  const testPrompt = prompt || "Write a Python function that checks if a string is a valid IPv4 address. Include edge cases.";
983
1107
 
984
1108
  try {
985
- // Run both models
986
- const [resultA, resultB] = await Promise.all([
987
- ollamaAPI("/api/generate", { model: model_a, prompt: testPrompt, stream: false }, 300000),
988
- ollamaAPI("/api/generate", { model: model_b, prompt: testPrompt, stream: false }, 300000),
989
- ]);
1109
+ // M3: run sequentially so the two models do not contend for GPU/RAM.
1110
+ // Each measurement is taken while the other model is not executing.
1111
+ const resultA = await ollamaAPI("/api/generate", { model: model_a, prompt: testPrompt, stream: false }, 300000);
1112
+ const resultB = await ollamaAPI("/api/generate", { model: model_b, prompt: testPrompt, stream: false }, 300000);
990
1113
 
991
1114
  function metrics(data) {
992
1115
  const evalTokens = data.eval_count || 0;
993
- const evalDur = data.eval_duration || 1;
994
- const tokPerSec = (evalTokens / evalDur) * 1e9;
1116
+ // M2: null when eval_count/eval_duration are not both positive.
1117
+ const tokPerSec = tokensPerSecond(data.eval_count, data.eval_duration);
995
1118
  const totalSec = data.total_duration ? data.total_duration / 1e9 : 0;
996
1119
  const loadSec = data.load_duration ? data.load_duration / 1e9 : 0;
997
1120
  return { evalTokens, tokPerSec, totalSec, loadSec, response: data.response || "" };
@@ -1000,17 +1123,23 @@ server.tool(
1000
1123
  const a = metrics(resultA);
1001
1124
  const b = metrics(resultB);
1002
1125
 
1003
- const speedWinner = a.tokPerSec > b.tokPerSec ? model_a : model_b;
1004
- const verbosityWinner = a.evalTokens > b.evalTokens ? model_a : model_b;
1126
+ // Only declare a speed winner when both rates are known.
1127
+ let speedWinner;
1128
+ if (a.tokPerSec === null && b.tokPerSec === null) speedWinner = "n/a (no timing data)";
1129
+ else if (a.tokPerSec === null) speedWinner = model_b;
1130
+ else if (b.tokPerSec === null) speedWinner = model_a;
1131
+ else speedWinner = a.tokPerSec >= b.tokPerSec ? model_a : model_b;
1132
+
1133
+ const verbosityWinner = a.evalTokens >= b.evalTokens ? model_a : model_b;
1005
1134
 
1006
1135
  const output = [
1007
- `HEAD-TO-HEAD COMPARISON`,
1136
+ `HEAD-TO-HEAD COMPARISON (sequential runs, uncontended)`,
1008
1137
  `${"=".repeat(70)}`,
1009
1138
  `Prompt: "${testPrompt.slice(0, 80)}${testPrompt.length > 80 ? "..." : ""}"`,
1010
1139
  ``,
1011
1140
  `METRIC ${model_a.padEnd(25)} ${model_b.padEnd(25)}`,
1012
1141
  `-`.repeat(70),
1013
- `Speed (tok/s) ${a.tokPerSec.toFixed(1).padEnd(25)} ${b.tokPerSec.toFixed(1).padEnd(25)}`,
1142
+ `Speed (tok/s) ${formatTokPerSec(a.tokPerSec).padEnd(25)} ${formatTokPerSec(b.tokPerSec).padEnd(25)}`,
1014
1143
  `Tokens generated ${String(a.evalTokens).padEnd(25)} ${String(b.evalTokens).padEnd(25)}`,
1015
1144
  `Total time ${(a.totalSec.toFixed(2) + "s").padEnd(25)} ${(b.totalSec.toFixed(2) + "s").padEnd(25)}`,
1016
1145
  `Load time ${(a.loadSec.toFixed(2) + "s").padEnd(25)} ${(b.loadSec.toFixed(2) + "s").padEnd(25)}`,
@@ -1042,9 +1171,9 @@ server.tool(
1042
1171
  {},
1043
1172
  async () => {
1044
1173
  try {
1045
- const [tagsData, hwResult] = await Promise.all([
1174
+ const [tagsData, hwJsonText] = await Promise.all([
1046
1175
  ollamaAPI("/api/tags", null, 10000),
1047
- run(["hw-detect"]),
1176
+ run(["hw-detect", "--json"]),
1048
1177
  ]);
1049
1178
 
1050
1179
  if (!tagsData.models || tagsData.models.length === 0) {
@@ -1054,11 +1183,10 @@ server.tool(
1054
1183
  const models = tagsData.models;
1055
1184
  const totalSize = models.reduce((s, m) => s + (m.size || 0), 0);
1056
1185
 
1057
- // Parse hardware tier
1058
- const tierMatch = hwResult.match(/Tier:\s*(\w[\w\s]*)/i);
1059
- const tier = tierMatch ? tierMatch[1].trim() : "UNKNOWN";
1060
- const maxSizeMatch = hwResult.match(/Max model size:\s*(\d+)/i);
1061
- const maxGB = maxSizeMatch ? parseInt(maxSizeMatch[1]) : 15;
1186
+ // M1: source hardware tier + max model size from structured JSON instead
1187
+ // of regex-scraping CLI text. mapHardwareJson() falls back to a sane 15GB
1188
+ // default if the JSON could not be parsed (tryParseJSON -> null).
1189
+ const { maxGB } = mapHardwareJson(tryParseJSON(hwJsonText) || {});
1062
1190
 
1063
1191
  // Analyze each model
1064
1192
  const analysis = models.map((m) => {
@@ -1172,46 +1300,23 @@ server.tool(
1172
1300
  ".sh": "Shell", ".bash": "Shell", ".zsh": "Shell",
1173
1301
  };
1174
1302
 
1175
- const frameworkFiles = {
1176
- "package.json": "Node.js",
1177
- "Cargo.toml": "Rust/Cargo",
1178
- "go.mod": "Go Modules",
1179
- "requirements.txt": "Python/pip",
1180
- "pyproject.toml": "Python",
1181
- "Gemfile": "Ruby/Bundler",
1182
- "pom.xml": "Java/Maven",
1183
- "build.gradle": "Java/Gradle",
1184
- "composer.json": "PHP/Composer",
1185
- "Anchor.toml": "Solana/Anchor",
1186
- "hardhat.config.js": "Ethereum/Hardhat",
1187
- "foundry.toml": "Ethereum/Foundry",
1188
- "CMakeLists.txt": "CMake",
1189
- "Makefile": "Make",
1190
- "Dockerfile": "Docker",
1191
- "docker-compose.yml": "Docker Compose",
1192
- ".github": "GitHub Actions",
1193
- "next.config.js": "Next.js",
1194
- "next.config.mjs": "Next.js",
1195
- "vite.config.ts": "Vite",
1196
- "tailwind.config.js": "Tailwind CSS",
1197
- "tsconfig.json": "TypeScript",
1198
- };
1199
-
1200
1303
  async function scanDir(dir, depth = 0) {
1201
1304
  if (depth > 4) return; // Max depth
1202
1305
  try {
1203
1306
  const entries = await readdir(dir, { withFileTypes: true });
1204
1307
  for (const entry of entries) {
1308
+ // M7: detect known framework markers (incl. dot-directories like
1309
+ // ".github" -> "GitHub Actions") BEFORE skipping dotfiles, otherwise
1310
+ // the dotfile skip below means ".github" can never be matched.
1311
+ const marker = detectFrameworkMarker(entry.name);
1312
+ if (marker) frameworks.add(marker);
1313
+
1205
1314
  if (entry.name.startsWith(".") || entry.name === "node_modules" || entry.name === "target" || entry.name === "__pycache__" || entry.name === "dist" || entry.name === "build" || entry.name === "vendor") continue;
1206
1315
 
1207
1316
  const fullPath = join(dir, entry.name);
1208
1317
  if (entry.isDirectory()) {
1209
- if (frameworkFiles[entry.name]) frameworks.add(frameworkFiles[entry.name]);
1210
1318
  await scanDir(fullPath, depth + 1);
1211
1319
  } else {
1212
- // Check framework files
1213
- if (frameworkFiles[entry.name]) frameworks.add(frameworkFiles[entry.name]);
1214
-
1215
1320
  // Count by extension
1216
1321
  const ext = entry.name.includes(".") ? "." + entry.name.split(".").pop().toLowerCase() : "";
1217
1322
  if (extMap[ext]) {
@@ -1302,8 +1407,14 @@ server.tool(
1302
1407
  const cpus = os.cpus();
1303
1408
  const loadAvg = os.loadavg();
1304
1409
 
1305
- // CPU usage from loadavg
1306
- const cpuPercent = ((loadAvg[0] / cpus.length) * 100).toFixed(1);
1410
+ // M5: os.loadavg() is always [0,0,0] on Windows, so deriving CPU% from it
1411
+ // reports a misleading 0%. Detect that case (Windows, or an all-zero
1412
+ // loadavg) and show "n/a" instead.
1413
+ const loadAvgUnavailable =
1414
+ os.platform() === "win32" || (loadAvg[0] === 0 && loadAvg[1] === 0 && loadAvg[2] === 0);
1415
+ const cpuPercentLabel = loadAvgUnavailable
1416
+ ? "n/a (load average not reported on this platform)"
1417
+ : `${((loadAvg[0] / cpus.length) * 100).toFixed(1)}% (load: ${loadAvg[0].toFixed(2)})`;
1307
1418
 
1308
1419
  // Installed models total size
1309
1420
  const installedModels = tagsData.models || [];
@@ -1317,40 +1428,55 @@ server.tool(
1317
1428
  `${"=".repeat(60)}`,
1318
1429
  ``,
1319
1430
  `SYSTEM RESOURCES:`,
1320
- ` RAM: ${formatBytes(usedMem)} / ${formatBytes(totalMem)} (${memPercent}% used)`,
1431
+ ` System RAM: ${formatBytes(usedMem)} / ${formatBytes(totalMem)} (${memPercent}% used)`,
1321
1432
  ` Free: ${formatBytes(freeMem)}`,
1322
- ` CPU: ${cpuPercent}% (${cpus.length} cores, load: ${loadAvg[0].toFixed(2)})`,
1433
+ ` CPU: ${cpuPercentLabel} (${cpus.length} cores)`,
1323
1434
  ``,
1324
1435
  `OLLAMA STATUS:`,
1325
1436
  ` Installed models: ${installedModels.length} (${formatBytes(totalModelSize)} on disk)`,
1326
1437
  ` Running models: ${runningModels.length}`,
1327
1438
  ];
1328
1439
 
1440
+ // M5: surface each running model's VRAM residency from /api/ps.
1441
+ let anyOnGpu = false;
1329
1442
  if (runningModels.length > 0) {
1330
1443
  lines.push(``, ` LOADED IN MEMORY:`);
1331
1444
  for (const m of runningModels) {
1332
- const vram = m.size_vram ? formatBytes(m.size_vram) : "?";
1333
- const ram = m.size ? formatBytes(m.size) : "?";
1445
+ const sizeVram = Number(m.size_vram) || 0;
1446
+ const totalLoaded = Number(m.size) || 0;
1447
+ if (sizeVram > 0) anyOnGpu = true;
1448
+ const vram = sizeVram > 0 ? formatBytes(sizeVram) : "0 B";
1449
+ const total = totalLoaded > 0 ? formatBytes(totalLoaded) : "?";
1450
+ // Portion resident in system RAM (CPU offload) = total - VRAM.
1451
+ const ramPortion = totalLoaded > sizeVram ? formatBytes(totalLoaded - sizeVram) : "0 B";
1334
1452
  const expires = m.expires_at ? new Date(m.expires_at).toLocaleTimeString() : "?";
1335
- lines.push(` ${m.name.padEnd(25)} VRAM: ${vram.padEnd(10)} RAM: ${ram.padEnd(10)} Expires: ${expires}`);
1453
+ lines.push(
1454
+ ` ${m.name.padEnd(25)} total ${total.padEnd(10)} VRAM ${vram.padEnd(10)} sysRAM ${ramPortion.padEnd(10)} Expires: ${expires}`
1455
+ );
1336
1456
  }
1337
1457
  } else {
1338
1458
  lines.push(``, ` No models currently loaded in memory.`);
1339
1459
  }
1340
1460
 
1341
- // Memory headroom analysis
1461
+ // M5: the system-RAM free figure is NOT the constraint on GPU boxes —
1462
+ // model residency is bounded by VRAM there. Clarify that this headroom
1463
+ // note reflects system RAM, and call out that VRAM is the real limit when
1464
+ // models are running on the GPU.
1342
1465
  const freeGB = freeMem / 1e9;
1343
1466
  lines.push(
1344
1467
  ``,
1345
- `MEMORY HEADROOM:`,
1346
- ` Available for models: ~${freeGB.toFixed(1)}GB`,
1468
+ `MEMORY HEADROOM (system RAM):`,
1469
+ ` Free system RAM: ~${freeGB.toFixed(1)}GB`,
1347
1470
  freeGB > 12
1348
- ? ` Status: PLENTY — can load 14B+ models comfortably`
1471
+ ? ` Status: PLENTY — can load 14B+ models comfortably (system-RAM view)`
1349
1472
  : freeGB > 6
1350
- ? ` Status: OK — can load 7B models, 14B might be tight`
1473
+ ? ` Status: OK — can load 7B models, 14B might be tight (system-RAM view)`
1351
1474
  : freeGB > 3
1352
- ? ` Status: LOW — stick to 3B-7B models`
1353
- : ` Status: CRITICAL — close other apps before running models`
1475
+ ? ` Status: LOW — stick to 3B-7B models (system-RAM view)`
1476
+ : ` Status: CRITICAL — close other apps before running models (system-RAM view)`,
1477
+ anyOnGpu
1478
+ ? ` NOTE: models are loaded into VRAM on this box — GPU VRAM (see per-model VRAM above), not system RAM, is the real loading constraint.`
1479
+ : ` NOTE: on dedicated-GPU systems, GPU VRAM (not system RAM) is the real constraint for loading models.`
1354
1480
  );
1355
1481
 
1356
1482
  return { content: [{ type: "text", text: lines.join("\n") }] };
@@ -1364,5 +1490,44 @@ server.tool(
1364
1490
  // START
1365
1491
  // ============================================================================
1366
1492
 
1367
- const transport = new StdioServerTransport();
1368
- await server.connect(transport);
1493
/**
 * Attach the MCP server to a stdio transport and begin serving.
 *
 * Kept inside a function so that merely importing this module (e.g. from
 * tests) does NOT start the server; only the entry-point guard calls it.
 *
 * @returns {Promise<void>} Settles once `server.connect` has completed.
 */
async function main() {
  await server.connect(new StdioServerTransport());
}
1500
+
1501
/**
 * Report whether this module is the process entry point.
 *
 * When the file is run directly, process.argv[1] names this file; when it is
 * merely imported (e.g. from a test), argv[1] names the importer, so the
 * server must not be started.
 *
 * Both sides are canonicalized with realpathSync before comparing. Node
 * resolves symlinks for import.meta.url but leaves process.argv[1] as the
 * path it was launched with, so a launch through a symlink (an npm `.bin`
 * shim, or a symlinked temp/home directory) would otherwise compare unequal
 * and the server would silently never start.
 *
 * @returns {boolean} true when argv[1] refers to this file; false when it is
 *   absent, refers to another file, or cannot be converted to a path.
 */
function runningAsEntry() {
  const entry = process.argv[1];
  if (!entry) return false;

  // Best effort: if the path cannot be resolved (e.g. it does not exist),
  // compare it as given rather than failing the whole check.
  const canonical = (filePath) => {
    try {
      return realpathSync(filePath);
    } catch {
      return filePath;
    }
  };

  try {
    const thisPath = canonical(fileURLToPath(import.meta.url));
    // argv[1] is normally a filesystem path; tolerate a file:// URL as well.
    const entryPath = canonical(entry.startsWith("file://") ? fileURLToPath(entry) : entry);
    return entryPath === thisPath;
  } catch {
    // fileURLToPath rejects malformed or non-file URLs: not our entry point.
    return false;
  }
}
1518
+
1519
+ if (runningAsEntry()) {
1520
+ await main();
1521
+ }
1522
+
1523
+ // Exported for unit testing. Importing this module must NOT start the server
1524
+ // (see runningAsEntry guard above).
1525
+ export {
1526
+ SERVER_VERSION,
1527
+ readPackageVersion,
1528
+ tokensPerSecond,
1529
+ formatTokPerSec,
1530
+ mapHardwareJson,
1531
+ detectFrameworkMarker,
1532
+ FRAMEWORK_MARKERS,
1533
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "llm-checker",
3
- "version": "3.5.15",
3
+ "version": "3.7.0",
4
4
  "description": "Intelligent CLI tool with AI-powered model selection that analyzes your hardware and recommends optimal LLM models for your system",
5
5
  "bin": {
6
6
  "llm-checker": "bin/cli.js",
@@ -16,6 +16,10 @@
16
16
  "test:ui": "node tests/ui-cli-smoke.test.js",
17
17
  "test:runtime": "node tests/runtime-specdec-tests.js",
18
18
  "test:deterministic-pool": "node tests/deterministic-model-pool-check.js",
19
+ "test:registry": "node tests/model-registry-ingestors.test.js",
20
+ "test:registry-main": "node tests/model-registry-main-flow.test.js",
21
+ "test:registry-recommender": "node tests/model-registry-recommender.test.js",
22
+ "test:registry-seed": "node tests/model-registry-seed.test.js",
19
23
  "test:policy": "node tests/policy-commands.test.js",
20
24
  "test:policy-cli": "node tests/policy-cli-enforcement.js",
21
25
  "test:policy-engine": "node tests/policy-engine.test.js",
@@ -36,25 +40,26 @@
36
40
  "list-models": "node bin/enhanced_cli.js list-models",
37
41
  "ai-check": "node bin/enhanced_cli.js ai-check",
38
42
  "ai-run": "node bin/enhanced_cli.js ai-run",
39
- "sync:seed": "node bin/enhanced_cli.js sync --force --quiet && node scripts/update-seed-db.js",
43
+ "sync:seed": "node bin/enhanced_cli.js sync --force --quiet && node scripts/update-seed-db.js && node scripts/update-registry-seed.js",
44
+ "sync:registry-seed": "node scripts/update-registry-seed.js",
40
45
  "benchmark": "cd ml-model && python python/benchmark_collector.py",
41
46
  "train-ai": "cd ml-model && python python/train_model.py",
42
47
  "postinstall": "echo 'LLM Checker installed. Run: llm-checker hw-detect'"
43
48
  },
44
49
  "dependencies": {
45
- "@modelcontextprotocol/sdk": "^1.26.0",
50
+ "@modelcontextprotocol/sdk": "^1.29.0",
46
51
  "chalk": "^4.1.2",
47
52
  "commander": "^11.1.0",
48
53
  "inquirer": "^8.2.6",
49
54
  "node-fetch": "^2.7.0",
50
55
  "ora": "^5.4.1",
51
- "systeminformation": "^5.31.1",
56
+ "systeminformation": "^5.31.6",
52
57
  "table": "^6.8.1",
53
- "yaml": "^2.8.1",
58
+ "yaml": "^2.9.0",
54
59
  "zod": "^3.23.0"
55
60
  },
56
61
  "optionalDependencies": {
57
- "sql.js": "^1.14.0"
62
+ "sql.js": "^1.14.1"
58
63
  },
59
64
  "overrides": {
60
65
  "ajv": "^8.18.0",
@@ -64,8 +69,8 @@
64
69
  "test-exclude": "^7.0.1"
65
70
  },
66
71
  "devDependencies": {
67
- "@types/node": "^20.0.0",
68
- "jest": "^30.2.0"
72
+ "@types/node": "^20.19.41",
73
+ "jest": "^30.4.2"
69
74
  },
70
75
  "keywords": [
71
76
  "llm",