llm-checker 3.6.1 → 3.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.js CHANGED
@@ -20,6 +20,17 @@ const {
20
20
  } = require('./provenance/model-provenance');
21
21
  const { normalizePlatform } = require('./utils/platform');
22
22
 
23
/**
 * Normalize a user-supplied runtime name for recommendation requests.
 *
 * The value is stringified, trimmed and lower-cased before matching, so
 * `' HF '` and `'hf'` are equivalent. `auto` is kept as a first-class result
 * so callers can tell "no runtime preference" apart from an explicit choice.
 *
 * @param {string} [runtime='auto'] Requested runtime. Missing, falsy, blank
 *   (whitespace-only), `'auto'`, `'all'` and `'*'` all mean "no preference".
 * @returns {string} `'auto'`, one of the canonical names handled here
 *   (`'ollama'`, `'vllm'`, `'mlx'`, `'llama.cpp'`, `'transformers'`), or, for
 *   any other input, whatever the shared `normalizeRuntime` helper returns.
 */
function normalizeRecommendationRuntime(runtime = 'auto') {
    const normalized = String(runtime || 'auto').trim().toLowerCase();

    switch (normalized) {
        // A whitespace-only value trims down to '' and carries no preference;
        // treat it like a missing value instead of handing '' to the generic
        // normalizer.
        case '':
        case 'auto':
        case 'all':
        case '*':
            return 'auto';

        // Spelling variants collapse onto the canonical dotted name.
        case 'llamacpp':
        case 'llama_cpp':
            return 'llama.cpp';

        case 'hf':
            return 'transformers';

        // Already canonical: return unchanged.
        case 'ollama':
        case 'vllm':
        case 'mlx':
        case 'llama.cpp':
        case 'transformers':
            return normalized;

        // Anything else (other aliases, unknown names) is delegated to the
        // shared runtime normalizer.
        default:
            return normalizeRuntime(normalized);
    }
}
33
+
23
34
  class LLMChecker {
24
35
  constructor(options = {}) {
25
36
  this.hardwareDetector = new HardwareDetector();
@@ -2467,7 +2478,59 @@ class LLMChecker {
2467
2478
  async generateIntelligentRecommendations(hardware, options = {}) {
2468
2479
  try {
2469
2480
  this.logger.info('Generating intelligent recommendations...');
2470
- const selectedRuntime = normalizeRuntime(options.runtime || 'ollama');
2481
+ const selectedRuntime = normalizeRecommendationRuntime(options.runtime || 'auto');
2482
+ const optimizeFor = options.optimizeFor || options.optimize || 'balanced';
2483
+
2484
+ if (options.registry !== false) {
2485
+ let registryRecommender = null;
2486
+ try {
2487
+ const { RegistryRecommender } = require('./data/registry-recommender');
2488
+ registryRecommender = new RegistryRecommender();
2489
+ await registryRecommender.initialize();
2490
+
2491
+ const registryResult = await registryRecommender.getBestModelsForHardware(hardware, {
2492
+ runtime: selectedRuntime,
2493
+ optimizeFor,
2494
+ limit: 3,
2495
+ poolLimit: options.poolLimit || 20000,
2496
+ localOnly: options.includeGated ? false : true
2497
+ });
2498
+ const recommendations = registryResult.recommendations;
2499
+ const hasRegistryRecommendations = Object.values(recommendations)
2500
+ .some((group) => Array.isArray(group.bestModels) && group.bestModels.length > 0);
2501
+
2502
+ if (hasRegistryRecommendations) {
2503
+ const summary = this.intelligentRecommender.generateRecommendationSummary(
2504
+ recommendations,
2505
+ hardware,
2506
+ { optimizeFor }
2507
+ );
2508
+ const totalModelsAnalyzed = Number(registryResult.totalModelsAnalyzed) || Object.values(recommendations)
2509
+ .reduce((sum, group) => sum + (Number(group.totalCandidates) || Number(group.totalEvaluated) || 0), 0);
2510
+
2511
+ this.logger.info(`Generated registry recommendations for ${Object.keys(recommendations).length} categories`);
2512
+
2513
+ return {
2514
+ recommendations,
2515
+ summary,
2516
+ optimizeFor: summary.optimize_for || optimizeFor,
2517
+ runtime: selectedRuntime,
2518
+ recommendationSource: 'registry',
2519
+ registryStats: registryResult.registryStats,
2520
+ totalModelsAnalyzed,
2521
+ generatedAt: new Date().toISOString()
2522
+ };
2523
+ }
2524
+
2525
+ this.logger.warn('Registry recommendations were empty, falling back to Ollama catalog');
2526
+ } catch (error) {
2527
+ this.logger.warn('Registry recommendations unavailable, falling back to Ollama catalog', { error: error.message });
2528
+ } finally {
2529
+ if (registryRecommender) {
2530
+ registryRecommender.close();
2531
+ }
2532
+ }
2533
+ }
2471
2534
 
2472
2535
  // Prefer the synced SQLite catalog so `llm-checker sync` updates recommendations immediately.
2473
2536
  const ollamaData = await this.loadOllamaModelData();
@@ -2479,11 +2542,11 @@ class LLMChecker {
2479
2542
  }
2480
2543
 
2481
2544
  // Generate intelligent recommendations
2482
- const optimizeFor = options.optimizeFor || options.optimize || 'balanced';
2545
+ const fallbackRuntime = selectedRuntime === 'auto' ? 'ollama' : selectedRuntime;
2483
2546
  const recommendations = await this.intelligentRecommender.getBestModelsForHardware(
2484
2547
  hardware,
2485
2548
  allModels,
2486
- { optimizeFor, runtime: selectedRuntime }
2549
+ { optimizeFor, runtime: fallbackRuntime }
2487
2550
  );
2488
2551
  const summary = this.intelligentRecommender.generateRecommendationSummary(
2489
2552
  recommendations,
@@ -2497,7 +2560,8 @@ class LLMChecker {
2497
2560
  recommendations,
2498
2561
  summary,
2499
2562
  optimizeFor: summary.optimize_for || optimizeFor,
2500
- runtime: selectedRuntime,
2563
+ runtime: fallbackRuntime,
2564
+ recommendationSource: 'ollama_catalog',
2501
2565
  totalModelsAnalyzed: allModels.length,
2502
2566
  generatedAt: new Date().toISOString()
2503
2567
  };
@@ -243,13 +243,12 @@ class DeterministicModelSelector {
243
243
  directVRAM ??
244
244
  0;
245
245
 
246
- // Multi-GPU fallback when only per-GPU memory is known.
247
- if (!explicitTotalVRAM && gpuCount > 1) {
248
- if (vramPerGPU) {
249
- vramGB = vramPerGPU * gpuCount;
250
- } else if (directVRAM && Boolean(gpu.isMultiGPU || input.isMultiGPU)) {
251
- vramGB = Math.max(directVRAM, directVRAM * gpuCount);
252
- }
246
+ // Multi-GPU: only scale up when memory is known to be PER-GPU (vramPerGPU).
247
+ // A bare `vram`/`vramGB` is treated as the box total and never multiplied,
248
+ // so we don't double an already-total figure and falsely "fit" a model
249
+ // (e.g. a 2x24=48GB box must stay 48GB, not become 96GB).
250
+ if (!explicitTotalVRAM && gpuCount > 1 && vramPerGPU) {
251
+ vramGB = vramPerGPU * gpuCount;
253
252
  }
254
253
 
255
254
  let gpuType = gpu.type;
@@ -1152,6 +1151,17 @@ class DeterministicModelSelector {
1152
1151
  return explicitParams;
1153
1152
  }
1154
1153
 
1154
+ // Use the variant's OWN artifact size to DISAMBIGUATE the model-level size
1155
+ // list. A size-unknown variant (e.g. `:latest`) must not blindly inherit
1156
+ // model_sizes[0]: for qwen3 (model_sizes ["30b","235b"]) that mislabeled a
1157
+ // small qwen3:latest as 30B and poisoned the real qwen3:30b size map, making
1158
+ // a 19GB model falsely "fit" a 16GB machine.
1159
+ const artifactSizeGB = this.extractVariantSizeGB(variant, null);
1160
+ const artifactParamsB =
1161
+ (!this.isCloudVariantTag(variant.tag) && Number.isFinite(artifactSizeGB) && artifactSizeGB > 0)
1162
+ ? this.inferParamsFromArtifactSizeGB(artifactSizeGB, quant)
1163
+ : null;
1164
+
1155
1165
  const metadataCandidates = this.extractParameterCandidates(
1156
1166
  ollamaModel.model_sizes,
1157
1167
  ollamaModel.parameters,
@@ -1159,12 +1169,23 @@ class DeterministicModelSelector {
1159
1169
  ollamaModel.parameter_count
1160
1170
  );
1161
1171
  if (metadataCandidates.length > 0) {
1172
+ if (Number.isFinite(artifactParamsB) && artifactParamsB > 0) {
1173
+ // Pick the listed size CLOSEST to what this variant's own artifact
1174
+ // implies; if even the closest is far off, trust the artifact size.
1175
+ let closest = metadataCandidates[0];
1176
+ let bestDiff = Math.abs(closest - artifactParamsB);
1177
+ for (const cand of metadataCandidates) {
1178
+ const diff = Math.abs(cand - artifactParamsB);
1179
+ if (diff < bestDiff) { bestDiff = diff; closest = cand; }
1180
+ }
1181
+ const tolerance = Math.max(2, closest * 0.5);
1182
+ return bestDiff <= tolerance ? closest : artifactParamsB;
1183
+ }
1162
1184
  return metadataCandidates[0];
1163
1185
  }
1164
1186
 
1165
- const artifactSizeGB = this.extractVariantSizeGB(variant, null);
1166
- if (!this.isCloudVariantTag(variant.tag) && Number.isFinite(artifactSizeGB) && artifactSizeGB > 0) {
1167
- return this.inferParamsFromArtifactSizeGB(artifactSizeGB, quant);
1187
+ if (Number.isFinite(artifactParamsB) && artifactParamsB > 0) {
1188
+ return artifactParamsB;
1168
1189
  }
1169
1190
 
1170
1191
  const modelArtifactSizeGB = this.extractArtifactSizeGBFromValue(ollamaModel.main_size);
@@ -1512,28 +1533,35 @@ class DeterministicModelSelector {
1512
1533
  return false;
1513
1534
  }
1514
1535
 
1536
+ // Guard against malformed external pool rows (a missing tags/modalities
1537
+ // /name field used to throw and silently nuke the whole category).
1538
+ const tags = Array.isArray(model.tags) ? model.tags : [];
1539
+ const modalities = Array.isArray(model.modalities) ? model.modalities : [];
1540
+ const name = String(model.name || model.model_identifier || '').toLowerCase();
1541
+ const paramsB = Number(model.paramsB) || 0;
1542
+
1515
1543
  switch (category) {
1516
1544
  case 'coding':
1517
- return model.tags.some(tag => ['coder', 'code', 'instruct'].includes(tag)) ||
1518
- model.name.toLowerCase().includes('code');
1519
-
1545
+ return tags.some(tag => ['coder', 'code', 'instruct'].includes(tag)) ||
1546
+ name.includes('code');
1547
+
1520
1548
  case 'multimodal':
1521
- return model.modalities.includes('vision') ||
1522
- model.tags.includes('vision');
1523
-
1549
+ return modalities.includes('vision') ||
1550
+ tags.includes('vision');
1551
+
1524
1552
  case 'embeddings':
1525
- return model.tags.includes('embedding') ||
1526
- model.tags.includes('embeddings') ||
1527
- model.name.toLowerCase().includes('embed') ||
1528
- model.name.toLowerCase().includes('bge-') ||
1529
- model.name.toLowerCase().includes('nomic-embed') ||
1530
- model.name.toLowerCase().includes('all-minilm') ||
1553
+ return tags.includes('embedding') ||
1554
+ tags.includes('embeddings') ||
1555
+ name.includes('embed') ||
1556
+ name.includes('bge-') ||
1557
+ name.includes('nomic-embed') ||
1558
+ name.includes('all-minilm') ||
1531
1559
  model.specialization === 'embeddings';
1532
-
1560
+
1533
1561
  case 'reasoning':
1534
- return model.tags.includes('instruct') ||
1535
- model.paramsB >= 7; // Prefer larger models for reasoning
1536
-
1562
+ return tags.includes('instruct') ||
1563
+ paramsB >= 7; // Prefer larger models for reasoning
1564
+
1537
1565
  default: // general, reading, summarization
1538
1566
  return true; // Most models can handle these
1539
1567
  }
@@ -1711,15 +1739,19 @@ class DeterministicModelSelector {
1711
1739
  : (Number.isFinite(directVariantMatch) && directVariantMatch > 0 ? directVariantMatch : null);
1712
1740
 
1713
1741
  const parameterProfile = this.resolveMemoryParameterProfile(model);
1714
- const modeledWeightGB = parameterProfile.effectiveParamsB * bpp;
1715
- const preferSparseInferenceParams =
1716
- parameterProfile.isMoE &&
1717
- (parameterProfile.assumptionSource === 'moe_active_metadata' ||
1718
- parameterProfile.assumptionSource === 'moe_derived_expert_ratio');
1719
- const useObservedArtifactSize =
1720
- !preferSparseInferenceParams &&
1721
- Number.isFinite(observedWeightGB) &&
1722
- observedWeightGB > 0;
1742
+ // Weight memory must account for ALL resident parameters. For MoE under
1743
+ // Ollama / Metal / vLLM every expert is resident, so size the weights by
1744
+ // the TOTAL parameter count (not the active count). Active params drive
1745
+ // speed and KV-cache only. Sizing weights by active params used to make a
1746
+ // 236B MoE look like ~14GB and falsely "fit" small hardware.
1747
+ const weightParamsB =
1748
+ parameterProfile.isMoE && Number.isFinite(parameterProfile.totalParamsB) && parameterProfile.totalParamsB > 0
1749
+ ? parameterProfile.totalParamsB
1750
+ : parameterProfile.effectiveParamsB;
1751
+ const modeledWeightGB = weightParamsB * bpp;
1752
+ // A real observed artifact size always wins for weight memory — never let
1753
+ // an MoE "sparse inference" assumption discard a measured on-disk size.
1754
+ const useObservedArtifactSize = Number.isFinite(observedWeightGB) && observedWeightGB > 0;
1723
1755
  const modelMemGB = useObservedArtifactSize ? observedWeightGB : modeledWeightGB;
1724
1756
  const effectiveCtx = Number.isFinite(Number(ctx)) && Number(ctx) > 0 ? Number(ctx) : 4096;
1725
1757
 
@@ -1729,9 +1761,10 @@ class DeterministicModelSelector {
1729
1761
 
1730
1762
  // Runtime overhead (Metal/CUDA context, buffers)
1731
1763
  const runtimeOverhead = useObservedArtifactSize ? 0.35 : 0.5;
1764
+ const usedMoeTotal = parameterProfile.isMoE && weightParamsB === parameterProfile.totalParamsB;
1732
1765
  const memorySource = useObservedArtifactSize
1733
1766
  ? 'observed_artifact_size'
1734
- : (preferSparseInferenceParams ? 'moe_sparse_inference_params' : 'estimated_from_params');
1767
+ : (usedMoeTotal ? 'moe_total_params' : 'estimated_from_params');
1735
1768
 
1736
1769
  return {
1737
1770
  parameterProfile,
@@ -2342,6 +2375,9 @@ class DeterministicModelSelector {
2342
2375
  estimatedRAM: candidate.requiredGB,
2343
2376
  reasoning: candidate.rationale,
2344
2377
  runtime: candidate.runtime || candidate.speed?.runtime || 'ollama',
2378
+ installCommand: candidate.meta.installCommand || provenance.install_command || '',
2379
+ downloadUrl: candidate.meta.downloadUrl || provenance.download_url || '',
2380
+ artifactFormat: candidate.meta.artifact?.format || '',
2345
2381
  memoryAssumptionSource: candidate.memory?.assumptionSource || 'dense_params',
2346
2382
  speedAssumptions: candidate.speed?.moe ? {
2347
2383
  applied: Boolean(candidate.speed.moe.applied),
@@ -2523,19 +2559,24 @@ class DeterministicModelSelector {
2523
2559
  Object.entries(recommendations).forEach(([category, data]) => {
2524
2560
  const bestModel = data.bestModels[0];
2525
2561
  if (bestModel) {
2562
+ const command = bestModel.installCommand ||
2563
+ bestModel.provenance?.install_command ||
2564
+ `ollama pull ${bestModel.model_identifier}`;
2526
2565
  summary.by_category[category] = {
2527
2566
  name: bestModel.model_name || bestModel.name,
2528
2567
  identifier: bestModel.model_identifier,
2529
2568
  score: Math.round(bestModel.categoryScore || bestModel.score),
2530
- command: `ollama pull ${bestModel.model_identifier}`,
2569
+ command,
2531
2570
  size: this.formatModelSize(bestModel),
2532
2571
  quantization: bestModel.quantization || bestModel.quant || 'Q4_K_M',
2572
+ runtime: bestModel.runtime || bestModel.provenance?.runtime || 'ollama',
2533
2573
  pulls: bestModel.pulls || 0,
2534
2574
  source: bestModel.source || bestModel.provenance?.source || 'unknown',
2535
2575
  registry: bestModel.registry || bestModel.provenance?.registry || 'unknown',
2536
2576
  version: bestModel.version || bestModel.provenance?.version || 'unknown',
2537
2577
  license: bestModel.license || bestModel.provenance?.license || 'unknown',
2538
2578
  digest: bestModel.digest || bestModel.provenance?.digest || 'unknown',
2579
+ download_url: bestModel.downloadUrl || bestModel.provenance?.download_url || '',
2539
2580
  provenance: bestModel.provenance || {
2540
2581
  source: bestModel.source || 'unknown',
2541
2582
  registry: bestModel.registry || 'unknown',
@@ -2545,7 +2586,7 @@ class DeterministicModelSelector {
2545
2586
  }
2546
2587
  };
2547
2588
 
2548
- summary.quick_commands.push(`ollama pull ${bestModel.model_identifier}`);
2589
+ summary.quick_commands.push(command);
2549
2590
 
2550
2591
  const isGeneralCategory = ['general', 'coding', 'talking', 'reading'].includes(category);
2551
2592
  const score = bestModel.categoryScore || bestModel.score || 0;
@@ -2559,18 +2600,23 @@ class DeterministicModelSelector {
2559
2600
  });
2560
2601
 
2561
2602
  if (bestOverallModel) {
2603
+ const command = bestOverallModel.installCommand ||
2604
+ bestOverallModel.provenance?.install_command ||
2605
+ `ollama pull ${bestOverallModel.model_identifier}`;
2562
2606
  summary.best_overall = {
2563
2607
  name: bestOverallModel.model_name || bestOverallModel.name,
2564
2608
  identifier: bestOverallModel.model_identifier,
2565
2609
  category: bestOverallCategory,
2566
2610
  score: Math.round(bestOverallScore),
2567
- command: `ollama pull ${bestOverallModel.model_identifier}`,
2611
+ command,
2568
2612
  quantization: bestOverallModel.quantization || bestOverallModel.quant || 'Q4_K_M',
2613
+ runtime: bestOverallModel.runtime || bestOverallModel.provenance?.runtime || 'ollama',
2569
2614
  source: bestOverallModel.source || bestOverallModel.provenance?.source || 'unknown',
2570
2615
  registry: bestOverallModel.registry || bestOverallModel.provenance?.registry || 'unknown',
2571
2616
  version: bestOverallModel.version || bestOverallModel.provenance?.version || 'unknown',
2572
2617
  license: bestOverallModel.license || bestOverallModel.provenance?.license || 'unknown',
2573
2618
  digest: bestOverallModel.digest || bestOverallModel.provenance?.digest || 'unknown',
2619
+ download_url: bestOverallModel.downloadUrl || bestOverallModel.provenance?.download_url || '',
2574
2620
  provenance: bestOverallModel.provenance || {
2575
2621
  source: bestOverallModel.source || 'unknown',
2576
2622
  registry: bestOverallModel.registry || 'unknown',
@@ -24,6 +24,14 @@ const MOE_RUNTIME_PROFILES = Object.freeze({
24
24
  maxEffectiveGain: 2.65,
25
25
  notes: ['optimized scheduler', 'better expert batching', 'lower offload pressure']
26
26
  }),
27
+ transformers: Object.freeze({
28
+ runtime: 'transformers',
29
+ routingOverhead: 0.15,
30
+ communicationOverhead: 0.10,
31
+ offloadOverhead: 0.06,
32
+ maxEffectiveGain: 2.45,
33
+ notes: ['general Hugging Face path', 'broad architecture support', 'higher Python overhead than vLLM']
34
+ }),
27
35
  mlx: Object.freeze({
28
36
  runtime: 'mlx',
29
37
  routingOverhead: 0.16,
@@ -45,6 +53,9 @@ const MOE_RUNTIME_PROFILES = Object.freeze({
45
53
  const RUNTIME_ALIASES = Object.freeze({
46
54
  ollama: 'ollama',
47
55
  vllm: 'vllm',
56
+ transformers: 'transformers',
57
+ 'huggingface-transformers': 'transformers',
58
+ hf: 'transformers',
48
59
  mlx: 'mlx',
49
60
  'mlx-lm': 'mlx',
50
61
  mlx_lm: 'mlx',