llm-checker 3.6.1 → 3.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -8
- package/bin/enhanced_cli.js +407 -5
- package/bin/mcp-server.mjs +5 -0
- package/package.json +7 -2
- package/src/data/model-database.js +452 -0
- package/src/data/registry-ingestors.js +765 -0
- package/src/data/registry-recommender.js +632 -0
- package/src/data/seed/README.md +11 -3
- package/src/data/seed/models.db +0 -0
- package/src/index.js +68 -4
- package/src/models/deterministic-selector.js +85 -39
- package/src/models/moe-assumptions.js +11 -0
package/src/index.js
CHANGED
|
@@ -20,6 +20,17 @@ const {
|
|
|
20
20
|
} = require('./provenance/model-provenance');
|
|
21
21
|
const { normalizePlatform } = require('./utils/platform');
|
|
22
22
|
|
|
23
|
+
/**
 * Canonicalise the runtime requested for a recommendation run.
 *
 * The input is stringified, trimmed and lower-cased before matching, so
 * values such as `' vLLM '` or `'HF'` are accepted.
 *
 * Resolution order:
 *   1. Missing / blank input, `'auto'`, `'all'` or `'*'`  -> `'auto'`
 *   2. `'llamacpp'` / `'llama_cpp'`                       -> `'llama.cpp'`
 *      `'hf'`                                             -> `'transformers'`
 *      `'ollama'`, `'vllm'`, `'mlx'`, `'llama.cpp'`,
 *      `'transformers'`                                   -> returned as-is
 *   3. Anything else is delegated to `normalizeRuntime`, which is defined
 *      outside this function (its handling of unknown names is not visible
 *      here).
 *
 * @param {*} [runtime='auto'] - Requested runtime name; any falsy value is treated as `'auto'`.
 * @returns {string} `'auto'`, one of the canonical runtime names above, or whatever
 *   `normalizeRuntime` returns for other input.
 */
function normalizeRecommendationRuntime(runtime = 'auto') {
    const normalized = String(runtime || 'auto').trim().toLowerCase();

    // A whitespace-only value is truthy, so it slips past the `|| 'auto'`
    // default and only becomes '' after trimming. Treat it like any other
    // "no runtime given" input instead of forwarding '' to normalizeRuntime.
    if (normalized === '' || ['auto', 'all', '*'].includes(normalized)) return 'auto';

    if (['ollama', 'vllm', 'mlx', 'llama.cpp', 'llamacpp', 'llama_cpp', 'transformers', 'hf'].includes(normalized)) {
        // Fold spelling variants onto their canonical runtime name.
        if (normalized === 'llamacpp' || normalized === 'llama_cpp') return 'llama.cpp';
        if (normalized === 'hf') return 'transformers';
        return normalized;
    }

    // Not a name handled locally: defer to the shared runtime normalizer.
    return normalizeRuntime(normalized);
}
|
|
33
|
+
|
|
23
34
|
class LLMChecker {
|
|
24
35
|
constructor(options = {}) {
|
|
25
36
|
this.hardwareDetector = new HardwareDetector();
|
|
@@ -2467,7 +2478,59 @@ class LLMChecker {
|
|
|
2467
2478
|
async generateIntelligentRecommendations(hardware, options = {}) {
|
|
2468
2479
|
try {
|
|
2469
2480
|
this.logger.info('Generating intelligent recommendations...');
|
|
2470
|
-
const selectedRuntime =
|
|
2481
|
+
const selectedRuntime = normalizeRecommendationRuntime(options.runtime || 'auto');
|
|
2482
|
+
const optimizeFor = options.optimizeFor || options.optimize || 'balanced';
|
|
2483
|
+
|
|
2484
|
+
if (options.registry !== false) {
|
|
2485
|
+
let registryRecommender = null;
|
|
2486
|
+
try {
|
|
2487
|
+
const { RegistryRecommender } = require('./data/registry-recommender');
|
|
2488
|
+
registryRecommender = new RegistryRecommender();
|
|
2489
|
+
await registryRecommender.initialize();
|
|
2490
|
+
|
|
2491
|
+
const registryResult = await registryRecommender.getBestModelsForHardware(hardware, {
|
|
2492
|
+
runtime: selectedRuntime,
|
|
2493
|
+
optimizeFor,
|
|
2494
|
+
limit: 3,
|
|
2495
|
+
poolLimit: options.poolLimit || 20000,
|
|
2496
|
+
localOnly: options.includeGated ? false : true
|
|
2497
|
+
});
|
|
2498
|
+
const recommendations = registryResult.recommendations;
|
|
2499
|
+
const hasRegistryRecommendations = Object.values(recommendations)
|
|
2500
|
+
.some((group) => Array.isArray(group.bestModels) && group.bestModels.length > 0);
|
|
2501
|
+
|
|
2502
|
+
if (hasRegistryRecommendations) {
|
|
2503
|
+
const summary = this.intelligentRecommender.generateRecommendationSummary(
|
|
2504
|
+
recommendations,
|
|
2505
|
+
hardware,
|
|
2506
|
+
{ optimizeFor }
|
|
2507
|
+
);
|
|
2508
|
+
const totalModelsAnalyzed = Number(registryResult.totalModelsAnalyzed) || Object.values(recommendations)
|
|
2509
|
+
.reduce((sum, group) => sum + (Number(group.totalCandidates) || Number(group.totalEvaluated) || 0), 0);
|
|
2510
|
+
|
|
2511
|
+
this.logger.info(`Generated registry recommendations for ${Object.keys(recommendations).length} categories`);
|
|
2512
|
+
|
|
2513
|
+
return {
|
|
2514
|
+
recommendations,
|
|
2515
|
+
summary,
|
|
2516
|
+
optimizeFor: summary.optimize_for || optimizeFor,
|
|
2517
|
+
runtime: selectedRuntime,
|
|
2518
|
+
recommendationSource: 'registry',
|
|
2519
|
+
registryStats: registryResult.registryStats,
|
|
2520
|
+
totalModelsAnalyzed,
|
|
2521
|
+
generatedAt: new Date().toISOString()
|
|
2522
|
+
};
|
|
2523
|
+
}
|
|
2524
|
+
|
|
2525
|
+
this.logger.warn('Registry recommendations were empty, falling back to Ollama catalog');
|
|
2526
|
+
} catch (error) {
|
|
2527
|
+
this.logger.warn('Registry recommendations unavailable, falling back to Ollama catalog', { error: error.message });
|
|
2528
|
+
} finally {
|
|
2529
|
+
if (registryRecommender) {
|
|
2530
|
+
registryRecommender.close();
|
|
2531
|
+
}
|
|
2532
|
+
}
|
|
2533
|
+
}
|
|
2471
2534
|
|
|
2472
2535
|
// Prefer the synced SQLite catalog so `llm-checker sync` updates recommendations immediately.
|
|
2473
2536
|
const ollamaData = await this.loadOllamaModelData();
|
|
@@ -2479,11 +2542,11 @@ class LLMChecker {
|
|
|
2479
2542
|
}
|
|
2480
2543
|
|
|
2481
2544
|
// Generar recomendaciones inteligentes
|
|
2482
|
-
const
|
|
2545
|
+
const fallbackRuntime = selectedRuntime === 'auto' ? 'ollama' : selectedRuntime;
|
|
2483
2546
|
const recommendations = await this.intelligentRecommender.getBestModelsForHardware(
|
|
2484
2547
|
hardware,
|
|
2485
2548
|
allModels,
|
|
2486
|
-
{ optimizeFor, runtime:
|
|
2549
|
+
{ optimizeFor, runtime: fallbackRuntime }
|
|
2487
2550
|
);
|
|
2488
2551
|
const summary = this.intelligentRecommender.generateRecommendationSummary(
|
|
2489
2552
|
recommendations,
|
|
@@ -2497,7 +2560,8 @@ class LLMChecker {
|
|
|
2497
2560
|
recommendations,
|
|
2498
2561
|
summary,
|
|
2499
2562
|
optimizeFor: summary.optimize_for || optimizeFor,
|
|
2500
|
-
runtime:
|
|
2563
|
+
runtime: fallbackRuntime,
|
|
2564
|
+
recommendationSource: 'ollama_catalog',
|
|
2501
2565
|
totalModelsAnalyzed: allModels.length,
|
|
2502
2566
|
generatedAt: new Date().toISOString()
|
|
2503
2567
|
};
|
|
@@ -243,13 +243,12 @@ class DeterministicModelSelector {
|
|
|
243
243
|
directVRAM ??
|
|
244
244
|
0;
|
|
245
245
|
|
|
246
|
-
// Multi-GPU
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
}
|
|
246
|
+
// Multi-GPU: only scale up when memory is known to be PER-GPU (vramPerGPU).
|
|
247
|
+
// A bare `vram`/`vramGB` is treated as the box total and never multiplied,
|
|
248
|
+
// so we don't double an already-total figure and falsely "fit" a model
|
|
249
|
+
// (e.g. a 2x24=48GB box must stay 48GB, not become 96GB).
|
|
250
|
+
if (!explicitTotalVRAM && gpuCount > 1 && vramPerGPU) {
|
|
251
|
+
vramGB = vramPerGPU * gpuCount;
|
|
253
252
|
}
|
|
254
253
|
|
|
255
254
|
let gpuType = gpu.type;
|
|
@@ -1152,6 +1151,17 @@ class DeterministicModelSelector {
|
|
|
1152
1151
|
return explicitParams;
|
|
1153
1152
|
}
|
|
1154
1153
|
|
|
1154
|
+
// Use the variant's OWN artifact size to DISAMBIGUATE the model-level size
|
|
1155
|
+
// list. A size-unknown variant (e.g. `:latest`) must not blindly inherit
|
|
1156
|
+
// model_sizes[0]: for qwen3 (model_sizes ["30b","235b"]) that mislabeled a
|
|
1157
|
+
// small qwen3:latest as 30B and poisoned the real qwen3:30b size map, making
|
|
1158
|
+
// a 19GB model falsely "fit" a 16GB machine.
|
|
1159
|
+
const artifactSizeGB = this.extractVariantSizeGB(variant, null);
|
|
1160
|
+
const artifactParamsB =
|
|
1161
|
+
(!this.isCloudVariantTag(variant.tag) && Number.isFinite(artifactSizeGB) && artifactSizeGB > 0)
|
|
1162
|
+
? this.inferParamsFromArtifactSizeGB(artifactSizeGB, quant)
|
|
1163
|
+
: null;
|
|
1164
|
+
|
|
1155
1165
|
const metadataCandidates = this.extractParameterCandidates(
|
|
1156
1166
|
ollamaModel.model_sizes,
|
|
1157
1167
|
ollamaModel.parameters,
|
|
@@ -1159,12 +1169,23 @@ class DeterministicModelSelector {
|
|
|
1159
1169
|
ollamaModel.parameter_count
|
|
1160
1170
|
);
|
|
1161
1171
|
if (metadataCandidates.length > 0) {
|
|
1172
|
+
if (Number.isFinite(artifactParamsB) && artifactParamsB > 0) {
|
|
1173
|
+
// Pick the listed size CLOSEST to what this variant's own artifact
|
|
1174
|
+
// implies; if even the closest is far off, trust the artifact size.
|
|
1175
|
+
let closest = metadataCandidates[0];
|
|
1176
|
+
let bestDiff = Math.abs(closest - artifactParamsB);
|
|
1177
|
+
for (const cand of metadataCandidates) {
|
|
1178
|
+
const diff = Math.abs(cand - artifactParamsB);
|
|
1179
|
+
if (diff < bestDiff) { bestDiff = diff; closest = cand; }
|
|
1180
|
+
}
|
|
1181
|
+
const tolerance = Math.max(2, closest * 0.5);
|
|
1182
|
+
return bestDiff <= tolerance ? closest : artifactParamsB;
|
|
1183
|
+
}
|
|
1162
1184
|
return metadataCandidates[0];
|
|
1163
1185
|
}
|
|
1164
1186
|
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
return this.inferParamsFromArtifactSizeGB(artifactSizeGB, quant);
|
|
1187
|
+
if (Number.isFinite(artifactParamsB) && artifactParamsB > 0) {
|
|
1188
|
+
return artifactParamsB;
|
|
1168
1189
|
}
|
|
1169
1190
|
|
|
1170
1191
|
const modelArtifactSizeGB = this.extractArtifactSizeGBFromValue(ollamaModel.main_size);
|
|
@@ -1512,28 +1533,35 @@ class DeterministicModelSelector {
|
|
|
1512
1533
|
return false;
|
|
1513
1534
|
}
|
|
1514
1535
|
|
|
1536
|
+
// Guard against malformed external pool rows (a missing tags/modalities
|
|
1537
|
+
// /name field used to throw and silently nuke the whole category).
|
|
1538
|
+
const tags = Array.isArray(model.tags) ? model.tags : [];
|
|
1539
|
+
const modalities = Array.isArray(model.modalities) ? model.modalities : [];
|
|
1540
|
+
const name = String(model.name || model.model_identifier || '').toLowerCase();
|
|
1541
|
+
const paramsB = Number(model.paramsB) || 0;
|
|
1542
|
+
|
|
1515
1543
|
switch (category) {
|
|
1516
1544
|
case 'coding':
|
|
1517
|
-
return
|
|
1518
|
-
|
|
1519
|
-
|
|
1545
|
+
return tags.some(tag => ['coder', 'code', 'instruct'].includes(tag)) ||
|
|
1546
|
+
name.includes('code');
|
|
1547
|
+
|
|
1520
1548
|
case 'multimodal':
|
|
1521
|
-
return
|
|
1522
|
-
|
|
1523
|
-
|
|
1549
|
+
return modalities.includes('vision') ||
|
|
1550
|
+
tags.includes('vision');
|
|
1551
|
+
|
|
1524
1552
|
case 'embeddings':
|
|
1525
|
-
return
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1553
|
+
return tags.includes('embedding') ||
|
|
1554
|
+
tags.includes('embeddings') ||
|
|
1555
|
+
name.includes('embed') ||
|
|
1556
|
+
name.includes('bge-') ||
|
|
1557
|
+
name.includes('nomic-embed') ||
|
|
1558
|
+
name.includes('all-minilm') ||
|
|
1531
1559
|
model.specialization === 'embeddings';
|
|
1532
|
-
|
|
1560
|
+
|
|
1533
1561
|
case 'reasoning':
|
|
1534
|
-
return
|
|
1535
|
-
|
|
1536
|
-
|
|
1562
|
+
return tags.includes('instruct') ||
|
|
1563
|
+
paramsB >= 7; // Prefer larger models for reasoning
|
|
1564
|
+
|
|
1537
1565
|
default: // general, reading, summarization
|
|
1538
1566
|
return true; // Most models can handle these
|
|
1539
1567
|
}
|
|
@@ -1711,15 +1739,19 @@ class DeterministicModelSelector {
|
|
|
1711
1739
|
: (Number.isFinite(directVariantMatch) && directVariantMatch > 0 ? directVariantMatch : null);
|
|
1712
1740
|
|
|
1713
1741
|
const parameterProfile = this.resolveMemoryParameterProfile(model);
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
const
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1742
|
+
// Weight memory must account for ALL resident parameters. For MoE under
|
|
1743
|
+
// Ollama / Metal / vLLM every expert is resident, so size the weights by
|
|
1744
|
+
// the TOTAL parameter count (not the active count). Active params drive
|
|
1745
|
+
// speed and KV-cache only. Sizing weights by active params used to make a
|
|
1746
|
+
// 236B MoE look like ~14GB and falsely "fit" small hardware.
|
|
1747
|
+
const weightParamsB =
|
|
1748
|
+
parameterProfile.isMoE && Number.isFinite(parameterProfile.totalParamsB) && parameterProfile.totalParamsB > 0
|
|
1749
|
+
? parameterProfile.totalParamsB
|
|
1750
|
+
: parameterProfile.effectiveParamsB;
|
|
1751
|
+
const modeledWeightGB = weightParamsB * bpp;
|
|
1752
|
+
// A real observed artifact size always wins for weight memory — never let
|
|
1753
|
+
// an MoE "sparse inference" assumption discard a measured on-disk size.
|
|
1754
|
+
const useObservedArtifactSize = Number.isFinite(observedWeightGB) && observedWeightGB > 0;
|
|
1723
1755
|
const modelMemGB = useObservedArtifactSize ? observedWeightGB : modeledWeightGB;
|
|
1724
1756
|
const effectiveCtx = Number.isFinite(Number(ctx)) && Number(ctx) > 0 ? Number(ctx) : 4096;
|
|
1725
1757
|
|
|
@@ -1729,9 +1761,10 @@ class DeterministicModelSelector {
|
|
|
1729
1761
|
|
|
1730
1762
|
// Runtime overhead (Metal/CUDA context, buffers)
|
|
1731
1763
|
const runtimeOverhead = useObservedArtifactSize ? 0.35 : 0.5;
|
|
1764
|
+
const usedMoeTotal = parameterProfile.isMoE && weightParamsB === parameterProfile.totalParamsB;
|
|
1732
1765
|
const memorySource = useObservedArtifactSize
|
|
1733
1766
|
? 'observed_artifact_size'
|
|
1734
|
-
: (
|
|
1767
|
+
: (usedMoeTotal ? 'moe_total_params' : 'estimated_from_params');
|
|
1735
1768
|
|
|
1736
1769
|
return {
|
|
1737
1770
|
parameterProfile,
|
|
@@ -2342,6 +2375,9 @@ class DeterministicModelSelector {
|
|
|
2342
2375
|
estimatedRAM: candidate.requiredGB,
|
|
2343
2376
|
reasoning: candidate.rationale,
|
|
2344
2377
|
runtime: candidate.runtime || candidate.speed?.runtime || 'ollama',
|
|
2378
|
+
installCommand: candidate.meta.installCommand || provenance.install_command || '',
|
|
2379
|
+
downloadUrl: candidate.meta.downloadUrl || provenance.download_url || '',
|
|
2380
|
+
artifactFormat: candidate.meta.artifact?.format || '',
|
|
2345
2381
|
memoryAssumptionSource: candidate.memory?.assumptionSource || 'dense_params',
|
|
2346
2382
|
speedAssumptions: candidate.speed?.moe ? {
|
|
2347
2383
|
applied: Boolean(candidate.speed.moe.applied),
|
|
@@ -2523,19 +2559,24 @@ class DeterministicModelSelector {
|
|
|
2523
2559
|
Object.entries(recommendations).forEach(([category, data]) => {
|
|
2524
2560
|
const bestModel = data.bestModels[0];
|
|
2525
2561
|
if (bestModel) {
|
|
2562
|
+
const command = bestModel.installCommand ||
|
|
2563
|
+
bestModel.provenance?.install_command ||
|
|
2564
|
+
`ollama pull ${bestModel.model_identifier}`;
|
|
2526
2565
|
summary.by_category[category] = {
|
|
2527
2566
|
name: bestModel.model_name || bestModel.name,
|
|
2528
2567
|
identifier: bestModel.model_identifier,
|
|
2529
2568
|
score: Math.round(bestModel.categoryScore || bestModel.score),
|
|
2530
|
-
command
|
|
2569
|
+
command,
|
|
2531
2570
|
size: this.formatModelSize(bestModel),
|
|
2532
2571
|
quantization: bestModel.quantization || bestModel.quant || 'Q4_K_M',
|
|
2572
|
+
runtime: bestModel.runtime || bestModel.provenance?.runtime || 'ollama',
|
|
2533
2573
|
pulls: bestModel.pulls || 0,
|
|
2534
2574
|
source: bestModel.source || bestModel.provenance?.source || 'unknown',
|
|
2535
2575
|
registry: bestModel.registry || bestModel.provenance?.registry || 'unknown',
|
|
2536
2576
|
version: bestModel.version || bestModel.provenance?.version || 'unknown',
|
|
2537
2577
|
license: bestModel.license || bestModel.provenance?.license || 'unknown',
|
|
2538
2578
|
digest: bestModel.digest || bestModel.provenance?.digest || 'unknown',
|
|
2579
|
+
download_url: bestModel.downloadUrl || bestModel.provenance?.download_url || '',
|
|
2539
2580
|
provenance: bestModel.provenance || {
|
|
2540
2581
|
source: bestModel.source || 'unknown',
|
|
2541
2582
|
registry: bestModel.registry || 'unknown',
|
|
@@ -2545,7 +2586,7 @@ class DeterministicModelSelector {
|
|
|
2545
2586
|
}
|
|
2546
2587
|
};
|
|
2547
2588
|
|
|
2548
|
-
summary.quick_commands.push(
|
|
2589
|
+
summary.quick_commands.push(command);
|
|
2549
2590
|
|
|
2550
2591
|
const isGeneralCategory = ['general', 'coding', 'talking', 'reading'].includes(category);
|
|
2551
2592
|
const score = bestModel.categoryScore || bestModel.score || 0;
|
|
@@ -2559,18 +2600,23 @@ class DeterministicModelSelector {
|
|
|
2559
2600
|
});
|
|
2560
2601
|
|
|
2561
2602
|
if (bestOverallModel) {
|
|
2603
|
+
const command = bestOverallModel.installCommand ||
|
|
2604
|
+
bestOverallModel.provenance?.install_command ||
|
|
2605
|
+
`ollama pull ${bestOverallModel.model_identifier}`;
|
|
2562
2606
|
summary.best_overall = {
|
|
2563
2607
|
name: bestOverallModel.model_name || bestOverallModel.name,
|
|
2564
2608
|
identifier: bestOverallModel.model_identifier,
|
|
2565
2609
|
category: bestOverallCategory,
|
|
2566
2610
|
score: Math.round(bestOverallScore),
|
|
2567
|
-
command
|
|
2611
|
+
command,
|
|
2568
2612
|
quantization: bestOverallModel.quantization || bestOverallModel.quant || 'Q4_K_M',
|
|
2613
|
+
runtime: bestOverallModel.runtime || bestOverallModel.provenance?.runtime || 'ollama',
|
|
2569
2614
|
source: bestOverallModel.source || bestOverallModel.provenance?.source || 'unknown',
|
|
2570
2615
|
registry: bestOverallModel.registry || bestOverallModel.provenance?.registry || 'unknown',
|
|
2571
2616
|
version: bestOverallModel.version || bestOverallModel.provenance?.version || 'unknown',
|
|
2572
2617
|
license: bestOverallModel.license || bestOverallModel.provenance?.license || 'unknown',
|
|
2573
2618
|
digest: bestOverallModel.digest || bestOverallModel.provenance?.digest || 'unknown',
|
|
2619
|
+
download_url: bestOverallModel.downloadUrl || bestOverallModel.provenance?.download_url || '',
|
|
2574
2620
|
provenance: bestOverallModel.provenance || {
|
|
2575
2621
|
source: bestOverallModel.source || 'unknown',
|
|
2576
2622
|
registry: bestOverallModel.registry || 'unknown',
|
|
@@ -24,6 +24,14 @@ const MOE_RUNTIME_PROFILES = Object.freeze({
|
|
|
24
24
|
maxEffectiveGain: 2.65,
|
|
25
25
|
notes: ['optimized scheduler', 'better expert batching', 'lower offload pressure']
|
|
26
26
|
}),
|
|
27
|
+
transformers: Object.freeze({
|
|
28
|
+
runtime: 'transformers',
|
|
29
|
+
routingOverhead: 0.15,
|
|
30
|
+
communicationOverhead: 0.10,
|
|
31
|
+
offloadOverhead: 0.06,
|
|
32
|
+
maxEffectiveGain: 2.45,
|
|
33
|
+
notes: ['general Hugging Face path', 'broad architecture support', 'higher Python overhead than vLLM']
|
|
34
|
+
}),
|
|
27
35
|
mlx: Object.freeze({
|
|
28
36
|
runtime: 'mlx',
|
|
29
37
|
routingOverhead: 0.16,
|
|
@@ -45,6 +53,9 @@ const MOE_RUNTIME_PROFILES = Object.freeze({
|
|
|
45
53
|
const RUNTIME_ALIASES = Object.freeze({
|
|
46
54
|
ollama: 'ollama',
|
|
47
55
|
vllm: 'vllm',
|
|
56
|
+
transformers: 'transformers',
|
|
57
|
+
'huggingface-transformers': 'transformers',
|
|
58
|
+
hf: 'transformers',
|
|
48
59
|
mlx: 'mlx',
|
|
49
60
|
'mlx-lm': 'mlx',
|
|
50
61
|
mlx_lm: 'mlx',
|