npm - llm-checker - Versions diffs - 3.6.1 → 3.7.4 - Mend

llm-checker 3.6.1 → 3.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/README.md +45 -8
package/bin/enhanced_cli.js +407 -5
package/bin/mcp-server.mjs +5 -0
package/package.json +7 -2
package/src/data/model-database.js +452 -0
package/src/data/registry-ingestors.js +765 -0
package/src/data/registry-recommender.js +632 -0
package/src/data/seed/README.md +11 -3
package/src/data/seed/models.db +0 -0
package/src/index.js +68 -4
package/src/models/deterministic-selector.js +85 -39
package/src/models/moe-assumptions.js +11 -0

package/src/index.js CHANGED Viewed

@@ -20,6 +20,17 @@ const {
 } = require('./provenance/model-provenance');
 const { normalizePlatform } = require('./utils/platform');
+function normalizeRecommendationRuntime(runtime = 'auto') {
+    const normalized = String(runtime || 'auto').trim().toLowerCase();
+    if (['auto', 'all', '*'].includes(normalized)) return 'auto';
+    if (['ollama', 'vllm', 'mlx', 'llama.cpp', 'llamacpp', 'llama_cpp', 'transformers', 'hf'].includes(normalized)) {
+        if (normalized === 'llamacpp' || normalized === 'llama_cpp') return 'llama.cpp';
+        if (normalized === 'hf') return 'transformers';
+        return normalized;
+    }
+    return normalizeRuntime(normalized);
+}
 class LLMChecker {
     constructor(options = {}) {
         this.hardwareDetector = new HardwareDetector();
@@ -2467,7 +2478,59 @@ class LLMChecker {
     async generateIntelligentRecommendations(hardware, options = {}) {
         try {
             this.logger.info('Generating intelligent recommendations...');
-            const selectedRuntime = normalizeRuntime(options.runtime || 'ollama');
+            const selectedRuntime = normalizeRecommendationRuntime(options.runtime || 'auto');
+            const optimizeFor = options.optimizeFor || options.optimize || 'balanced';
+            if (options.registry !== false) {
+                let registryRecommender = null;
+                try {
+                    const { RegistryRecommender } = require('./data/registry-recommender');
+                    registryRecommender = new RegistryRecommender();
+                    await registryRecommender.initialize();
+                    const registryResult = await registryRecommender.getBestModelsForHardware(hardware, {
+                        runtime: selectedRuntime,
+                        optimizeFor,
+                        limit: 3,
+                        poolLimit: options.poolLimit || 20000,
+                        localOnly: options.includeGated ? false : true
+                    });
+                    const recommendations = registryResult.recommendations;
+                    const hasRegistryRecommendations = Object.values(recommendations)
+                        .some((group) => Array.isArray(group.bestModels) && group.bestModels.length > 0);
+                    if (hasRegistryRecommendations) {
+                        const summary = this.intelligentRecommender.generateRecommendationSummary(
+                            recommendations,
+                            hardware,
+                            { optimizeFor }
+                        );
+                        const totalModelsAnalyzed = Number(registryResult.totalModelsAnalyzed) || Object.values(recommendations)
+                            .reduce((sum, group) => sum + (Number(group.totalCandidates) || Number(group.totalEvaluated) || 0), 0);
+                        this.logger.info(`Generated registry recommendations for ${Object.keys(recommendations).length} categories`);
+                        return {
+                            recommendations,
+                            summary,
+                            optimizeFor: summary.optimize_for || optimizeFor,
+                            runtime: selectedRuntime,
+                            recommendationSource: 'registry',
+                            registryStats: registryResult.registryStats,
+                            totalModelsAnalyzed,
+                            generatedAt: new Date().toISOString()
+                        };
+                    }
+                    this.logger.warn('Registry recommendations were empty, falling back to Ollama catalog');
+                } catch (error) {
+                    this.logger.warn('Registry recommendations unavailable, falling back to Ollama catalog', { error: error.message });
+                } finally {
+                    if (registryRecommender) {
+                        registryRecommender.close();
+                    }
+                }
+            }
             // Prefer the synced SQLite catalog so `llm-checker sync` updates recommendations immediately.
             const ollamaData = await this.loadOllamaModelData();
@@ -2479,11 +2542,11 @@ class LLMChecker {
             }
             // Generar recomendaciones inteligentes
-            const optimizeFor = options.optimizeFor || options.optimize || 'balanced';
+            const fallbackRuntime = selectedRuntime === 'auto' ? 'ollama' : selectedRuntime;
             const recommendations = await this.intelligentRecommender.getBestModelsForHardware(
                 hardware,
                 allModels,
-                { optimizeFor, runtime: selectedRuntime }
+                { optimizeFor, runtime: fallbackRuntime }
             );
             const summary = this.intelligentRecommender.generateRecommendationSummary(
                 recommendations,
@@ -2497,7 +2560,8 @@ class LLMChecker {
                 recommendations,
                 summary,
                 optimizeFor: summary.optimize_for || optimizeFor,
-                runtime: selectedRuntime,
+                runtime: fallbackRuntime,
+                recommendationSource: 'ollama_catalog',
                 totalModelsAnalyzed: allModels.length,
                 generatedAt: new Date().toISOString()
             };

package/src/models/deterministic-selector.js CHANGED Viewed

@@ -243,13 +243,12 @@ class DeterministicModelSelector {
             directVRAM ??
             0;
-        // Multi-GPU fallback when only per-GPU memory is known.
-        if (!explicitTotalVRAM && gpuCount > 1) {
-            if (vramPerGPU) {
-                vramGB = vramPerGPU * gpuCount;
-            } else if (directVRAM && Boolean(gpu.isMultiGPU || input.isMultiGPU)) {
-                vramGB = Math.max(directVRAM, directVRAM * gpuCount);
-            }
+        // Multi-GPU: only scale up when memory is known to be PER-GPU (vramPerGPU).
+        // A bare `vram`/`vramGB` is treated as the box total and never multiplied,
+        // so we don't double an already-total figure and falsely "fit" a model
+        // (e.g. a 2x24=48GB box must stay 48GB, not become 96GB).
+        if (!explicitTotalVRAM && gpuCount > 1 && vramPerGPU) {
+            vramGB = vramPerGPU * gpuCount;
         }
         let gpuType = gpu.type;
@@ -1152,6 +1151,17 @@ class DeterministicModelSelector {
             return explicitParams;
         }
+        // Use the variant's OWN artifact size to DISAMBIGUATE the model-level size
+        // list. A size-unknown variant (e.g. `:latest`) must not blindly inherit
+        // model_sizes[0]: for qwen3 (model_sizes ["30b","235b"]) that mislabeled a
+        // small qwen3:latest as 30B and poisoned the real qwen3:30b size map, making
+        // a 19GB model falsely "fit" a 16GB machine.
+        const artifactSizeGB = this.extractVariantSizeGB(variant, null);
+        const artifactParamsB =
+            (!this.isCloudVariantTag(variant.tag) && Number.isFinite(artifactSizeGB) && artifactSizeGB > 0)
+                ? this.inferParamsFromArtifactSizeGB(artifactSizeGB, quant)
+                : null;
         const metadataCandidates = this.extractParameterCandidates(
             ollamaModel.model_sizes,
             ollamaModel.parameters,
@@ -1159,12 +1169,23 @@ class DeterministicModelSelector {
             ollamaModel.parameter_count
         );
         if (metadataCandidates.length > 0) {
+            if (Number.isFinite(artifactParamsB) && artifactParamsB > 0) {
+                // Pick the listed size CLOSEST to what this variant's own artifact
+                // implies; if even the closest is far off, trust the artifact size.
+                let closest = metadataCandidates[0];
+                let bestDiff = Math.abs(closest - artifactParamsB);
+                for (const cand of metadataCandidates) {
+                    const diff = Math.abs(cand - artifactParamsB);
+                    if (diff < bestDiff) { bestDiff = diff; closest = cand; }
+                }
+                const tolerance = Math.max(2, closest * 0.5);
+                return bestDiff <= tolerance ? closest : artifactParamsB;
+            }
             return metadataCandidates[0];
         }
-        const artifactSizeGB = this.extractVariantSizeGB(variant, null);
-        if (!this.isCloudVariantTag(variant.tag) && Number.isFinite(artifactSizeGB) && artifactSizeGB > 0) {
-            return this.inferParamsFromArtifactSizeGB(artifactSizeGB, quant);
+        if (Number.isFinite(artifactParamsB) && artifactParamsB > 0) {
+            return artifactParamsB;
         }
         const modelArtifactSizeGB = this.extractArtifactSizeGBFromValue(ollamaModel.main_size);
@@ -1512,28 +1533,35 @@ class DeterministicModelSelector {
                 return false;
             }
+            // Guard against malformed external pool rows (a missing tags/modalities
+            // /name field used to throw and silently nuke the whole category).
+            const tags = Array.isArray(model.tags) ? model.tags : [];
+            const modalities = Array.isArray(model.modalities) ? model.modalities : [];
+            const name = String(model.name || model.model_identifier || '').toLowerCase();
+            const paramsB = Number(model.paramsB) || 0;
             switch (category) {
                 case 'coding':
-                    return model.tags.some(tag => ['coder', 'code', 'instruct'].includes(tag)) ||
-                           model.name.toLowerCase().includes('code');
+                    return tags.some(tag => ['coder', 'code', 'instruct'].includes(tag)) ||
+                           name.includes('code');
                 case 'multimodal':
-                    return model.modalities.includes('vision') ||
-                           model.tags.includes('vision');
+                    return modalities.includes('vision') ||
+                           tags.includes('vision');
                 case 'embeddings':
-                    return model.tags.includes('embedding') ||
-                           model.tags.includes('embeddings') ||
-                           model.name.toLowerCase().includes('embed') ||
-                           model.name.toLowerCase().includes('bge-') ||
-                           model.name.toLowerCase().includes('nomic-embed') ||
-                           model.name.toLowerCase().includes('all-minilm') ||
+                    return tags.includes('embedding') ||
+                           tags.includes('embeddings') ||
+                           name.includes('embed') ||
+                           name.includes('bge-') ||
+                           name.includes('nomic-embed') ||
+                           name.includes('all-minilm') ||
                            model.specialization === 'embeddings';
                 case 'reasoning':
-                    return model.tags.includes('instruct') ||
-                           model.paramsB >= 7; // Prefer larger models for reasoning
+                    return tags.includes('instruct') ||
+                           paramsB >= 7; // Prefer larger models for reasoning
                 default: // general, reading, summarization
                     return true; // Most models can handle these
             }
@@ -1711,15 +1739,19 @@ class DeterministicModelSelector {
             : (Number.isFinite(directVariantMatch) && directVariantMatch > 0 ? directVariantMatch : null);
         const parameterProfile = this.resolveMemoryParameterProfile(model);
-        const modeledWeightGB = parameterProfile.effectiveParamsB * bpp;
-        const preferSparseInferenceParams =
-            parameterProfile.isMoE &&
-            (parameterProfile.assumptionSource === 'moe_active_metadata' ||
-                parameterProfile.assumptionSource === 'moe_derived_expert_ratio');
-        const useObservedArtifactSize =
-            !preferSparseInferenceParams &&
-            Number.isFinite(observedWeightGB) &&
-            observedWeightGB > 0;
+        // Weight memory must account for ALL resident parameters. For MoE under
+        // Ollama / Metal / vLLM every expert is resident, so size the weights by
+        // the TOTAL parameter count (not the active count). Active params drive
+        // speed and KV-cache only. Sizing weights by active params used to make a
+        // 236B MoE look like ~14GB and falsely "fit" small hardware.
+        const weightParamsB =
+            parameterProfile.isMoE && Number.isFinite(parameterProfile.totalParamsB) && parameterProfile.totalParamsB > 0
+                ? parameterProfile.totalParamsB
+                : parameterProfile.effectiveParamsB;
+        const modeledWeightGB = weightParamsB * bpp;
+        // A real observed artifact size always wins for weight memory — never let
+        // an MoE "sparse inference" assumption discard a measured on-disk size.
+        const useObservedArtifactSize = Number.isFinite(observedWeightGB) && observedWeightGB > 0;
         const modelMemGB = useObservedArtifactSize ? observedWeightGB : modeledWeightGB;
         const effectiveCtx = Number.isFinite(Number(ctx)) && Number(ctx) > 0 ? Number(ctx) : 4096;
@@ -1729,9 +1761,10 @@ class DeterministicModelSelector {
         // Runtime overhead (Metal/CUDA context, buffers)
         const runtimeOverhead = useObservedArtifactSize ? 0.35 : 0.5;
+        const usedMoeTotal = parameterProfile.isMoE && weightParamsB === parameterProfile.totalParamsB;
         const memorySource = useObservedArtifactSize
             ? 'observed_artifact_size'
-            : (preferSparseInferenceParams ? 'moe_sparse_inference_params' : 'estimated_from_params');
+            : (usedMoeTotal ? 'moe_total_params' : 'estimated_from_params');
         return {
             parameterProfile,
@@ -2342,6 +2375,9 @@ class DeterministicModelSelector {
             estimatedRAM: candidate.requiredGB,
             reasoning: candidate.rationale,
             runtime: candidate.runtime || candidate.speed?.runtime || 'ollama',
+            installCommand: candidate.meta.installCommand || provenance.install_command || '',
+            downloadUrl: candidate.meta.downloadUrl || provenance.download_url || '',
+            artifactFormat: candidate.meta.artifact?.format || '',
             memoryAssumptionSource: candidate.memory?.assumptionSource || 'dense_params',
             speedAssumptions: candidate.speed?.moe ? {
                 applied: Boolean(candidate.speed.moe.applied),
@@ -2523,19 +2559,24 @@ class DeterministicModelSelector {
         Object.entries(recommendations).forEach(([category, data]) => {
             const bestModel = data.bestModels[0];
             if (bestModel) {
+                const command = bestModel.installCommand ||
+                    bestModel.provenance?.install_command ||
+                    `ollama pull ${bestModel.model_identifier}`;
                 summary.by_category[category] = {
                     name: bestModel.model_name || bestModel.name,
                     identifier: bestModel.model_identifier,
                     score: Math.round(bestModel.categoryScore || bestModel.score),
-                    command: `ollama pull ${bestModel.model_identifier}`,
+                    command,
                     size: this.formatModelSize(bestModel),
                     quantization: bestModel.quantization || bestModel.quant || 'Q4_K_M',
+                    runtime: bestModel.runtime || bestModel.provenance?.runtime || 'ollama',
                     pulls: bestModel.pulls || 0,
                     source: bestModel.source || bestModel.provenance?.source || 'unknown',
                     registry: bestModel.registry || bestModel.provenance?.registry || 'unknown',
                     version: bestModel.version || bestModel.provenance?.version || 'unknown',
                     license: bestModel.license || bestModel.provenance?.license || 'unknown',
                     digest: bestModel.digest || bestModel.provenance?.digest || 'unknown',
+                    download_url: bestModel.downloadUrl || bestModel.provenance?.download_url || '',
                     provenance: bestModel.provenance || {
                         source: bestModel.source || 'unknown',
                         registry: bestModel.registry || 'unknown',
@@ -2545,7 +2586,7 @@ class DeterministicModelSelector {
                     }
                 };
-                summary.quick_commands.push(`ollama pull ${bestModel.model_identifier}`);
+                summary.quick_commands.push(command);
                 const isGeneralCategory = ['general', 'coding', 'talking', 'reading'].includes(category);
                 const score = bestModel.categoryScore || bestModel.score || 0;
@@ -2559,18 +2600,23 @@ class DeterministicModelSelector {
         });
         if (bestOverallModel) {
+            const command = bestOverallModel.installCommand ||
+                bestOverallModel.provenance?.install_command ||
+                `ollama pull ${bestOverallModel.model_identifier}`;
             summary.best_overall = {
                 name: bestOverallModel.model_name || bestOverallModel.name,
                 identifier: bestOverallModel.model_identifier,
                 category: bestOverallCategory,
                 score: Math.round(bestOverallScore),
-                command: `ollama pull ${bestOverallModel.model_identifier}`,
+                command,
                 quantization: bestOverallModel.quantization || bestOverallModel.quant || 'Q4_K_M',
+                runtime: bestOverallModel.runtime || bestOverallModel.provenance?.runtime || 'ollama',
                 source: bestOverallModel.source || bestOverallModel.provenance?.source || 'unknown',
                 registry: bestOverallModel.registry || bestOverallModel.provenance?.registry || 'unknown',
                 version: bestOverallModel.version || bestOverallModel.provenance?.version || 'unknown',
                 license: bestOverallModel.license || bestOverallModel.provenance?.license || 'unknown',
                 digest: bestOverallModel.digest || bestOverallModel.provenance?.digest || 'unknown',
+                download_url: bestOverallModel.downloadUrl || bestOverallModel.provenance?.download_url || '',
                 provenance: bestOverallModel.provenance || {
                     source: bestOverallModel.source || 'unknown',
                     registry: bestOverallModel.registry || 'unknown',

package/src/models/moe-assumptions.js CHANGED Viewed

@@ -24,6 +24,14 @@ const MOE_RUNTIME_PROFILES = Object.freeze({
         maxEffectiveGain: 2.65,
         notes: ['optimized scheduler', 'better expert batching', 'lower offload pressure']
     }),
+    transformers: Object.freeze({
+        runtime: 'transformers',
+        routingOverhead: 0.15,
+        communicationOverhead: 0.10,
+        offloadOverhead: 0.06,
+        maxEffectiveGain: 2.45,
+        notes: ['general Hugging Face path', 'broad architecture support', 'higher Python overhead than vLLM']
+    }),
     mlx: Object.freeze({
         runtime: 'mlx',
         routingOverhead: 0.16,
@@ -45,6 +53,9 @@ const MOE_RUNTIME_PROFILES = Object.freeze({
 const RUNTIME_ALIASES = Object.freeze({
     ollama: 'ollama',
     vllm: 'vllm',
+    transformers: 'transformers',
+    'huggingface-transformers': 'transformers',
+    hf: 'transformers',
     mlx: 'mlx',
     'mlx-lm': 'mlx',
     mlx_lm: 'mlx',