npm - llm-checker - Versions diffs - 3.2.5 → 3.2.7 - Mend

llm-checker 3.2.5 → 3.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/LICENSE +27 -9
package/README.md +72 -8
package/bin/enhanced_cli.js +13 -2
package/package.json +2 -2
package/src/hardware/backends/rocm-detector.js +20 -1
package/src/hardware/detector.js +75 -10
package/src/hardware/unified-detector.js +49 -10
package/src/index.js +19 -4
package/src/models/deterministic-selector.js +712 -38
package/src/models/intelligent-selector.js +2 -0
package/src/models/moe-assumptions.js +311 -0
package/src/models/scoring-engine.js +38 -13

package/src/models/deterministic-selector.js CHANGED Viewed

@@ -10,6 +10,14 @@ const path = require('path');
 const os = require('os');
 const { spawn } = require('child_process');
 const { DETERMINISTIC_WEIGHTS } = require('./scoring-config');
+const {
+    parseBillionsValue: parseMoEBillionsValue,
+    parsePositiveNumber: parseMoEPositiveNumber,
+    normalizeMoERuntime,
+    extractMoEMetadata: extractCanonicalMoEMetadata,
+    resolveMoEParameterProfile,
+    estimateMoESpeedMultiplier
+} = require('./moe-assumptions');
 class DeterministicModelSelector {
     constructor() {
@@ -33,6 +41,7 @@ class DeterministicModelSelector {
         // Family quality bumps
         this.familyBumps = {
             'qwen2.5': 2,
+            'qwen3': 4,
             'deepseek': 3,
             'mistral': 1,
             'llama3.1': 1,
@@ -101,6 +110,28 @@ class DeterministicModelSelector {
         // Category scoring weights [Q, S, F, C] from centralized config
         this.categoryWeights = DETERMINISTIC_WEIGHTS;
+        // User optimization profile overrides [Q, S, F, C]
+        this.optimizationProfiles = {
+            balanced: null,
+            speed: [0.25, 0.55, 0.15, 0.05],
+            quality: [0.65, 0.10, 0.15, 0.10],
+            context: [0.30, 0.10, 0.20, 0.40],
+            coding: [0.55, 0.25, 0.10, 0.10]
+        };
+        this.freshnessThresholds = {
+            staleDays: 365,
+            veryStaleDays: 730,
+            indexCadenceDays: 14
+        };
+        this.modelIndexStatus = {
+            source: 'unknown',
+            ageDays: null,
+            isStale: false,
+            cachedAt: null
+        };
     }
     // ============================================================================
@@ -148,6 +179,7 @@ class DeterministicModelSelector {
         const gpu = input.gpu || {};
         const memory = input.memory || {};
         const acceleration = input.acceleration || {};
+        const gpuEntries = Array.isArray(gpu.all) ? gpu.all : [];
         const totalMemGB =
             toNumber(memory.totalGB) ??
@@ -156,21 +188,62 @@ class DeterministicModelSelector {
             toNumber(input.memoryGB) ??
             8;
+        const modelHints = `${gpu.model || ''} ${gpu.vendor || ''} ${gpu.type || ''}`.toLowerCase();
+        const inferredUnified =
+            Boolean(gpu.unified) ||
+            /apple|m1|m2|m3|m4|unified/.test(modelHints);
+        const utilizationFactor = inferredUnified ? 0.85 : 0.8;
+        const memoryHeadroomGB = inferredUnified ? 1.5 : 2;
         const usableMemGB =
             toNumber(input.usableMemGB) ??
-            Math.max(1, Math.min(0.8 * totalMemGB, totalMemGB - 2));
+            Math.max(1, Math.min(utilizationFactor * totalMemGB, totalMemGB - memoryHeadroomGB));
+        const gpuCount =
+            toNumber(gpu.gpuCount) ??
+            toNumber(gpu.count) ??
+            (gpuEntries.length > 0 ? gpuEntries.length : null) ??
+            toNumber(input.gpuCount) ??
+            1;
+        const vramPerGPU =
+            toNumber(gpu.vramPerGPU) ??
+            toNumber(input.vramPerGPU) ??
+            null;
+        const summedEntryVRAM = gpuEntries.reduce((sum, entry) => {
+            return sum + (
+                toNumber(entry?.vramGB) ??
+                toNumber(entry?.vram) ??
+                toNumber(entry?.totalVRAM) ??
+                0
+            );
+        }, 0);
-        const vramGB =
+        const explicitTotalVRAM =
+            toNumber(gpu.totalVRAM) ??
+            toNumber(input.totalVRAM) ??
+            toNumber(input.gpuTotalVRAM) ??
+            (summedEntryVRAM > 0 ? summedEntryVRAM : null);
+        const directVRAM =
             toNumber(gpu.vramGB) ??
             toNumber(gpu.vram) ??
-            toNumber(gpu.totalVRAM) ??
-            toNumber(gpu.vramPerGPU) ??
+            null;
+        let vramGB =
+            explicitTotalVRAM ??
+            directVRAM ??
             0;
-        const modelHints = `${gpu.model || ''} ${gpu.vendor || ''} ${gpu.type || ''}`.toLowerCase();
-        const inferredUnified =
-            Boolean(gpu.unified) ||
-            /apple|m1|m2|m3|m4|unified/.test(modelHints);
+        // Multi-GPU fallback when only per-GPU memory is known.
+        if (!explicitTotalVRAM && gpuCount > 1) {
+            if (vramPerGPU) {
+                vramGB = vramPerGPU * gpuCount;
+            } else if (directVRAM && Boolean(gpu.isMultiGPU || input.isMultiGPU)) {
+                vramGB = Math.max(directVRAM, directVRAM * gpuCount);
+            }
+        }
         let gpuType = gpu.type;
         if (!gpuType) {
@@ -206,6 +279,9 @@ class DeterministicModelSelector {
                 ...gpu,
                 type: gpuType,
                 vramGB,
+                vramPerGPU: vramPerGPU ?? (gpuCount > 0 ? (vramGB > 0 ? vramGB / gpuCount : 0) : 0),
+                gpuCount,
+                isMultiGPU: Boolean(gpu.isMultiGPU || gpuCount > 1),
                 unified: inferredUnified
             },
             memory: {
@@ -217,6 +293,43 @@ class DeterministicModelSelector {
         };
     }
+    normalizeOptimizationObjective(objective) { // Map a free-form objective value onto one of five profile names: balanced, speed, quality, context, coding.
+        if (!objective) return 'balanced'; // Any falsy input (undefined, null, empty string) means no explicit preference.
+        const normalized = String(objective).toLowerCase().trim(); // Matching is case-insensitive and ignores surrounding whitespace.
+        if (['balanced', 'default', 'auto'].includes(normalized)) return 'balanced';
+        if (['speed', 'fast', 'latency', 'throughput'].includes(normalized)) return 'speed';
+        if (['quality', 'accurate', 'accuracy'].includes(normalized)) return 'quality';
+        if (['context', 'long-context', 'long_context', 'memory'].includes(normalized)) return 'context';
+        if (['coding', 'code', 'developer'].includes(normalized)) return 'coding';
+        return 'balanced'; // Unrecognized values fall back to the balanced profile instead of throwing.
+    }
+    getScoringWeights(category, optimizeFor = 'balanced') { // Return the [Q, S, F, C] weight array for a category, blended with the requested optimization profile when one applies.
+        const base = this.categoryWeights[category] || this.categoryWeights.general; // Categories without an entry use the general weights.
+        const objective = this.normalizeOptimizationObjective(optimizeFor);
+        const objectiveWeights = this.optimizationProfiles[objective];
+        if (!objectiveWeights) { // The balanced profile is stored as null, so it takes this branch.
+            return base; // Returned as-is: this is the same array reference held in categoryWeights, not a copy.
+        }
+        // Blend category semantics with requested profile, but keep explicit
+        // user intent dominant (especially for quality/context priorities).
+        const objectivePriorities = {
+            speed: 0.8,
+            quality: 0.95,
+            context: 0.85,
+            coding: 0.8
+        };
+        const objectivePriority = objectivePriorities[objective] || 0.75; // 0.75 applies only to a profile that has weights but no entry in the table above.
+        const categoryPriority = 1 - objectivePriority;
+        return base.map((weight, idx) => { // Linear interpolation per component; yields a new array.
+            const blended = (weight * categoryPriority) + (objectiveWeights[idx] * objectivePriority);
+            return Math.round(blended * 1000) / 1000; // Rounded to three decimals per component; the result is not renormalized afterwards.
+        });
+    }
     async getCPUInfo() {
         const os = require('os');
         return {
@@ -511,7 +624,8 @@ class DeterministicModelSelector {
                 if (!fs.existsSync(cachePath)) continue;
                 const raw = JSON.parse(fs.readFileSync(cachePath, 'utf8'));
                 const sourceModels = Array.isArray(raw) ? raw : (raw.models || []);
-                const normalized = this.normalizeExternalModels(sourceModels);
+                const indexMeta = this.extractModelIndexMetadata(raw, cachePath);
+                const normalized = this.normalizeExternalModels(sourceModels, { indexMeta });
                 if (normalized.length > 0) return normalized;
             } catch (error) {
                 // Ignore broken cache files and keep trying fallbacks
@@ -520,8 +634,28 @@ class DeterministicModelSelector {
         return [];
     }
-    normalizeExternalModels(models = []) {
+    extractModelIndexMetadata(raw, sourcePath = '') { // Derive the age and staleness of a cached model index from its timestamp fields, store the result on this.modelIndexStatus, and return it.
+        const cachedAtRaw = raw?.cached_at || raw?.generated_at || raw?.last_updated || null; // First truthy field wins, in this order.
+        const cachedAt = this.parseDateSafe(cachedAtRaw);
+        const ageDays = cachedAt
+            ? Math.max(0, (Date.now() - cachedAt.getTime()) / (1000 * 60 * 60 * 24)) // Milliseconds to days; clamped at 0 so a future timestamp does not give a negative age.
+            : null;
+        const isStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.indexCadenceDays; // False when the age is unknown (null is not finite).
+        const status = {
+            source: sourcePath || 'cache',
+            ageDays: Number.isFinite(ageDays) ? Math.round(ageDays * 10) / 10 : null, // One decimal place.
+            isStale: Boolean(isStale),
+            cachedAt: cachedAt ? cachedAt.toISOString() : null
+        };
+        this.modelIndexStatus = status; // Side effect: replaces the instance-level status on every call.
+        return status;
+    }
+    normalizeExternalModels(models = [], context = {}) {
         const normalized = [];
+        const indexMeta = context.indexMeta || this.modelIndexStatus || {};
         for (const model of models) {
             if (!model || typeof model !== 'object') continue;
@@ -531,17 +665,23 @@ class DeterministicModelSelector {
                 typeof model.ctxMax === 'number' &&
                 model.model_identifier;
+            const freshness = this.computeFreshnessMetadata(model, indexMeta);
+            const quantizations = this.extractAvailableQuantizations(model, model.variants || []);
             if (alreadyNormalized) {
                 normalized.push({
                     ...model,
                     tags: Array.isArray(model.tags) ? model.tags : [],
                     modalities: Array.isArray(model.modalities) ? model.modalities : ['text'],
                     installed: Boolean(model.installed),
+                    availableQuantizations: model.availableQuantizations || quantizations,
+                    sizeByQuant: model.sizeByQuant || {},
                     source: model.source || 'ollama_database',
                     registry: model.registry || 'ollama.com',
                     version: model.version || model.model_identifier,
                     license: model.license || 'unknown',
                     digest: model.digest || 'unknown',
+                    ...freshness,
                     provenance: model.provenance || {
                         source: model.source || 'ollama_database',
                         registry: model.registry || 'ollama.com',
@@ -553,7 +693,7 @@ class DeterministicModelSelector {
                 continue;
             }
-            const converted = this.convertOllamaModelToDeterministicModels(model);
+            const converted = this.convertOllamaModelToDeterministicModels(model, { indexMeta });
             normalized.push(...converted);
         }
@@ -567,12 +707,14 @@ class DeterministicModelSelector {
         return [...deduped.values()];
     }
-    convertOllamaModelToDeterministicModels(ollamaModel) {
+    convertOllamaModelToDeterministicModels(ollamaModel, context = {}) {
         const baseIdentifier = ollamaModel.model_identifier || ollamaModel.model_name || 'unknown';
         const fallbackTag = `${baseIdentifier}:latest`;
         const variants = Array.isArray(ollamaModel.variants) && ollamaModel.variants.length > 0
             ? ollamaModel.variants
             : [{ tag: ollamaModel.model_identifier || fallbackTag }];
+        const indexMeta = context.indexMeta || this.modelIndexStatus || {};
+        const freshness = this.computeFreshnessMetadata(ollamaModel, indexMeta);
         const contextLength = this.parseContextLength(
             ollamaModel.context_length ||
@@ -614,6 +756,7 @@ class DeterministicModelSelector {
                 ollamaModel.main_size,
                 ollamaModel.model_identifier
             );
+            const moeMetadata = this.extractMoEMetadata(ollamaModel, variant, paramsB, baseText);
             const quant = this.normalizeQuantization(
                 variant.quantization ||
                 this.extractQuantizationFromTag(variantTag) ||
@@ -623,17 +766,67 @@ class DeterministicModelSelector {
             const variantSizeGB = this.extractVariantSizeGB(variant, paramsB);
             const modalities = this.inferModalities(ollamaModel, variantTag);
             const modelTags = this.inferTagsForVariant(derivedTags, variant, variantTag);
+            const sizeByQuant = {};
+            for (const sibling of variants) {
+                const siblingParams = this.extractParamsFromString(
+                    sibling.size,
+                    sibling.tag,
+                    ollamaModel.main_size,
+                    ollamaModel.model_identifier
+                );
+                // Keep quantization map parameter-aware: don't blend 8B/70B/405B sizes.
+                if (Math.abs(siblingParams - paramsB) > 0.25) continue;
+                const siblingQuant = this.normalizeQuantization(
+                    sibling.quantization ||
+                    this.extractQuantizationFromTag(sibling.tag || '') ||
+                    quant
+                );
+                const siblingSize = this.extractVariantSizeGB(sibling, siblingParams);
+                if (!Number.isFinite(sizeByQuant[siblingQuant]) || siblingSize < sizeByQuant[siblingQuant]) {
+                    sizeByQuant[siblingQuant] = siblingSize;
+                }
+            }
+            const availableQuantizations = this.getQuantizationCandidates({
+                availableQuantizations: this.extractAvailableQuantizations(ollamaModel, variants),
+                sizeByQuant
+            });
             const source = ollamaModel.source || 'ollama_database';
             const registry = ollamaModel.registry || 'ollama.com';
             const version = ollamaModel.version || variantTag;
             const license = ollamaModel.license || 'unknown';
             const digest = ollamaModel.digest || 'unknown';
+            const normalizedExpertCount = Number.isFinite(moeMetadata.expertCount) && moeMetadata.expertCount > 0
+                ? Math.round(moeMetadata.expertCount)
+                : null;
+            const normalizedExpertsActive = Number.isFinite(moeMetadata.expertsActivePerToken) && moeMetadata.expertsActivePerToken > 0
+                ? moeMetadata.expertsActivePerToken
+                : null;
+            const normalizedTotalParamsB = Number.isFinite(moeMetadata.totalParamsB) && moeMetadata.totalParamsB > 0
+                ? moeMetadata.totalParamsB
+                : null;
+            const normalizedActiveParamsB = Number.isFinite(moeMetadata.activeParamsB) && moeMetadata.activeParamsB > 0
+                ? moeMetadata.activeParamsB
+                : null;
             return {
                 name: variantTag,
                 family: this.extractFamily(baseIdentifier),
                 paramsB,
+                isMoE: Boolean(moeMetadata.isMoE),
+                is_moe: Boolean(moeMetadata.isMoE),
+                totalParamsB: normalizedTotalParamsB,
+                activeParamsB: normalizedActiveParamsB,
+                expertCount: normalizedExpertCount,
+                expertsActivePerToken: normalizedExpertsActive,
+                total_params_b: normalizedTotalParamsB,
+                active_params_b: normalizedActiveParamsB,
+                expert_count: normalizedExpertCount,
+                experts_active_per_token: normalizedExpertsActive,
                 ctxMax: contextLength,
                 quant,
                 sizeGB: variantSizeGB,
@@ -642,6 +835,9 @@ class DeterministicModelSelector {
                 model_identifier: variantTag,
                 installed: false,
                 pulls: ollamaModel.actual_pulls || ollamaModel.pulls || 0,
+                availableQuantizations,
+                sizeByQuant,
+                ...freshness,
                 source,
                 registry,
                 version,
@@ -658,6 +854,134 @@ class DeterministicModelSelector {
         });
     }
+    parseBillionsValue(rawValue) { // Instance-level wrapper; all parsing is done by parseBillionsValue from ./moe-assumptions (imported as parseMoEBillionsValue).
+        return parseMoEBillionsValue(rawValue);
+    }
+    parsePositiveNumber(rawValue) { // Instance-level wrapper; all parsing is done by parsePositiveNumber from ./moe-assumptions (imported as parseMoEPositiveNumber).
+        return parseMoEPositiveNumber(rawValue);
+    }
+    extractMoEMetadata(model = {}, variant = {}, paramsB = null, baseText = '') { // Adapter: repackages the positional arguments into the single options object taken by extractMoEMetadata from ./moe-assumptions.
+        return extractCanonicalMoEMetadata({ // Imported under this alias so it does not clash with the method name.
+            model,
+            variant,
+            paramsB,
+            baseText
+        });
+    }
+    parseDateSafe(value) { // Parse a date string into a Date; returns null instead of throwing or yielding an Invalid Date.
+        if (!value || typeof value !== 'string') return null; // Non-string inputs (numbers, Date objects) are rejected here, not coerced.
+        const parsed = new Date(value);
+        if (Number.isNaN(parsed.getTime())) return null; // getTime() is NaN when the string could not be parsed.
+        return parsed;
+    }
+    extractAvailableQuantizations(model, variants = []) { // Collect the distinct quantization labels found on a model and its variants, sorted by position in this.quantHierarchy.
+        const quantSet = new Set();
+        const candidateStrings = [];
+        if (Array.isArray(model?.quantizations)) {
+            candidateStrings.push(...model.quantizations);
+        }
+        if (typeof model?.quantization === 'string') {
+            candidateStrings.push(model.quantization);
+        }
+        for (const variant of variants) {
+            if (variant?.quantization) candidateStrings.push(variant.quantization);
+            if (variant?.tag) candidateStrings.push(variant.tag); // Whole tags are candidates too; a label is pulled out of them in the loop below.
+        }
+        for (const value of candidateStrings) {
+            const inferred = this.normalizeQuantization(
+                this.extractQuantizationFromTag(String(value)) || String(value) // The raw string is normalized directly when no label is extracted from it.
+            );
+            if (inferred) quantSet.add(inferred);
+        }
+        if (quantSet.size === 0 && model?.quant) { // model.quant is consulted only when nothing above produced a label.
+            quantSet.add(this.normalizeQuantization(model.quant));
+        }
+        if (quantSet.size === 0) {
+            quantSet.add('Q4_K_M'); // Last-resort default so the returned list is never empty.
+        }
+        return [...quantSet].sort((a, b) => {
+            const aIdx = this.quantHierarchy.indexOf(a);
+            const bIdx = this.quantHierarchy.indexOf(b);
+            const safeA = aIdx === -1 ? Number.MAX_SAFE_INTEGER : aIdx; // Labels missing from quantHierarchy sort after all known ones.
+            const safeB = bIdx === -1 ? Number.MAX_SAFE_INTEGER : bIdx;
+            return safeA - safeB;
+        });
+    }
+    computeFreshnessMetadata(model = {}, indexMeta = {}) { // Build freshness fields (age, score, stale and deprecated flags) for a model from its timestamp fields, descriptive text, and the index status.
+        const dateCandidates = [
+            model.last_updated,
+            model.lastUpdated,
+            model.updated_at,
+            model.updatedAt,
+            model.release_date,
+            model.released_at,
+            model.created_at,
+            model.detailed_scraped_at
+        ];
+        const updatedAt = dateCandidates
+            .map((value) => this.parseDateSafe(value))
+            .find(Boolean); // First candidate that parses wins, in the order listed above; undefined when none does.
+        const ageDays = updatedAt
+            ? Math.max(0, (Date.now() - updatedAt.getTime()) / (1000 * 60 * 60 * 24)) // Milliseconds to days; clamped at 0 for future timestamps.
+            : null;
+        let freshnessScore = 55; // neutral fallback when timestamp is unknown
+        if (Number.isFinite(ageDays)) { // Step function: newer models score higher, from 100 down to 10.
+            if (ageDays <= 30) freshnessScore = 100;
+            else if (ageDays <= 90) freshnessScore = 90;
+            else if (ageDays <= 180) freshnessScore = 75;
+            else if (ageDays <= 365) freshnessScore = 60;
+            else if (ageDays <= 540) freshnessScore = 40;
+            else if (ageDays <= 720) freshnessScore = 25;
+            else freshnessScore = 10;
+        }
+        const textBlob = [
+            model.model_identifier,
+            model.model_name,
+            model.name,
+            model.description,
+            model.detailed_description,
+            model.status,
+            ...(Array.isArray(model.tags) ? model.tags : [])
+        ]
+            .filter(Boolean)
+            .join(' ')
+            .toLowerCase(); // Lowercased so the keyword scan below is case-insensitive.
+        const isDeprecatedByText =
+            /\bdeprecated\b|\bobsolete\b|\blegacy\b|\barchived\b|\breplaced by\b|\buse .+ instead\b/.test(textBlob); // Keyword heuristic over identifier, names, descriptions, status and tags.
+        const isDeprecated = Boolean(model.deprecated || model.is_deprecated || model.archived || isDeprecatedByText);
+        const isStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.staleDays;
+        const veryStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.veryStaleDays;
+        const indexStale = Boolean(indexMeta?.isStale);
+        if (isDeprecated) freshnessScore = Math.min(freshnessScore, 15); // Deprecation caps the score at 15.
+        if (veryStale) freshnessScore = Math.min(freshnessScore, 20); // Age beyond veryStaleDays caps the score at 20.
+        if (indexStale && !updatedAt) freshnessScore = Math.max(0, freshnessScore - 10); // A stale index only penalizes models that have no timestamp of their own.
+        return {
+            lastUpdatedAt: updatedAt ? updatedAt.toISOString() : null,
+            modelAgeDays: Number.isFinite(ageDays) ? Math.round(ageDays * 10) / 10 : null, // One decimal place.
+            freshnessScore,
+            isStale,
+            isDeprecated,
+            indexAgeDays: Number.isFinite(indexMeta?.ageDays) ? indexMeta.ageDays : null,
+            indexStale
+        };
+    }
     parseContextLength(contextValue) {
         if (typeof contextValue === 'number' && Number.isFinite(contextValue) && contextValue > 0) {
             return Math.round(contextValue);
@@ -766,7 +1090,7 @@ class DeterministicModelSelector {
     extractFamily(modelName) {
         const name = modelName.toLowerCase();
         if (name.includes('qwen2.5')) return 'qwen2.5';
-        if (name.includes('qwen3')) return 'qwen2.5';
+        if (name.includes('qwen3')) return 'qwen3';
         if (name.includes('qwen')) return 'qwen2.5';
         if (name.includes('deepseek')) return 'deepseek';
         if (name.includes('llama3.2') || name.includes('llama3.3')) return 'llama3.2';
@@ -889,13 +1213,22 @@ class DeterministicModelSelector {
             topN = 5,
             enableProbe = false,
             silent = false,
+            optimizeFor = 'balanced',
+            runtime = 'ollama',
             hardware: providedHardware = null,
             installedModels = null,
             modelPool = null
         } = options;
+        const normalizedRuntime = normalizeMoERuntime(runtime);
+        const optimizationObjective = this.normalizeOptimizationObjective(
+            options.optimize || options.objective || optimizeFor
+        );
         if (!silent) {
             console.log(`🔍 Selecting models for category: ${category}`);
+            if (optimizationObjective !== 'balanced') {
+                console.log(`⚙️  Optimization profile: ${optimizationObjective}`);
+            }
         }
         // Phase 0: Gather data
@@ -931,7 +1264,15 @@ class DeterministicModelSelector {
         const budget = isUnified ? usableMem : (vram || usableMem);
         for (const model of filtered) {
-            const result = this.evaluateModel(model, hardware, category, targetCtx, budget);
+            const result = this.evaluateModel(
+                model,
+                hardware,
+                category,
+                targetCtx,
+                budget,
+                optimizationObjective,
+                normalizedRuntime
+            );
             if (result) {
                 candidates.push(result);
             }
@@ -939,7 +1280,14 @@ class DeterministicModelSelector {
         // Sort by score
         candidates.sort((a, b) => b.score - a.score);
-        const topCandidates = candidates.slice(0, topN);
+        let topCandidates = candidates.slice(0, topN);
+        topCandidates = this.ensureFeasibleMidTierCoverage(
+            topCandidates,
+            candidates,
+            category,
+            hardware,
+            optimizationObjective
+        );
         if (!silent) {
             console.log(`✨ Selected ${topCandidates.length} top candidates`);
@@ -957,6 +1305,8 @@ class DeterministicModelSelector {
         return {
             category,
+            optimizeFor: optimizationObjective,
+            runtime: normalizedRuntime,
             hardware,
             candidates: topCandidates,
             total_evaluated: filtered.length,
@@ -1008,42 +1358,115 @@ class DeterministicModelSelector {
         });
     }
-    evaluateModel(model, hardware, category, targetCtx, budget) {
+    evaluateModel(model, hardware, category, targetCtx, budget, optimizeFor = 'balanced', runtime = 'ollama') { // Score one model against a memory budget; returns a candidate record, or null when the model does not fit.
         // 1. Select best fitting quantization
         const bestQuant = this.selectBestQuantization(model, budget, targetCtx);
         if (!bestQuant) return null;
         // 2. Calculate required memory
-        const requiredGB = this.estimateRequiredGB(model, bestQuant.quant, targetCtx);
+        const memoryEstimate = this.estimateMemoryBreakdown(model, bestQuant.quant, targetCtx); // Breakdown object; requiredGB, modelMemGB, kvCacheGB, runtimeOverheadGB, memorySource and parameterProfile are read from it below.
+        const requiredGB = memoryEstimate.requiredGB;
         if (requiredGB > budget) return null; // NOTE(review): this estimate uses the full targetCtx, so a quantization that selectBestQuantization accepted only at half context appears to be rejected here - confirm intent.
         // 3. Calculate component scores
         const Q = this.calculateQualityPrior(model, bestQuant.quant, category);
-        const S = this.estimateSpeed(hardware, model, bestQuant.quant, category);
+        const speedEstimate = this.estimateSpeedProfile(hardware, model, bestQuant.quant, category, runtime);
+        const S = speedEstimate.score; // S is the speed score used in the weighted sum; the TPS estimate is reported separately below.
         const F = this.calculateFitScore(requiredGB, budget);
         const C = this.calculateContextScore(model, targetCtx);
         // 4. Calculate final weighted score
-        const weights = this.categoryWeights[category] || this.categoryWeights.general;
+        const weights = this.getScoringWeights(category, optimizeFor); // [Q, S, F, C] weights, blended with the optimization profile when one applies.
         const score = Math.round((Q * weights[0] + S * weights[1] + F * weights[2] + C * weights[3]) * 10) / 10;
         // 5. Build rationale
-        const rationale = this.buildRationale(hardware, model, bestQuant.quant, requiredGB, budget, category, Q, S);
+        const rationale = this.buildRationale(
+            hardware,
+            model,
+            bestQuant.quant,
+            requiredGB,
+            budget,
+            category,
+            Q,
+            S,
+            memoryEstimate,
+            speedEstimate
+        );
         return {
             meta: model,
             quant: bestQuant.quant,
             requiredGB: Math.round(requiredGB * 10) / 10,
-            estTPS: S,
+            estTPS: speedEstimate.estimatedTPS, // Now the TPS estimate; the replaced line stored the speed score S in this field.
             score,
+            runtime: speedEstimate.runtime,
             rationale,
+            memory: { // Memory figures are rounded to two decimals, effectiveParamsB to three.
+                modelMemGB: Math.round(memoryEstimate.modelMemGB * 100) / 100,
+                kvCacheGB: Math.round(memoryEstimate.kvCacheGB * 100) / 100,
+                runtimeOverheadGB: Math.round(memoryEstimate.runtimeOverheadGB * 100) / 100,
+                memorySource: memoryEstimate.memorySource,
+                assumptionSource: memoryEstimate.parameterProfile.assumptionSource,
+                isMoE: memoryEstimate.parameterProfile.isMoE,
+                effectiveParamsB: Math.round(memoryEstimate.parameterProfile.effectiveParamsB * 1000) / 1000
+            },
+            speed: { // Fields are copied from the speed estimate without further rounding.
+                backend: speedEstimate.backend,
+                targetTPS: speedEstimate.targetTPS,
+                estimatedTPS: speedEstimate.estimatedTPS,
+                runtime: speedEstimate.runtime,
+                moe: speedEstimate.moe
+            },
             components: { Q, S, F, C }
         };
     }
+    getQuantizationCandidates(model) { // List the quantization labels to try for a model, sorted by position in this.quantHierarchy.
+        const normalizedAvailable = Array.isArray(model?.availableQuantizations)
+            ? model.availableQuantizations.map((quant) => this.normalizeQuantization(quant))
+            : [];
+        const fromSizeMap = model?.sizeByQuant && typeof model.sizeByQuant === 'object'
+            ? Object.keys(model.sizeByQuant).map((quant) => this.normalizeQuantization(quant))
+            : [];
+        const seeded = (fromSizeMap.length > 0 // Keys of sizeByQuant take precedence; availableQuantizations is used only when the size map has no keys.
+            ? [...new Set(fromSizeMap)]
+            : [...new Set(normalizedAvailable)])
+            .filter(Boolean);
+        let candidates = seeded.length > 0 ? seeded : [...this.quantHierarchy]; // With no known quantization, every level in quantHierarchy is a candidate.
+        // If we have at least one known quantization, allow extrapolating to
+        // *more compressed* levels as an explicit feasibility assumption.
+        if (seeded.length > 0) {
+            const expanded = new Set();
+            for (const quant of seeded) {
+                const idx = this.quantHierarchy.indexOf(quant);
+                if (idx === -1) { // Labels outside the hierarchy are kept as-is and are not extrapolated.
+                    expanded.add(quant);
+                    continue;
+                }
+                for (let i = idx; i < this.quantHierarchy.length; i++) { // Adds the known level and every entry after it in quantHierarchy.
+                    expanded.add(this.quantHierarchy[i]);
+                }
+            }
+            candidates = [...expanded];
+        }
+        return candidates.sort((a, b) => {
+            const aIdx = this.quantHierarchy.indexOf(a);
+            const bIdx = this.quantHierarchy.indexOf(b);
+            const safeA = aIdx === -1 ? Number.MAX_SAFE_INTEGER : aIdx; // Labels missing from quantHierarchy sort after all known ones.
+            const safeB = bIdx === -1 ? Number.MAX_SAFE_INTEGER : bIdx;
+            return safeA - safeB;
+        });
+    }
     selectBestQuantization(model, budget, targetCtx) {
+        const quantizationCandidates = this.getQuantizationCandidates(model);
         // Try quantizations from best to worst quality
-        for (const quant of this.quantHierarchy) {
+        for (const quant of quantizationCandidates) {
             const requiredGB = this.estimateRequiredGB(model, quant, targetCtx);
             if (requiredGB <= budget) {
                 return { quant, sizeGB: requiredGB };
@@ -1053,7 +1476,7 @@ class DeterministicModelSelector {
         // If nothing fits at target context, try halving context once
         const halfCtx = Math.floor(targetCtx / 2);
         if (halfCtx >= 1024) {
-            for (const quant of this.quantHierarchy) {
+            for (const quant of quantizationCandidates) {
                 const requiredGB = this.estimateRequiredGB(model, quant, halfCtx);
                 if (requiredGB <= budget) {
                     return { quant, sizeGB: requiredGB };
@@ -1064,7 +1487,11 @@ class DeterministicModelSelector {
         return null; // Model doesn't fit
     }
-    estimateRequiredGB(model, quant, ctx) {
+    resolveMemoryParameterProfile(model = {}) { // Instance-level wrapper; the profile is computed by resolveMoEParameterProfile from ./moe-assumptions.
+        return resolveMoEParameterProfile(model);
+    }
+    estimateMemoryBreakdown(model, quant, ctx) {
         // Bytes per parameter by quantization level (calibrated to real Ollama sizes)
         // 7B Q4_K_M=~4.5GB, 14B Q4_K_M=~9GB, 32B Q4_K_M=~19GB
         const bytesPerParam = {
@@ -1075,17 +1502,54 @@ class DeterministicModelSelector {
             'Q3_K': 0.48,
             'Q2_K': 0.37
         };
-        const bpp = bytesPerParam[quant] || 0.63;
-        const modelMemGB = model.paramsB * bpp;
+        const normalizedQuant = this.normalizeQuantization(quant);
+        const bpp = bytesPerParam[normalizedQuant] || 0.63;
+        const sizeByQuant = model?.sizeByQuant && typeof model.sizeByQuant === 'object' ? model.sizeByQuant : {};
+        const observedFromSizeMap = Number(sizeByQuant[normalizedQuant]);
+        const directVariantMatch =
+            this.normalizeQuantization(model?.quant || '') === normalizedQuant
+                ? Number(model?.sizeGB ?? model?.size)
+                : NaN;
+        const observedWeightGB = Number.isFinite(observedFromSizeMap) && observedFromSizeMap > 0
+            ? observedFromSizeMap
+            : (Number.isFinite(directVariantMatch) && directVariantMatch > 0 ? directVariantMatch : null);
+        const parameterProfile = this.resolveMemoryParameterProfile(model);
+        const modeledWeightGB = parameterProfile.effectiveParamsB * bpp;
+        const preferSparseInferenceParams =
+            parameterProfile.isMoE &&
+            (parameterProfile.assumptionSource === 'moe_active_metadata' ||
+                parameterProfile.assumptionSource === 'moe_derived_expert_ratio');
+        const useObservedArtifactSize =
+            !preferSparseInferenceParams &&
+            Number.isFinite(observedWeightGB) &&
+            observedWeightGB > 0;
+        const modelMemGB = useObservedArtifactSize ? observedWeightGB : modeledWeightGB;
+        const effectiveCtx = Number.isFinite(Number(ctx)) && Number(ctx) > 0 ? Number(ctx) : 4096;
         // KV cache: ~2 * numLayers * hiddenDim * 2bytes * ctx / 1e9
         // Simplified: ~0.000008 GB per billion params per context token
-        const kvCacheGB = 0.000008 * model.paramsB * ctx;
+        const kvCacheGB = 0.000008 * parameterProfile.effectiveParamsB * effectiveCtx;
         // Runtime overhead (Metal/CUDA context, buffers)
-        const runtimeOverhead = 0.5;
+        const runtimeOverhead = useObservedArtifactSize ? 0.35 : 0.5;
+        const memorySource = useObservedArtifactSize
+            ? 'observed_artifact_size'
+            : (preferSparseInferenceParams ? 'moe_sparse_inference_params' : 'estimated_from_params');
-        return modelMemGB + kvCacheGB + runtimeOverhead;
+        return {
+            parameterProfile,
+            memorySource,
+            modelMemGB,
+            kvCacheGB,
+            runtimeOverheadGB: runtimeOverhead,
+            requiredGB: modelMemGB + kvCacheGB + runtimeOverhead
+        };
+    }
+    /**
+     * Total memory requirement for a model at a given quantization and context.
+     *
+     * Convenience accessor over `estimateMemoryBreakdown` that exposes only the
+     * `requiredGB` field of the breakdown.
+     *
+     * @param {object} model Model descriptor.
+     * @param {string} quant Quantization label.
+     * @param {number} ctx Context length in tokens.
+     * @returns {number} The breakdown's `requiredGB` value.
+     */
+    estimateRequiredGB(model, quant, ctx) {
+        const { requiredGB } = this.estimateMemoryBreakdown(model, quant, ctx);
+        return requiredGB;
     }
     calculateQualityPrior(model, quant, category) {
@@ -1099,6 +1563,10 @@ class DeterministicModelSelector {
         // Quantization penalty
         const quantPenalty = this.quantPenalties[quant] || -5;
         Q += quantPenalty;
+        // Freshness/deprecation adjustment
+        const freshnessAdjustment = this.calculateFreshnessAdjustment(model);
+        Q += freshnessAdjustment;
         // Task alignment bump
         const taskBump = this.getTaskAlignmentBump(model, category);
@@ -1155,7 +1623,28 @@ class DeterministicModelSelector {
         }
     }
-    estimateSpeed(hardware, model, quant, category) {
+    /**
+     * Quality-score delta derived from a model's freshness metadata.
+     *
+     * Checks run in priority order and the first match wins:
+     *   1. deprecated flag                         -> -12
+     *   2. age above `veryStaleDays` / `staleDays`  -> -8 / -4
+     *   3. stale flag                              -> -3
+     *   4. freshness score bands (>=90, >=75, >=60) -> +3 / +2 / +1
+     *   5. freshness score <= 25                   -> -4
+     * Anything else yields 0.
+     *
+     * @param {object} [model={}] Reads `isDeprecated`, `isStale`, `modelAgeDays`, `freshnessScore`.
+     * @returns {number} Adjustment to add to the quality score.
+     */
+    calculateFreshnessAdjustment(model = {}) {
+        if (model.isDeprecated) {
+            return -12;
+        }
+        const { modelAgeDays, freshnessScore } = model;
+        // Age thresholds only apply when the age is a real finite number.
+        if (Number.isFinite(modelAgeDays)) {
+            const thresholds = this.freshnessThresholds;
+            if (modelAgeDays > thresholds.veryStaleDays) {
+                return -8;
+            }
+            if (modelAgeDays > thresholds.staleDays) {
+                return -4;
+            }
+        }
+        if (model.isStale) {
+            return -3;
+        }
+        // A missing or non-numeric freshness score is treated as 55, which lands in the neutral band.
+        const score = Number.isFinite(freshnessScore) ? freshnessScore : 55;
+        const bonusTiers = [
+            [90, 3],
+            [75, 2],
+            [60, 1]
+        ];
+        for (const [minScore, bonus] of bonusTiers) {
+            if (score >= minScore) {
+                return bonus;
+            }
+        }
+        return score <= 25 ? -4 : 0;
+    }
+    /**
+     * Speed score for a model on the given hardware.
+     *
+     * Thin accessor over `estimateSpeedProfile` that returns only the `score`
+     * field of the profile.
+     *
+     * @param {object} hardware Hardware profile.
+     * @param {object} model Model descriptor.
+     * @param {string} quant Quantization label.
+     * @param {string} category Use-case category.
+     * @param {string} [runtime='ollama'] Inference runtime name.
+     * @returns {number} The profile's `score` value.
+     */
+    estimateSpeed(hardware, model, quant, category, runtime = 'ollama') {
+        const { score } = this.estimateSpeedProfile(hardware, model, quant, category, runtime);
+        return score;
+    }
+    estimateSpeedProfile(hardware, model, quant, category, runtime = 'ollama') {
         // Determine backend
         let backend = 'cpu_x86';
         if (hardware.acceleration.supports_metal) backend = 'metal';
@@ -1164,7 +1653,14 @@ class DeterministicModelSelector {
         // Base speed calculation
         const K = this.backendK[backend];
-        let base = K / model.paramsB;
+        const denseParamsB = Number.isFinite(this.parseBillionsValue(model.paramsB))
+            ? this.parseBillionsValue(model.paramsB)
+            : 1;
+        const parameterProfile = this.resolveMemoryParameterProfile(model);
+        const effectiveParamsB = Number.isFinite(parameterProfile.effectiveParamsB) && parameterProfile.effectiveParamsB > 0
+            ? parameterProfile.effectiveParamsB
+            : denseParamsB;
+        let base = K / effectiveParamsB;
         // Quantization multiplier
         const quantMultiplier = this.quantSpeedMultipliers[quant] || 1.0;
@@ -1173,10 +1669,31 @@ class DeterministicModelSelector {
         // Threading multiplier
         if (hardware.cpu.cores >= 8) base *= 1.1;
         if (hardware.acceleration.supports_metal || hardware.acceleration.supports_cuda) base *= 1.2;
+        const normalizedRuntime = normalizeMoERuntime(runtime);
+        const moe = estimateMoESpeedMultiplier({
+            model,
+            runtime: normalizedRuntime,
+            denseParamsB,
+            parameterProfile
+        });
+        if (moe.applied) {
+            base *= moe.multiplier;
+        }
         // Normalize to 0-100 score
         const target = this.targetSpeeds[category] || this.targetSpeeds.general;
-        return Math.min(100, Math.round((100 * base / target) * 10) / 10);
+        const estimatedTPS = Math.max(1, Math.round(base * 10) / 10);
+        const score = Math.min(100, Math.round((100 * estimatedTPS / target) * 10) / 10);
+        return {
+            backend,
+            targetTPS: target,
+            estimatedTPS,
+            score,
+            runtime: normalizedRuntime,
+            moe
+        };
     }
     calculateFitScore(requiredGB, budgetGB) {
@@ -1192,7 +1709,104 @@ class DeterministicModelSelector {
         return 0; // Should be filtered out earlier
     }
-    buildRationale(hardware, model, quant, requiredGB, budget, category, Q, S) {
+    /**
+     * Map a memory budget to the largest model size (in billions of
+     * parameters) this selector treats as practical for it.
+     *
+     * @param {number} budgetGB Memory budget in GB.
+     * @returns {number} 70, 46, 30, 14 or 8 for budgets of at least 80, 48, 32,
+     *   24 or 16 GB respectively; 4 for smaller budgets and for non-finite,
+     *   non-number, zero or negative input.
+     */
+    estimatePracticalMaxParamsForBudget(budgetGB) {
+        const smallestTierParams = 4;
+        if (!(Number.isFinite(budgetGB) && budgetGB > 0)) {
+            return smallestTierParams;
+        }
+        // [minimum budget in GB, practical max params in billions], largest tier first.
+        const tiers = [
+            [80, 70],
+            [48, 46],
+            [32, 30],
+            [24, 14],
+            [16, 8]
+        ];
+        const matchedTier = tiers.find(([minBudgetGB]) => budgetGB >= minBudgetGB);
+        return matchedTier ? matchedTier[1] : smallestTierParams;
+    }
+    /**
+     * Post-process a ranked selection so that capable hardware is not offered
+     * only small models.
+     *
+     * Two promotions may happen, each by overwriting the LAST entry of the
+     * selection and re-sorting by `score` (descending):
+     *   1. Mid-tier: if no selected candidate has at least 6B params (7B when
+     *      the budget is >= 24 GB), the first pool candidate that meets that
+     *      size and a speed floor is promoted.
+     *   2. 30B+: on multi-GPU, non-unified hardware whose practical maximum is
+     *      >= 30B, the first pool candidate with >= 30B params and enough
+     *      estimated tokens/sec is promoted if none is selected yet.
+     *
+     * The selection is returned unchanged when it is empty or not an array, the
+     * objective is 'speed', the category is not enforced, or the budget is
+     * below 16 GB. The input array is never mutated.
+     *
+     * @param {Array<object>} selectedCandidates Current top-N candidates.
+     * @param {Array<object>} allCandidates Full candidate pool; falls back to
+     *   `selectedCandidates` when empty or not an array.
+     * @param {string} category Use-case category.
+     * @param {object} hardware Raw hardware profile (normalized internally).
+     * @param {string} [optimizeFor='balanced'] Optimization objective.
+     * @returns {Array<object>} The original selection or an adjusted copy.
+     */
+    ensureFeasibleMidTierCoverage(selectedCandidates, allCandidates, category, hardware, optimizeFor = 'balanced') {
+        if (!Array.isArray(selectedCandidates) || selectedCandidates.length === 0) {
+            return selectedCandidates;
+        }
+        // Speed-first runs keep their ranking; this is the only exit needed for
+        // that objective, so later stages do not re-check it.
+        const objective = this.normalizeOptimizationObjective(optimizeFor);
+        if (objective === 'speed') {
+            return selectedCandidates;
+        }
+        const enforceCategories = new Set(['general', 'talking', 'reading', 'coding', 'reasoning', 'multimodal']);
+        if (!enforceCategories.has(category)) {
+            return selectedCandidates;
+        }
+        const normalizedHardware = this.normalizeHardwareProfile(hardware || {});
+        const { gpu } = normalizedHardware;
+        // Unified-memory systems budget against usable system memory; discrete
+        // GPUs budget against VRAM, falling back to usable memory when VRAM is 0/unknown.
+        const budget = gpu.unified
+            ? normalizedHardware.usableMemGB
+            : (gpu.vramGB || normalizedHardware.usableMemGB);
+        if (!Number.isFinite(budget) || budget < 16) {
+            return selectedCandidates;
+        }
+        const candidatePool = Array.isArray(allCandidates) && allCandidates.length > 0
+            ? allCandidates
+            : selectedCandidates;
+        const paramsOf = (candidate) => candidate?.meta?.paramsB || 0;
+        const isListed = (list, target) =>
+            list.some((candidate) => candidate?.meta?.model_identifier === target?.meta?.model_identifier);
+        // Returns a copy of `list` whose last entry is replaced, re-sorted by score.
+        const swapIntoLastSlot = (list, replacement) => {
+            const next = [...list];
+            next[next.length - 1] = replacement;
+            next.sort((a, b) => b.score - a.score);
+            return next;
+        };
+        let promoted = [...selectedCandidates];
+        // --- Stage 1: guarantee one mid-tier model ---
+        const minMidTierParams = budget >= 24 ? 7 : 6;
+        const alreadyHasMidTier = promoted.some((candidate) => paramsOf(candidate) >= minMidTierParams);
+        if (!alreadyHasMidTier) {
+            const practicalSpeedFloor = gpu.unified ? 25 : 20;
+            const feasibleMidTier = candidatePool.find((candidate) => {
+                // NOTE(review): `components.S` looks like a 0-100 speed score while the
+                // `estTPS` fallback looks like tokens/sec; both are compared against the
+                // same floor. Confirm the intended unit.
+                const speedScore = candidate?.components?.S ?? candidate?.estTPS ?? 0;
+                return paramsOf(candidate) >= minMidTierParams && speedScore >= practicalSpeedFloor;
+            });
+            if (feasibleMidTier && !isListed(promoted, feasibleMidTier)) {
+                promoted = swapIntoLastSlot(promoted, feasibleMidTier);
+            }
+        }
+        // --- Stage 2: guarantee one 30B+ model on large multi-GPU setups ---
+        const practicalMaxParams = this.estimatePracticalMaxParamsForBudget(budget);
+        const shouldEnforceThirtyBCoverage =
+            Boolean(gpu.isMultiGPU) &&
+            !gpu.unified &&
+            practicalMaxParams >= 30;
+        if (!shouldEnforceThirtyBCoverage) {
+            return promoted;
+        }
+        const alreadyHasThirtyB = promoted.some((candidate) => paramsOf(candidate) >= 30);
+        if (alreadyHasThirtyB) {
+            return promoted;
+        }
+        // Large models only need 20% of the category's target speed, but never less than 8.
+        const largeModelSpeedFloor = Math.max(
+            8,
+            Math.round((this.targetSpeeds[category] || this.targetSpeeds.general) * 0.2)
+        );
+        const feasibleThirtyB = candidatePool.find((candidate) => {
+            const estTPS = candidate?.estTPS ?? candidate?.speed?.estimatedTPS ?? 0;
+            return paramsOf(candidate) >= 30 && estTPS >= largeModelSpeedFloor;
+        });
+        if (!feasibleThirtyB || isListed(promoted, feasibleThirtyB)) {
+            return promoted;
+        }
+        return swapIntoLastSlot(promoted, feasibleThirtyB);
+    }
+    buildRationale(hardware, model, quant, requiredGB, budget, category, Q, S, memoryEstimate = null, speedEstimate = null) {
         const parts = [];
         // Memory fit
@@ -1204,6 +1818,27 @@ class DeterministicModelSelector {
         // Special attributes
         if (model.tags.includes('coder')) parts.push('coder-tuned');
         if (model.modalities.includes('vision')) parts.push('vision-capable');
+        if (model.isDeprecated) parts.push('deprecated penalized');
+        else if (model.isStale) parts.push('stale penalized');
+        else if (model.freshnessScore >= 90) parts.push('fresh release');
+        const memoryProfile = memoryEstimate?.parameterProfile;
+        if (memoryProfile?.isMoE) {
+            const assumptionLabels = {
+                moe_active_metadata: 'MoE active params',
+                moe_derived_expert_ratio: 'MoE derived active ratio',
+                moe_fallback_total_params: 'MoE fallback total params',
+                moe_fallback_model_params: 'MoE fallback model params',
+                moe_fallback_default: 'MoE fallback default'
+            };
+            parts.push(assumptionLabels[memoryProfile.assumptionSource] || memoryProfile.assumptionSource);
+        }
+        if (speedEstimate?.moe?.applied) {
+            const runtimeLabel = speedEstimate.runtime || 'ollama';
+            const multiplier = Number(speedEstimate.moe.multiplier || 1).toFixed(2);
+            parts.push(`MoE speed x${multiplier} (${runtimeLabel})`);
+        }
         // Size sweet spot
         if (model.paramsB >= 7 && model.paramsB <= 13) {
@@ -1380,6 +2015,16 @@ class DeterministicModelSelector {
             quantization: candidate.quant,
             estimatedRAM: candidate.requiredGB,
             reasoning: candidate.rationale,
+            runtime: candidate.runtime || candidate.speed?.runtime || 'ollama',
+            memoryAssumptionSource: candidate.memory?.assumptionSource || 'dense_params',
+            speedAssumptions: candidate.speed?.moe ? {
+                applied: Boolean(candidate.speed.moe.applied),
+                runtime: candidate.speed.runtime || candidate.runtime || 'ollama',
+                multiplier: Number.isFinite(candidate.speed.moe.multiplier) ? candidate.speed.moe.multiplier : 1,
+                theoreticalSpeedup: Number.isFinite(candidate.speed.moe.theoreticalSpeedup) ? candidate.speed.moe.theoreticalSpeedup : 1,
+                overheadMultiplier: Number.isFinite(candidate.speed.moe.overheadMultiplier) ? candidate.speed.moe.overheadMultiplier : 1,
+                assumptionSource: candidate.speed.moe.assumptionSource || candidate.memory?.assumptionSource || 'dense_params'
+            } : null,
             source: provenance.source,
             registry: provenance.registry,
             version: provenance.version,
@@ -1410,9 +2055,23 @@ class DeterministicModelSelector {
             cores = 4;
         }
-        if (ram >= 64 && cores >= 16) return 'extreme';
-        if (ram >= 32 && cores >= 12) return 'very_high';
-        if (ram >= 16 && cores >= 8) return 'high';
+        const gpu = hardware?.gpu || {};
+        const gpuCount =
+            (Number.isFinite(Number(gpu.gpuCount)) ? Number(gpu.gpuCount) : null) ??
+            (Number.isFinite(Number(hardware?.gpuCount)) ? Number(hardware.gpuCount) : null) ??
+            1;
+        const totalVRAM =
+            (Number.isFinite(Number(gpu.vramGB)) ? Number(gpu.vramGB) : null) ??
+            (Number.isFinite(Number(gpu.vram)) ? Number(gpu.vram) : null) ??
+            (Number.isFinite(Number(gpu.totalVRAM)) ? Number(gpu.totalVRAM) : null) ??
+            0;
+        const unifiedGPU = Boolean(gpu.unified) || gpu.type === 'apple_silicon';
+        const effectiveAcceleratorMem = unifiedGPU ? Math.max(totalVRAM, ram) : totalVRAM;
+        if (effectiveAcceleratorMem >= 80 || (ram >= 64 && cores >= 16)) return 'extreme';
+        if (effectiveAcceleratorMem >= 48 || (ram >= 32 && cores >= 12)) return 'very_high';
+        if (effectiveAcceleratorMem >= 24 || (ram >= 16 && cores >= 8)) return 'high';
+        if (gpuCount >= 2 && effectiveAcceleratorMem >= 20) return 'high';
         if (ram >= 8 && cores >= 4) return 'medium';
         return 'low';
     }
@@ -1451,12 +2110,16 @@ class DeterministicModelSelector {
     /**
      * Generate recommendations by category (main API, replaces EnhancedModelSelector)
      */
-    async getBestModelsForHardware(hardware, allModels) {
+    async getBestModelsForHardware(hardware, allModels, options = {}) {
         const categories = ['coding', 'reasoning', 'multimodal', 'creative', 'talking', 'reading', 'general'];
         const recommendations = {};
         const normalizedPool = this.normalizeExternalModels(Array.isArray(allModels) ? allModels : []);
         const installedModels = await this.getInstalledModels();
         const normalizedHardware = this.normalizeHardwareProfile(hardware || await this.getHardware());
+        const runtime = normalizeMoERuntime(options.runtime || 'ollama');
+        const optimizationObjective = this.normalizeOptimizationObjective(
+            options.optimizeFor || options.optimize || options.objective
+        );
         for (const category of categories) {
             try {
@@ -1464,6 +2127,8 @@ class DeterministicModelSelector {
                     topN: 3,
                     enableProbe: false,
                     silent: true,
+                    optimizeFor: optimizationObjective,
+                    runtime,
                     hardware: normalizedHardware,
                     installedModels,
                     modelPool: normalizedPool
@@ -1471,6 +2136,8 @@ class DeterministicModelSelector {
                 recommendations[category] = {
                     tier: this.mapHardwareTier(normalizedHardware),
+                    optimizeFor: optimizationObjective,
+                    runtime,
                     bestModels: result.candidates.map(candidate => this.mapCandidateToLegacyFormat(candidate)),
                     totalEvaluated: result.total_evaluated,
                     category: this.getCategoryInfo(category)
@@ -1478,6 +2145,8 @@ class DeterministicModelSelector {
             } catch (error) {
                 recommendations[category] = {
                     tier: this.mapHardwareTier(normalizedHardware),
+                    optimizeFor: optimizationObjective,
+                    runtime,
                     bestModels: [],
                     totalEvaluated: 0,
                     category: this.getCategoryInfo(category)
@@ -1491,9 +2160,12 @@ class DeterministicModelSelector {
     /**
      * Generate recommendation summary
      */
-    generateRecommendationSummary(recommendations, hardware) {
+    generateRecommendationSummary(recommendations, hardware, options = {}) {
         const summary = {
             hardware_tier: this.mapHardwareTier(hardware),
+            optimize_for: this.normalizeOptimizationObjective(
+                options.optimizeFor || options.optimize || options.objective
+            ),
             total_categories: Object.keys(recommendations).length,
             best_overall: null,
             by_category: {},
@@ -1513,6 +2185,7 @@ class DeterministicModelSelector {
                     score: Math.round(bestModel.categoryScore || bestModel.score),
                     command: `ollama pull ${bestModel.model_identifier}`,
                     size: this.formatModelSize(bestModel),
+                    quantization: bestModel.quantization || bestModel.quant || 'Q4_K_M',
                     pulls: bestModel.pulls || 0,
                     source: bestModel.source || bestModel.provenance?.source || 'unknown',
                     registry: bestModel.registry || bestModel.provenance?.registry || 'unknown',
@@ -1548,6 +2221,7 @@ class DeterministicModelSelector {
                 category: bestOverallCategory,
                 score: Math.round(bestOverallScore),
                 command: `ollama pull ${bestOverallModel.model_identifier}`,
+                quantization: bestOverallModel.quantization || bestOverallModel.quant || 'Q4_K_M',
                 source: bestOverallModel.source || bestOverallModel.provenance?.source || 'unknown',
                 registry: bestOverallModel.registry || bestOverallModel.provenance?.registry || 'unknown',
                 version: bestOverallModel.version || bestOverallModel.provenance?.version || 'unknown',