npm - llm-checker - Versions diffs - 3.2.5 → 3.2.6 - Mend

llm-checker 3.2.5 → 3.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/README.md +63 -6
package/bin/enhanced_cli.js +13 -2
package/package.json +1 -1
package/src/hardware/backends/rocm-detector.js +20 -1
package/src/hardware/detector.js +75 -10
package/src/hardware/unified-detector.js +49 -10
package/src/index.js +19 -4
package/src/models/deterministic-selector.js +712 -38
package/src/models/intelligent-selector.js +2 -0
package/src/models/moe-assumptions.js +311 -0
package/src/models/scoring-engine.js +38 -13

package/src/models/intelligent-selector.js CHANGED Viewed

@@ -29,6 +29,7 @@ class IntelligentSelector {
             useCase: 'general',
             targetContext: 8192,
             targetTPS: 20,
+            runtime: 'ollama',
             preferQuantization: null,  // null = auto select
             preferFamily: null,
             maxSize: null,  // null = auto from hardware
@@ -70,6 +71,7 @@ class IntelligentSelector {
             useCase: opts.useCase,
             targetContext: opts.targetContext,
             targetTPS: opts.targetTPS,
+            runtime: opts.runtime,
             headroom: opts.headroom || 2
         });

package/src/models/moe-assumptions.js ADDED Viewed

@@ -0,0 +1,311 @@
+/**
+ * Canonical MoE helpers shared across recommendation/scoring paths.
+ *
+ * Centralizes:
+ * - MoE feature detection/normalization
+ * - Active-vs-total parameter fallback logic
+ * - Runtime-aware routing/offload overhead profiles for speed estimation
+ */
+// Per-runtime MoE overhead assumptions, keyed by canonical runtime name.
+// Each *Overhead value is a fraction that getMoERuntimeProfile() converts into
+// a (1 - overhead) multiplier; maxEffectiveGain is the upper bound applied to
+// the final speed multiplier in estimateMoESpeedMultiplier(). The `notes`
+// strings are descriptive labels only; nothing in this module reads them.
+const MOE_RUNTIME_PROFILES = Object.freeze({
+    ollama: Object.freeze({
+        runtime: 'ollama',
+        routingOverhead: 0.18,
+        communicationOverhead: 0.13,
+        offloadOverhead: 0.08,
+        maxEffectiveGain: 2.35,
+        notes: ['generic router path', 'mixed expert communication', 'partial offload risk']
+    }),
+    vllm: Object.freeze({
+        runtime: 'vllm',
+        routingOverhead: 0.12,
+        communicationOverhead: 0.08,
+        offloadOverhead: 0.04,
+        maxEffectiveGain: 2.65,
+        notes: ['optimized scheduler', 'better expert batching', 'lower offload pressure']
+    }),
+    mlx: Object.freeze({
+        runtime: 'mlx',
+        routingOverhead: 0.16,
+        communicationOverhead: 0.10,
+        offloadOverhead: 0.05,
+        maxEffectiveGain: 2.45,
+        notes: ['apple-unified memory path', 'metal expert routing', 'reduced copy overhead']
+    }),
+    'llama.cpp': Object.freeze({
+        runtime: 'llama.cpp',
+        routingOverhead: 0.20,
+        communicationOverhead: 0.14,
+        offloadOverhead: 0.09,
+        maxEffectiveGain: 2.30,
+        notes: ['portable backend path', 'higher routing overhead', 'manual offload tuning']
+    })
+});
+// Accepted runtime spellings mapped to the canonical MOE_RUNTIME_PROFILES key.
+// Keys are lower-case because normalizeMoERuntime() trims and lower-cases its
+// input before looking it up here.
+const RUNTIME_ALIASES = Object.freeze({
+    ollama: 'ollama',
+    vllm: 'vllm',
+    mlx: 'mlx',
+    'mlx-lm': 'mlx',
+    mlx_lm: 'mlx',
+    'llama.cpp': 'llama.cpp',
+    llamacpp: 'llama.cpp',
+    llama_cpp: 'llama.cpp'
+});
+/**
+ * Parse a parameter count expressed in billions.
+ *
+ * A positive finite number is returned unchanged (it is taken to be billions
+ * already). A string is searched for the first number, optionally followed by
+ * a `b` (billions) or `m` (millions) suffix, e.g. "7B", "1.5 b", "350M" or
+ * ".5B". A missing suffix means billions; millions are divided by 1000.
+ *
+ * @param {*} rawValue - Candidate value taken from model/variant metadata.
+ * @returns {number|null} Parameter count in billions, or null when the input
+ *   is empty, not a number/string, non-finite, or not strictly positive.
+ */
+function parseBillionsValue(rawValue) {
+    if (rawValue === null || rawValue === undefined || rawValue === '') return null;
+    if (typeof rawValue === 'number') {
+        return Number.isFinite(rawValue) && rawValue > 0 ? rawValue : null;
+    }
+    if (typeof rawValue !== 'string') return null;
+    const normalized = rawValue.trim().toLowerCase();
+    if (!normalized) return null;
+    // The second alternative admits a leading-decimal form such as ".5b".
+    // Requiring a leading digit would start the match after the dot and
+    // report 5 billion instead of 0.5. No `i` flag: input is lower-cased above.
+    const match = normalized.match(/(\d+(?:\.\d*)?|\.\d+)\s*([bm])?/);
+    if (!match) return null;
+    const value = Number(match[1]);
+    if (!Number.isFinite(value) || value <= 0) return null;
+    const suffix = match[2] || 'b';
+    return suffix === 'm' ? value / 1000 : value;
+}
+function parsePositiveNumber(rawValue) {
+    if (rawValue === null || rawValue === undefined || rawValue === '') return null;
+    const candidate = Number(rawValue);
+    if (!Number.isFinite(candidate) || candidate <= 0) return null;
+    return candidate;
+}
+function clamp(value, min, max) {
+    return Math.min(max, Math.max(min, value));
+}
+function normalizeMoERuntime(runtime = 'ollama') {
+    const normalized = String(runtime || 'ollama').trim().toLowerCase();
+    return RUNTIME_ALIASES[normalized] || 'ollama';
+}
+function getMoERuntimeProfile(runtime = 'ollama') {
+    const normalizedRuntime = normalizeMoERuntime(runtime);
+    const profile = MOE_RUNTIME_PROFILES[normalizedRuntime] || MOE_RUNTIME_PROFILES.ollama;
+    const routingMultiplier = 1 - profile.routingOverhead;
+    const communicationMultiplier = 1 - profile.communicationOverhead;
+    const offloadMultiplier = 1 - profile.offloadOverhead;
+    const overheadMultiplier = routingMultiplier * communicationMultiplier * offloadMultiplier;
+    return {
+        ...profile,
+        runtime: normalizedRuntime,
+        routingMultiplier,
+        communicationMultiplier,
+        offloadMultiplier,
+        overheadMultiplier
+    };
+}
+function extractMoEMetadata({ model = {}, variant = {}, paramsB = null, baseText = '' } = {}) {
+    const totalParamsB = [
+        variant.total_params_b,
+        variant.totalParamsB,
+        variant.total_params,
+        variant.totalParams,
+        model.total_params_b,
+        model.totalParamsB,
+        model.total_params,
+        model.totalParams
+    ]
+        .map((value) => parseBillionsValue(value))
+        .find((value) => Number.isFinite(value));
+    const activeParamsB = [
+        variant.active_params_b,
+        variant.activeParamsB,
+        variant.active_params,
+        variant.activeParams,
+        model.active_params_b,
+        model.activeParamsB,
+        model.active_params,
+        model.activeParams
+    ]
+        .map((value) => parseBillionsValue(value))
+        .find((value) => Number.isFinite(value));
+    const expertCount = [
+        variant.expert_count,
+        variant.expertCount,
+        model.expert_count,
+        model.expertCount
+    ]
+        .map((value) => parsePositiveNumber(value))
+        .find((value) => Number.isFinite(value));
+    const expertsActivePerToken = [
+        variant.experts_active_per_token,
+        variant.expertsActivePerToken,
+        variant.active_experts,
+        variant.activeExperts,
+        model.experts_active_per_token,
+        model.expertsActivePerToken,
+        model.active_experts,
+        model.activeExperts
+    ]
+        .map((value) => parsePositiveNumber(value))
+        .find((value) => Number.isFinite(value));
+    const text = String(baseText || '').toLowerCase();
+    const isMoE = Boolean(
+        variant.is_moe ||
+            variant.isMoE ||
+            model.is_moe ||
+            model.isMoE ||
+            Number.isFinite(totalParamsB) ||
+            Number.isFinite(activeParamsB) ||
+            (Number.isFinite(expertCount) && Number.isFinite(expertsActivePerToken)) ||
+            text.includes('moe') ||
+            text.includes('mixtral')
+    );
+    return {
+        isMoE,
+        totalParamsB: Number.isFinite(totalParamsB) ? totalParamsB : null,
+        activeParamsB: Number.isFinite(activeParamsB) ? activeParamsB : null,
+        expertCount: Number.isFinite(expertCount) ? expertCount : null,
+        expertsActivePerToken: Number.isFinite(expertsActivePerToken) ? expertsActivePerToken : null,
+        paramsB: parseBillionsValue(paramsB)
+    };
+}
+function resolveMoEParameterProfile(model = {}) {
+    const denseParamsB = parseBillionsValue(model.paramsB);
+    const totalParamsB = parseBillionsValue(model.totalParamsB ?? model.total_params_b ?? model.total_params);
+    const activeParamsBRaw = parseBillionsValue(model.activeParamsB ?? model.active_params_b ?? model.active_params);
+    const expertCount = parsePositiveNumber(model.expertCount ?? model.expert_count);
+    const expertsActivePerToken = parsePositiveNumber(
+        model.expertsActivePerToken ??
+            model.experts_active_per_token ??
+            model.activeExperts ??
+            model.active_experts
+    );
+    const normalizedTotalParamsB = Number.isFinite(totalParamsB) ? totalParamsB : null;
+    const normalizedActiveParamsB =
+        Number.isFinite(activeParamsBRaw) && Number.isFinite(normalizedTotalParamsB)
+            ? Math.min(activeParamsBRaw, normalizedTotalParamsB)
+            : Number.isFinite(activeParamsBRaw)
+              ? activeParamsBRaw
+              : null;
+    const hasMetadataSignal =
+        Number.isFinite(normalizedTotalParamsB) ||
+        Number.isFinite(normalizedActiveParamsB) ||
+        Number.isFinite(expertCount) ||
+        Number.isFinite(expertsActivePerToken);
+    const isMoE = Boolean(model.isMoE || model.is_moe || hasMetadataSignal);
+    let effectiveParamsB = Number.isFinite(denseParamsB) ? denseParamsB : 1;
+    let assumptionSource = 'dense_params';
+    if (isMoE) {
+        if (Number.isFinite(normalizedActiveParamsB)) {
+            effectiveParamsB = normalizedActiveParamsB;
+            assumptionSource = 'moe_active_metadata';
+        } else if (
+            Number.isFinite(normalizedTotalParamsB) &&
+            Number.isFinite(expertCount) &&
+            Number.isFinite(expertsActivePerToken) &&
+            expertCount > 0
+        ) {
+            const activeRatio = Math.min(1, expertsActivePerToken / expertCount);
+            effectiveParamsB = Math.max(0.1, normalizedTotalParamsB * activeRatio);
+            assumptionSource = 'moe_derived_expert_ratio';
+        } else if (Number.isFinite(normalizedTotalParamsB)) {
+            effectiveParamsB = normalizedTotalParamsB;
+            assumptionSource = 'moe_fallback_total_params';
+        } else if (Number.isFinite(denseParamsB)) {
+            effectiveParamsB = denseParamsB;
+            assumptionSource = 'moe_fallback_model_params';
+        } else {
+            effectiveParamsB = 1;
+            assumptionSource = 'moe_fallback_default';
+        }
+    }
+    const normalizedEffective = Number.isFinite(effectiveParamsB) && effectiveParamsB > 0 ? effectiveParamsB : 1;
+    return {
+        isMoE,
+        totalParamsB: normalizedTotalParamsB,
+        activeParamsB: normalizedActiveParamsB,
+        expertCount: Number.isFinite(expertCount) ? expertCount : null,
+        expertsActivePerToken: Number.isFinite(expertsActivePerToken) ? expertsActivePerToken : null,
+        effectiveParamsB: normalizedEffective,
+        assumptionSource
+    };
+}
+function estimateMoESpeedMultiplier({
+    model = {},
+    runtime = 'ollama',
+    denseParamsB = null,
+    parameterProfile = null
+} = {}) {
+    const profile = parameterProfile || resolveMoEParameterProfile(model);
+    const runtimeProfile = getMoERuntimeProfile(runtime);
+    const denseParams =
+        parseBillionsValue(denseParamsB) ??
+        parseBillionsValue(model.paramsB) ??
+        profile.totalParamsB ??
+        profile.effectiveParamsB ??
+        1;
+    const activeParams = profile.effectiveParamsB || denseParams;
+    if (!profile.isMoE) {
+        return {
+            applied: false,
+            runtime: runtimeProfile.runtime,
+            runtimeProfile,
+            denseParamsB: denseParams,
+            activeParamsB: activeParams,
+            theoreticalSpeedup: 1,
+            overheadMultiplier: 1,
+            multiplier: 1,
+            assumptionSource: profile.assumptionSource
+        };
+    }
+    const theoreticalSpeedup = clamp(denseParams / Math.max(activeParams, 0.1), 1, 4);
+    const overheadMultiplier = runtimeProfile.overheadMultiplier;
+    const rawMultiplier = theoreticalSpeedup * overheadMultiplier;
+    const multiplier = clamp(rawMultiplier, 1, runtimeProfile.maxEffectiveGain || 2.5);
+    return {
+        applied: true,
+        runtime: runtimeProfile.runtime,
+        runtimeProfile,
+        denseParamsB: denseParams,
+        activeParamsB: activeParams,
+        theoreticalSpeedup,
+        overheadMultiplier,
+        multiplier,
+        assumptionSource: profile.assumptionSource
+    };
+}
+// Public API of this module. `clamp` and `RUNTIME_ALIASES` are intentionally
+// left out and remain module-private.
+module.exports = {
+    MOE_RUNTIME_PROFILES,
+    parseBillionsValue,
+    parsePositiveNumber,
+    normalizeMoERuntime,
+    getMoERuntimeProfile,
+    extractMoEMetadata,
+    resolveMoEParameterProfile,
+    estimateMoESpeedMultiplier
+};

package/src/models/scoring-engine.js CHANGED Viewed

@@ -11,6 +11,11 @@
  */
 const { SCORING_ENGINE_WEIGHTS } = require('./scoring-config');
+const {
+    normalizeMoERuntime,
+    resolveMoEParameterProfile,
+    estimateMoESpeedMultiplier
+} = require('./moe-assumptions');
 class ScoringEngine {
     constructor(options = {}) {
@@ -303,14 +308,22 @@ class ScoringEngine {
         const useCase = options.useCase || 'general';
         const targetContext = options.targetContext || 8192;
         const targetTPS = options.targetTPS || 20;  // Target tokens per second
+        const runtime = normalizeMoERuntime(options.runtime || 'ollama');
         const weights = this.weightPresets[useCase] || this.weightPresets.general;
         // Calculate individual scores
         const Q = this.calculateQualityScore(variant, useCase);
-        const S = this.calculateSpeedScore(variant, hardware, targetTPS);
+        const S = this.calculateSpeedScore(variant, hardware, targetTPS, runtime);
         const F = this.calculateFitScore(variant, hardware);
         const C = this.calculateContextScore(variant, targetContext);
+        const moeProfile = resolveMoEParameterProfile(variant);
+        const moeSpeed = estimateMoESpeedMultiplier({
+            model: variant,
+            runtime,
+            denseParamsB: variant.params_b || variant.paramsB || null,
+            parameterProfile: moeProfile
+        });
         // Calculate weighted final score
         const finalScore = Math.round(
@@ -334,8 +347,17 @@ class ScoringEngine {
                 family: this.extractFamily(variant.model_id || variant.modelId),
                 params: variant.params_b || variant.paramsB,
                 quant: variant.quant,
-                estimatedTPS: this.estimateTPS(variant, hardware),
-                estimatedSize: variant.size_gb || variant.sizeGB
+                estimatedTPS: this.estimateTPS(variant, hardware, runtime),
+                estimatedSize: variant.size_gb || variant.sizeGB,
+                runtime,
+                moe: {
+                    isMoE: moeProfile.isMoE,
+                    assumptionSource: moeProfile.assumptionSource,
+                    activeParamsB: moeProfile.activeParamsB,
+                    totalParamsB: moeProfile.totalParamsB,
+                    speedMultiplier: moeSpeed.multiplier,
+                    overheadMultiplier: moeSpeed.overheadMultiplier
+                }
             }
         };
     }
@@ -368,7 +390,7 @@ class ScoringEngine {
         const taskBonus = this.getTaskBonus(family, useCase);
         // MoE bonus (mixture of experts models are often better quality/speed ratio)
-        const moeBonus = (variant.is_moe || variant.isMoE) ? 5 : 0;
+        const moeBonus = resolveMoEParameterProfile(variant).isMoE ? 5 : 0;
         const score = baseScore + paramBonus - quantPenalty + taskBonus + moeBonus;
@@ -379,8 +401,8 @@ class ScoringEngine {
      * Calculate Speed score (S)
      * Based on estimated tokens per second vs target
      */
-    calculateSpeedScore(variant, hardware, targetTPS) {
-        const estimatedTPS = this.estimateTPS(variant, hardware);
+    calculateSpeedScore(variant, hardware, targetTPS, runtime = 'ollama') {
+        const estimatedTPS = this.estimateTPS(variant, hardware, runtime);
         if (estimatedTPS >= targetTPS * 2) {
             return 100;  // 2x target = perfect score
@@ -459,10 +481,11 @@ class ScoringEngine {
      * - Quantization adjustment
      * - MoE efficiency bonus
      */
-    estimateTPS(variant, hardware) {
+    estimateTPS(variant, hardware, runtime = 'ollama') {
         const params = variant.params_b || variant.paramsB || 7;
         const quant = (variant.quant || 'Q4_K_M').toUpperCase();
-        const isMoE = variant.is_moe || variant.isMoE || false;
+        const normalizedRuntime = normalizeMoERuntime(runtime);
+        const parameterProfile = resolveMoEParameterProfile(variant);
         // Get backend speed coefficient (TPS for 7B Q4_K_M)
         const backendKey = this.getBackendKey(hardware);
@@ -500,11 +523,13 @@ class ScoringEngine {
         // Calculate base TPS
         let tps = baseSpeed * sizeMult * quantMult;
-        // MoE models are faster because only ~1/3 of params are active
-        // But communication overhead limits the speedup
-        if (isMoE) {
-            tps *= 1.8;  // ~1.8x speedup (not 3x due to routing overhead)
-        }
+        const moeSpeed = estimateMoESpeedMultiplier({
+            model: variant,
+            runtime: normalizedRuntime,
+            denseParamsB: params,
+            parameterProfile
+        });
+        if (moeSpeed.applied) tps *= moeSpeed.multiplier;
         // Apply minimum floor (can't go below 1 TPS)
         return Math.max(1, Math.round(tps));