llm-checker 3.2.5 → 3.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,7 @@ class IntelligentSelector {
29
29
  useCase: 'general',
30
30
  targetContext: 8192,
31
31
  targetTPS: 20,
32
+ runtime: 'ollama',
32
33
  preferQuantization: null, // null = auto select
33
34
  preferFamily: null,
34
35
  maxSize: null, // null = auto from hardware
@@ -70,6 +71,7 @@ class IntelligentSelector {
70
71
  useCase: opts.useCase,
71
72
  targetContext: opts.targetContext,
72
73
  targetTPS: opts.targetTPS,
74
+ runtime: opts.runtime,
73
75
  headroom: opts.headroom || 2
74
76
  });
75
77
 
@@ -0,0 +1,311 @@
1
+ /**
2
+ * Canonical MoE helpers shared across recommendation/scoring paths.
3
+ *
4
+ * Centralizes:
5
+ * - MoE feature detection/normalization
6
+ * - Active-vs-total parameter fallback logic
7
+ * - Runtime-aware routing/offload overhead profiles for speed estimation
8
+ */
9
+
10
/**
 * Per-runtime MoE overhead profiles, keyed by canonical runtime name.
 *
 * Each profile carries three fractional overheads (routing, communication,
 * offload), the upper bound used when clamping the final speed multiplier
 * (`maxEffectiveGain`), and free-form descriptive notes.
 *
 * The outer map and every profile object are frozen; the `notes` arrays
 * themselves are not.
 */
const MOE_RUNTIME_PROFILES = Object.freeze(
  Object.fromEntries(
    [
      {
        runtime: 'ollama',
        routingOverhead: 0.18,
        communicationOverhead: 0.13,
        offloadOverhead: 0.08,
        maxEffectiveGain: 2.35,
        notes: ['generic router path', 'mixed expert communication', 'partial offload risk']
      },
      {
        runtime: 'vllm',
        routingOverhead: 0.12,
        communicationOverhead: 0.08,
        offloadOverhead: 0.04,
        maxEffectiveGain: 2.65,
        notes: ['optimized scheduler', 'better expert batching', 'lower offload pressure']
      },
      {
        runtime: 'mlx',
        routingOverhead: 0.16,
        communicationOverhead: 0.10,
        offloadOverhead: 0.05,
        maxEffectiveGain: 2.45,
        notes: ['apple-unified memory path', 'metal expert routing', 'reduced copy overhead']
      },
      {
        runtime: 'llama.cpp',
        routingOverhead: 0.20,
        communicationOverhead: 0.14,
        offloadOverhead: 0.09,
        maxEffectiveGain: 2.30,
        notes: ['portable backend path', 'higher routing overhead', 'manual offload tuning']
      }
    ].map((entry) => [entry.runtime, Object.freeze(entry)]) // key each profile by its own runtime name
  )
);
44
+
45
/**
 * Lookup table from accepted (lower-case) runtime spellings to the canonical
 * runtime name used as a key in MOE_RUNTIME_PROFILES.
 *
 * Built from [canonical, spellings] groups so that every alias of a runtime
 * sits next to the name it resolves to.
 */
const RUNTIME_ALIASES = Object.freeze(
  Object.fromEntries(
    [
      ['ollama', ['ollama']],
      ['vllm', ['vllm']],
      ['mlx', ['mlx', 'mlx-lm', 'mlx_lm']],
      ['llama.cpp', ['llama.cpp', 'llamacpp', 'llama_cpp']]
    ].flatMap(([canonical, spellings]) => spellings.map((spelling) => [spelling, canonical]))
  )
);
55
+
56
/**
 * Parse a parameter-count value into a number of billions.
 *
 * - Numbers are returned as-is when finite and positive.
 * - Strings are trimmed and lower-cased; the first numeric token is read,
 *   optionally followed by a `b` (billions) or `m` (millions) suffix.
 *   A missing suffix is treated as billions; `m` values are divided by 1000.
 * - Anything else (null, undefined, empty string, other types, non-positive
 *   or non-finite values) yields null.
 *
 * Only the first numeric token is considered, so composite labels such as
 * "8x7b" resolve to 8.
 *
 * @param {number|string|null|undefined} rawValue - Value to parse.
 * @returns {number|null} Parameter count in billions, or null when unparseable.
 */
function parseBillionsValue(rawValue) {
  if (rawValue === null || rawValue === undefined || rawValue === '') return null;

  if (typeof rawValue === 'number') {
    return Number.isFinite(rawValue) && rawValue > 0 ? rawValue : null;
  }

  if (typeof rawValue !== 'string') return null;

  const normalized = rawValue.trim().toLowerCase();
  if (!normalized) return null;

  // The second alternative accepts a leading-decimal form (".5b"); without it
  // the match would start at the "5" and report 5B instead of 0.5B.
  // No `i` flag is needed because the input is already lower-cased.
  const match = normalized.match(/(\d+\.?\d*|\.\d+)\s*([bm])?/);
  if (!match) return null;

  const value = Number(match[1]);
  if (!Number.isFinite(value) || value <= 0) return null;

  const suffix = match[2] || 'b';
  return suffix === 'm' ? value / 1000 : value;
}
77
+
78
/**
 * Coerce a value to a finite, strictly positive number.
 *
 * @param {*} rawValue - Value to coerce with `Number(...)`.
 * @returns {number|null} The coerced number, or null when the input is
 *   null/undefined/empty-string or does not coerce to a finite value > 0.
 */
function parsePositiveNumber(rawValue) {
  const isBlank = rawValue === null || rawValue === undefined || rawValue === '';
  if (isBlank) return null;

  const numeric = Number(rawValue);
  return Number.isFinite(numeric) && numeric > 0 ? numeric : null;
}
84
+
85
/**
 * Restrict `value` to the range [min, max].
 *
 * The lower bound is applied first and the upper bound last, so if the bounds
 * are inverted (min > max) the result is `max`.
 *
 * @param {number} value - Number to restrict.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} The bounded value.
 */
function clamp(value, min, max) {
  const raisedToMin = Math.max(min, value);
  return Math.min(max, raisedToMin);
}
88
+
89
/**
 * Resolve a user-supplied runtime label to a canonical runtime name.
 *
 * The label is stringified, trimmed and lower-cased, then looked up in
 * RUNTIME_ALIASES. Falsy or unrecognized labels resolve to 'ollama'.
 *
 * @param {*} [runtime='ollama'] - Runtime label (e.g. 'vLLM', 'mlx-lm').
 * @returns {string} Canonical runtime name.
 */
function normalizeMoERuntime(runtime = 'ollama') {
  const normalized = String(runtime || 'ollama').trim().toLowerCase();

  // Own-property check: a plain bracket lookup would also resolve inherited
  // Object.prototype keys ('constructor', 'toString', '__proto__', ...) and
  // hand back a function/object instead of a runtime name.
  return Object.prototype.hasOwnProperty.call(RUNTIME_ALIASES, normalized)
    ? RUNTIME_ALIASES[normalized]
    : 'ollama';
}
93
+
94
/**
 * Build the overhead profile for a runtime, including derived multipliers.
 *
 * The runtime label is normalized first; if no profile exists under the
 * normalized name, the 'ollama' profile is used. Each overhead fraction is
 * converted to a multiplier (1 - overhead) and the three multipliers are
 * combined by multiplication into `overheadMultiplier`.
 *
 * @param {*} [runtime='ollama'] - Runtime label.
 * @returns {object} A new object with the base profile fields, `runtime` set
 *   to the normalized name, and `routingMultiplier`,
 *   `communicationMultiplier`, `offloadMultiplier`, `overheadMultiplier`.
 */
function getMoERuntimeProfile(runtime = 'ollama') {
  const runtimeKey = normalizeMoERuntime(runtime);
  const baseProfile = MOE_RUNTIME_PROFILES[runtimeKey] || MOE_RUNTIME_PROFILES.ollama;
  const { routingOverhead, communicationOverhead, offloadOverhead } = baseProfile;

  const routingMultiplier = 1 - routingOverhead;
  const communicationMultiplier = 1 - communicationOverhead;
  const offloadMultiplier = 1 - offloadOverhead;

  return {
    ...baseProfile,
    runtime: runtimeKey,
    routingMultiplier,
    communicationMultiplier,
    offloadMultiplier,
    overheadMultiplier: routingMultiplier * communicationMultiplier * offloadMultiplier
  };
}
112
+
113
/**
 * Collect MoE-related metadata from a model record and one of its variants.
 *
 * For every field, variant-level keys are consulted before model-level keys,
 * and both snake_case and camelCase spellings are accepted; the first
 * candidate that parses to a finite number wins.
 *
 * A model is reported as MoE when any of the following holds:
 * an explicit `is_moe` / `isMoE` flag on variant or model, a parseable total
 * or active parameter count, both an expert count and an active-expert count,
 * or `baseText` containing "moe" or "mixtral" (case-insensitive).
 *
 * @param {object} [input]
 * @param {object} [input.model={}] - Model-level record.
 * @param {object} [input.variant={}] - Variant-level record.
 * @param {number|string|null} [input.paramsB=null] - Dense parameter count.
 * @param {string} [input.baseText=''] - Free text searched for MoE hints.
 * @returns {{isMoE: boolean, totalParamsB: (number|null),
 *   activeParamsB: (number|null), expertCount: (number|null),
 *   expertsActivePerToken: (number|null), paramsB: (number|null)}}
 */
function extractMoEMetadata({ model = {}, variant = {}, paramsB = null, baseText = '' } = {}) {
  // Parse every candidate, then take the first finite result (null if none).
  const firstParsed = (candidates, parse) => {
    const hit = candidates
      .map((candidate) => parse(candidate))
      .find((parsed) => Number.isFinite(parsed));
    return Number.isFinite(hit) ? hit : null;
  };

  const totalParamsB = firstParsed(
    [
      variant.total_params_b,
      variant.totalParamsB,
      variant.total_params,
      variant.totalParams,
      model.total_params_b,
      model.totalParamsB,
      model.total_params,
      model.totalParams
    ],
    parseBillionsValue
  );

  const activeParamsB = firstParsed(
    [
      variant.active_params_b,
      variant.activeParamsB,
      variant.active_params,
      variant.activeParams,
      model.active_params_b,
      model.activeParamsB,
      model.active_params,
      model.activeParams
    ],
    parseBillionsValue
  );

  const expertCount = firstParsed(
    [variant.expert_count, variant.expertCount, model.expert_count, model.expertCount],
    parsePositiveNumber
  );

  const expertsActivePerToken = firstParsed(
    [
      variant.experts_active_per_token,
      variant.expertsActivePerToken,
      variant.active_experts,
      variant.activeExperts,
      model.experts_active_per_token,
      model.expertsActivePerToken,
      model.active_experts,
      model.activeExperts
    ],
    parsePositiveNumber
  );

  const haystack = String(baseText || '').toLowerCase();

  const hasExplicitFlag = Boolean(variant.is_moe || variant.isMoE || model.is_moe || model.isMoE);
  const hasParamSignal = totalParamsB !== null || activeParamsB !== null;
  const hasExpertSignal = expertCount !== null && expertsActivePerToken !== null;
  const hasNameHint = haystack.includes('moe') || haystack.includes('mixtral');

  return {
    isMoE: hasExplicitFlag || hasParamSignal || hasExpertSignal || hasNameHint,
    totalParamsB,
    activeParamsB,
    expertCount,
    expertsActivePerToken,
    paramsB: parseBillionsValue(paramsB)
  };
}
184
+
185
/**
 * Resolve the parameter count that should drive speed estimation for a model.
 *
 * A model counts as MoE when it carries an explicit `isMoE` / `is_moe` flag
 * or any parseable MoE metadata (total params, active params, expert count,
 * active experts per token). For MoE models the effective parameter count is
 * chosen by the first rule that applies, reported in `assumptionSource`:
 *
 *   1. 'moe_active_metadata'        - active params (capped at total params)
 *   2. 'moe_derived_expert_ratio'   - total * min(1, activeExperts / experts),
 *                                     floored at 0.1
 *   3. 'moe_fallback_total_params'  - total params
 *   4. 'moe_fallback_model_params'  - dense params
 *   5. 'moe_fallback_default'       - 1
 *
 * Non-MoE models report 'dense_params' with the dense count (or 1 when the
 * dense count is unavailable).
 *
 * @param {object} [model={}] - Model or variant record; camelCase and
 *   snake_case keys are both accepted, camelCase taking precedence.
 * @returns {{isMoE: boolean, totalParamsB: (number|null),
 *   activeParamsB: (number|null), expertCount: (number|null),
 *   expertsActivePerToken: (number|null), effectiveParamsB: number,
 *   assumptionSource: string}}
 */
function resolveMoEParameterProfile(model = {}) {
  // Accept `params_b` as well as `paramsB`, matching every other field here:
  // scoring variants are keyed by `params_b`, and without this fallback a
  // flagged MoE variant with no other metadata would resolve to the 1B default.
  const denseParamsB = parseBillionsValue(model.paramsB ?? model.params_b);
  const totalParamsB = parseBillionsValue(model.totalParamsB ?? model.total_params_b ?? model.total_params);
  const activeParamsBRaw = parseBillionsValue(model.activeParamsB ?? model.active_params_b ?? model.active_params);
  const expertCount = parsePositiveNumber(model.expertCount ?? model.expert_count);
  const expertsActivePerToken = parsePositiveNumber(
    model.expertsActivePerToken ??
      model.experts_active_per_token ??
      model.activeExperts ??
      model.active_experts
  );

  const normalizedTotalParamsB = Number.isFinite(totalParamsB) ? totalParamsB : null;

  // Active params can never exceed the total; cap when both are known.
  let normalizedActiveParamsB = null;
  if (Number.isFinite(activeParamsBRaw)) {
    normalizedActiveParamsB = Number.isFinite(normalizedTotalParamsB)
      ? Math.min(activeParamsBRaw, normalizedTotalParamsB)
      : activeParamsBRaw;
  }

  const hasMetadataSignal =
    Number.isFinite(normalizedTotalParamsB) ||
    Number.isFinite(normalizedActiveParamsB) ||
    Number.isFinite(expertCount) ||
    Number.isFinite(expertsActivePerToken);
  const isMoE = Boolean(model.isMoE || model.is_moe || hasMetadataSignal);

  let effectiveParamsB = Number.isFinite(denseParamsB) ? denseParamsB : 1;
  let assumptionSource = 'dense_params';

  if (isMoE) {
    if (Number.isFinite(normalizedActiveParamsB)) {
      effectiveParamsB = normalizedActiveParamsB;
      assumptionSource = 'moe_active_metadata';
    } else if (
      Number.isFinite(normalizedTotalParamsB) &&
      Number.isFinite(expertCount) &&
      Number.isFinite(expertsActivePerToken)
    ) {
      // expertCount is strictly positive here (parsePositiveNumber), so the
      // division is safe.
      const activeRatio = Math.min(1, expertsActivePerToken / expertCount);
      effectiveParamsB = Math.max(0.1, normalizedTotalParamsB * activeRatio);
      assumptionSource = 'moe_derived_expert_ratio';
    } else if (Number.isFinite(normalizedTotalParamsB)) {
      effectiveParamsB = normalizedTotalParamsB;
      assumptionSource = 'moe_fallback_total_params';
    } else if (Number.isFinite(denseParamsB)) {
      effectiveParamsB = denseParamsB;
      assumptionSource = 'moe_fallback_model_params';
    } else {
      effectiveParamsB = 1;
      assumptionSource = 'moe_fallback_default';
    }
  }

  const normalizedEffective = Number.isFinite(effectiveParamsB) && effectiveParamsB > 0 ? effectiveParamsB : 1;

  return {
    isMoE,
    totalParamsB: normalizedTotalParamsB,
    activeParamsB: normalizedActiveParamsB,
    expertCount: Number.isFinite(expertCount) ? expertCount : null,
    expertsActivePerToken: Number.isFinite(expertsActivePerToken) ? expertsActivePerToken : null,
    effectiveParamsB: normalizedEffective,
    assumptionSource
  };
}
252
+
253
/**
 * Estimate the throughput multiplier an MoE model gains over a dense model of
 * the same nominal size, adjusted for runtime overhead.
 *
 * The theoretical speedup is dense / active parameters, clamped to [1, 4].
 * It is multiplied by the runtime's combined overhead multiplier and the
 * result is clamped to [1, runtimeProfile.maxEffectiveGain] (2.5 when the
 * profile defines no cap). Non-MoE models always get a multiplier of 1 with
 * `applied: false`.
 *
 * The dense parameter count is taken from, in order: `denseParamsB`,
 * `model.paramsB` / `model.params_b`, the profile's total params, the
 * profile's effective params, and finally 1.
 *
 * @param {object} [input]
 * @param {object} [input.model={}] - Model or variant record.
 * @param {*} [input.runtime='ollama'] - Runtime label.
 * @param {number|string|null} [input.denseParamsB=null] - Dense parameter
 *   count in billions; overrides the value read from `model`.
 * @param {object|null} [input.parameterProfile=null] - Precomputed result of
 *   resolveMoEParameterProfile; computed from `model` when omitted.
 * @returns {{applied: boolean, runtime: string, runtimeProfile: object,
 *   denseParamsB: number, activeParamsB: number, theoreticalSpeedup: number,
 *   overheadMultiplier: number, multiplier: number, assumptionSource: string}}
 */
function estimateMoESpeedMultiplier({
  model = {},
  runtime = 'ollama',
  denseParamsB = null,
  parameterProfile = null
} = {}) {
  const profile = parameterProfile || resolveMoEParameterProfile(model);
  const runtimeProfile = getMoERuntimeProfile(runtime);

  // `params_b` is accepted alongside `paramsB`: callers key variants by the
  // snake_case name, and ignoring it would collapse the dense size onto the
  // active size (speedup 1) whenever `denseParamsB` is not passed explicitly.
  const denseParams =
    parseBillionsValue(denseParamsB) ??
    parseBillionsValue(model.paramsB ?? model.params_b) ??
    profile.totalParamsB ??
    profile.effectiveParamsB ??
    1;
  const activeParams = profile.effectiveParamsB || denseParams;

  // Fields shared by the dense and MoE results.
  const shared = {
    runtime: runtimeProfile.runtime,
    runtimeProfile,
    denseParamsB: denseParams,
    activeParamsB: activeParams
  };

  if (!profile.isMoE) {
    return {
      applied: false,
      ...shared,
      theoreticalSpeedup: 1,
      overheadMultiplier: 1,
      multiplier: 1,
      assumptionSource: profile.assumptionSource
    };
  }

  const theoreticalSpeedup = clamp(denseParams / Math.max(activeParams, 0.1), 1, 4);
  const overheadMultiplier = runtimeProfile.overheadMultiplier;
  const rawMultiplier = theoreticalSpeedup * overheadMultiplier;
  // Lower bound of 1: overhead never makes an MoE model slower than dense here.
  const multiplier = clamp(rawMultiplier, 1, runtimeProfile.maxEffectiveGain || 2.5);

  return {
    applied: true,
    ...shared,
    theoreticalSpeedup,
    overheadMultiplier,
    multiplier,
    assumptionSource: profile.assumptionSource
  };
}
301
+
302
+ module.exports = {
303
+ MOE_RUNTIME_PROFILES,
304
+ parseBillionsValue,
305
+ parsePositiveNumber,
306
+ normalizeMoERuntime,
307
+ getMoERuntimeProfile,
308
+ extractMoEMetadata,
309
+ resolveMoEParameterProfile,
310
+ estimateMoESpeedMultiplier
311
+ };
@@ -11,6 +11,11 @@
11
11
  */
12
12
 
13
13
  const { SCORING_ENGINE_WEIGHTS } = require('./scoring-config');
14
+ const {
15
+ normalizeMoERuntime,
16
+ resolveMoEParameterProfile,
17
+ estimateMoESpeedMultiplier
18
+ } = require('./moe-assumptions');
14
19
 
15
20
  class ScoringEngine {
16
21
  constructor(options = {}) {
@@ -303,14 +308,22 @@ class ScoringEngine {
303
308
  const useCase = options.useCase || 'general';
304
309
  const targetContext = options.targetContext || 8192;
305
310
  const targetTPS = options.targetTPS || 20; // Target tokens per second
311
+ const runtime = normalizeMoERuntime(options.runtime || 'ollama');
306
312
 
307
313
  const weights = this.weightPresets[useCase] || this.weightPresets.general;
308
314
 
309
315
  // Calculate individual scores
310
316
  const Q = this.calculateQualityScore(variant, useCase);
311
- const S = this.calculateSpeedScore(variant, hardware, targetTPS);
317
+ const S = this.calculateSpeedScore(variant, hardware, targetTPS, runtime);
312
318
  const F = this.calculateFitScore(variant, hardware);
313
319
  const C = this.calculateContextScore(variant, targetContext);
320
+ const moeProfile = resolveMoEParameterProfile(variant);
321
+ const moeSpeed = estimateMoESpeedMultiplier({
322
+ model: variant,
323
+ runtime,
324
+ denseParamsB: variant.params_b || variant.paramsB || null,
325
+ parameterProfile: moeProfile
326
+ });
314
327
 
315
328
  // Calculate weighted final score
316
329
  const finalScore = Math.round(
@@ -334,8 +347,17 @@ class ScoringEngine {
334
347
  family: this.extractFamily(variant.model_id || variant.modelId),
335
348
  params: variant.params_b || variant.paramsB,
336
349
  quant: variant.quant,
337
- estimatedTPS: this.estimateTPS(variant, hardware),
338
- estimatedSize: variant.size_gb || variant.sizeGB
350
+ estimatedTPS: this.estimateTPS(variant, hardware, runtime),
351
+ estimatedSize: variant.size_gb || variant.sizeGB,
352
+ runtime,
353
+ moe: {
354
+ isMoE: moeProfile.isMoE,
355
+ assumptionSource: moeProfile.assumptionSource,
356
+ activeParamsB: moeProfile.activeParamsB,
357
+ totalParamsB: moeProfile.totalParamsB,
358
+ speedMultiplier: moeSpeed.multiplier,
359
+ overheadMultiplier: moeSpeed.overheadMultiplier
360
+ }
339
361
  }
340
362
  };
341
363
  }
@@ -368,7 +390,7 @@ class ScoringEngine {
368
390
  const taskBonus = this.getTaskBonus(family, useCase);
369
391
 
370
392
  // MoE bonus (mixture of experts models are often better quality/speed ratio)
371
- const moeBonus = (variant.is_moe || variant.isMoE) ? 5 : 0;
393
+ const moeBonus = resolveMoEParameterProfile(variant).isMoE ? 5 : 0;
372
394
 
373
395
  const score = baseScore + paramBonus - quantPenalty + taskBonus + moeBonus;
374
396
 
@@ -379,8 +401,8 @@ class ScoringEngine {
379
401
  * Calculate Speed score (S)
380
402
  * Based on estimated tokens per second vs target
381
403
  */
382
- calculateSpeedScore(variant, hardware, targetTPS) {
383
- const estimatedTPS = this.estimateTPS(variant, hardware);
404
+ calculateSpeedScore(variant, hardware, targetTPS, runtime = 'ollama') {
405
+ const estimatedTPS = this.estimateTPS(variant, hardware, runtime);
384
406
 
385
407
  if (estimatedTPS >= targetTPS * 2) {
386
408
  return 100; // 2x target = perfect score
@@ -459,10 +481,11 @@ class ScoringEngine {
459
481
  * - Quantization adjustment
460
482
  * - MoE efficiency bonus
461
483
  */
462
- estimateTPS(variant, hardware) {
484
+ estimateTPS(variant, hardware, runtime = 'ollama') {
463
485
  const params = variant.params_b || variant.paramsB || 7;
464
486
  const quant = (variant.quant || 'Q4_K_M').toUpperCase();
465
- const isMoE = variant.is_moe || variant.isMoE || false;
487
+ const normalizedRuntime = normalizeMoERuntime(runtime);
488
+ const parameterProfile = resolveMoEParameterProfile(variant);
466
489
 
467
490
  // Get backend speed coefficient (TPS for 7B Q4_K_M)
468
491
  const backendKey = this.getBackendKey(hardware);
@@ -500,11 +523,13 @@ class ScoringEngine {
500
523
  // Calculate base TPS
501
524
  let tps = baseSpeed * sizeMult * quantMult;
502
525
 
503
- // MoE models are faster because only ~1/3 of params are active
504
- // But communication overhead limits the speedup
505
- if (isMoE) {
506
- tps *= 1.8; // ~1.8x speedup (not 3x due to routing overhead)
507
- }
526
+ const moeSpeed = estimateMoESpeedMultiplier({
527
+ model: variant,
528
+ runtime: normalizedRuntime,
529
+ denseParamsB: params,
530
+ parameterProfile
531
+ });
532
+ if (moeSpeed.applied) tps *= moeSpeed.multiplier;
508
533
 
509
534
  // Apply minimum floor (can't go below 1 TPS)
510
535
  return Math.max(1, Math.round(tps));