llm-checker 3.2.5 → 3.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -6
- package/bin/enhanced_cli.js +13 -2
- package/package.json +1 -1
- package/src/hardware/backends/rocm-detector.js +20 -1
- package/src/hardware/detector.js +75 -10
- package/src/hardware/unified-detector.js +49 -10
- package/src/index.js +19 -4
- package/src/models/deterministic-selector.js +712 -38
- package/src/models/intelligent-selector.js +2 -0
- package/src/models/moe-assumptions.js +311 -0
- package/src/models/scoring-engine.js +38 -13
|
@@ -29,6 +29,7 @@ class IntelligentSelector {
|
|
|
29
29
|
useCase: 'general',
|
|
30
30
|
targetContext: 8192,
|
|
31
31
|
targetTPS: 20,
|
|
32
|
+
runtime: 'ollama',
|
|
32
33
|
preferQuantization: null, // null = auto select
|
|
33
34
|
preferFamily: null,
|
|
34
35
|
maxSize: null, // null = auto from hardware
|
|
@@ -70,6 +71,7 @@ class IntelligentSelector {
|
|
|
70
71
|
useCase: opts.useCase,
|
|
71
72
|
targetContext: opts.targetContext,
|
|
72
73
|
targetTPS: opts.targetTPS,
|
|
74
|
+
runtime: opts.runtime,
|
|
73
75
|
headroom: opts.headroom || 2
|
|
74
76
|
});
|
|
75
77
|
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Canonical MoE helpers shared across recommendation/scoring paths.
|
|
3
|
+
*
|
|
4
|
+
* Centralizes:
|
|
5
|
+
* - MoE feature detection/normalization
|
|
6
|
+
* - Active-vs-total parameter fallback logic
|
|
7
|
+
* - Runtime-aware routing/offload overhead profiles for speed estimation
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
 * Frozen table of MoE overhead profiles, keyed by canonical runtime name.
 *
 * Each row below is laid out as:
 *   [runtime, routingOverhead, communicationOverhead, offloadOverhead, maxEffectiveGain, notes]
 *
 * The three overhead values are fractions that getMoERuntimeProfile() turns
 * into multipliers via `1 - overhead`; maxEffectiveGain is the upper bound
 * that estimateMoESpeedMultiplier() clamps the final multiplier to.
 * The outer table and every profile object are frozen (shallowly).
 */
const MOE_RUNTIME_PROFILES = Object.freeze(
    Object.fromEntries(
        [
            ['ollama', 0.18, 0.13, 0.08, 2.35, ['generic router path', 'mixed expert communication', 'partial offload risk']],
            ['vllm', 0.12, 0.08, 0.04, 2.65, ['optimized scheduler', 'better expert batching', 'lower offload pressure']],
            ['mlx', 0.16, 0.10, 0.05, 2.45, ['apple-unified memory path', 'metal expert routing', 'reduced copy overhead']],
            ['llama.cpp', 0.20, 0.14, 0.09, 2.30, ['portable backend path', 'higher routing overhead', 'manual offload tuning']]
        ].map(([runtime, routingOverhead, communicationOverhead, offloadOverhead, maxEffectiveGain, notes]) => [
            runtime,
            Object.freeze({
                runtime,
                routingOverhead,
                communicationOverhead,
                offloadOverhead,
                maxEffectiveGain,
                notes
            })
        ])
    )
);
|
|
44
|
+
|
|
45
|
+
/**
 * Frozen lookup from every accepted (lower-case) runtime spelling to its
 * canonical runtime name. Built from [canonical, spellings] groups so each
 * canonical name is written once.
 */
const RUNTIME_ALIASES = Object.freeze(
    Object.fromEntries(
        [
            ['ollama', ['ollama']],
            ['vllm', ['vllm']],
            ['mlx', ['mlx', 'mlx-lm', 'mlx_lm']],
            ['llama.cpp', ['llama.cpp', 'llamacpp', 'llama_cpp']]
        ].flatMap(([canonical, spellings]) => spellings.map((spelling) => [spelling, canonical]))
    )
);
|
|
55
|
+
|
|
56
|
+
/**
 * Parse a parameter count into billions of parameters.
 *
 * Accepted inputs:
 * - a finite positive number, returned unchanged (already treated as billions);
 * - a string containing a number with an optional `b` (billions) or
 *   `m` (millions) suffix, case-insensitive, e.g. "7b", "13 B", "500m", ".5b".
 *   A number without a suffix is treated as billions. Only the first number
 *   found in the string is used.
 *
 * @param {*} rawValue - Raw metadata value.
 * @returns {number|null} Positive size in billions, or null when the value is
 *   missing, not a number/string, non-positive, or contains no number.
 */
function parseBillionsValue(rawValue) {
    if (rawValue === null || rawValue === undefined || rawValue === '') return null;

    if (typeof rawValue === 'number') {
        return Number.isFinite(rawValue) && rawValue > 0 ? rawValue : null;
    }

    if (typeof rawValue !== 'string') return null;

    const normalized = rawValue.trim().toLowerCase();
    if (!normalized) return null;

    // The second alternative accepts a leading-dot decimal (".5b"), which would
    // otherwise be read as "5b". It is anchored to the start of the string so a
    // dot inside a name (e.g. "gemma.2b") is still parsed as before.
    const match = normalized.match(/(\d+\.?\d*|^\.\d+)\s*([bm])?/);
    if (!match) return null;

    const value = Number(match[1]);
    if (!Number.isFinite(value) || value <= 0) return null;

    // Input is already lower-cased, so the captured suffix is 'b', 'm' or absent.
    const suffix = match[2] || 'b';
    return suffix === 'm' ? value / 1000 : value;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Coerce a raw value to a finite, strictly positive number.
 *
 * @param {*} rawValue - Value to coerce with Number().
 * @returns {number|null} The coerced number, or null when the value is
 *   null/undefined/empty string, or does not coerce to a finite number > 0.
 */
function parsePositiveNumber(rawValue) {
    const isBlank = rawValue === null || rawValue === undefined || rawValue === '';
    if (isBlank) return null;

    const parsed = Number(rawValue);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Restrict a number to the inclusive range [min, max].
 *
 * @param {number} value - Number to restrict.
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} value raised to at least min, then lowered to at most max.
 */
function clamp(value, min, max) {
    const raisedToMin = Math.max(min, value);
    return Math.min(max, raisedToMin);
}
|
|
88
|
+
|
|
89
|
+
/**
 * Map a user-supplied runtime name to its canonical runtime key.
 *
 * The input is stringified, trimmed and lower-cased, then looked up in
 * RUNTIME_ALIASES. Missing, empty or unrecognized names resolve to 'ollama'.
 *
 * @param {*} [runtime='ollama'] - Runtime name or alias (e.g. 'MLX-LM', 'llamacpp').
 * @returns {string} Canonical runtime name; always a string.
 */
function normalizeMoERuntime(runtime = 'ollama') {
    const normalized = String(runtime || 'ollama').trim().toLowerCase();

    // Own-property check: a plain `RUNTIME_ALIASES[normalized]` lookup would
    // resolve inherited keys such as "constructor" or "__proto__" to truthy
    // non-string values and return them instead of the 'ollama' fallback.
    return Object.prototype.hasOwnProperty.call(RUNTIME_ALIASES, normalized)
        ? RUNTIME_ALIASES[normalized]
        : 'ollama';
}
|
|
93
|
+
|
|
94
|
+
/**
 * Look up the MoE overhead profile for a runtime and attach derived multipliers.
 *
 * The runtime is first canonicalized with normalizeMoERuntime(); when no
 * profile exists for the canonical name, the 'ollama' profile is used.
 *
 * @param {*} [runtime='ollama'] - Runtime name or alias.
 * @returns {object} A new object holding the stored profile fields, with
 *   `runtime` set to the canonical name, plus `routingMultiplier`,
 *   `communicationMultiplier`, `offloadMultiplier` (each `1 - overhead`) and
 *   `overheadMultiplier` (their product).
 */
function getMoERuntimeProfile(runtime = 'ollama') {
    const runtimeKey = normalizeMoERuntime(runtime);
    const baseProfile = MOE_RUNTIME_PROFILES[runtimeKey] || MOE_RUNTIME_PROFILES.ollama;
    const { routingOverhead, communicationOverhead, offloadOverhead } = baseProfile;

    // Each overhead is a fractional loss; convert it to a retained-throughput factor.
    const routingMultiplier = 1 - routingOverhead;
    const communicationMultiplier = 1 - communicationOverhead;
    const offloadMultiplier = 1 - offloadOverhead;

    return Object.assign({}, baseProfile, {
        runtime: runtimeKey,
        routingMultiplier,
        communicationMultiplier,
        offloadMultiplier,
        overheadMultiplier: routingMultiplier * communicationMultiplier * offloadMultiplier
    });
}
|
|
112
|
+
|
|
113
|
+
/**
 * Collect MoE-related metadata for a model/variant pair.
 *
 * Each field is looked up under several snake_case/camelCase aliases, first
 * on `variant` and then on `model`; the first alias whose value parses to a
 * positive number wins.
 *
 * A model is flagged as MoE when any of the following holds: an explicit
 * `is_moe`/`isMoE` flag on either object, a parseable total or active
 * parameter count, both an expert count and an active-experts count, or
 * `baseText` containing "moe" or "mixtral" (case-insensitive).
 *
 * @param {object} [options]
 * @param {object|null} [options.model={}] - Model-level metadata; null is treated as empty.
 * @param {object|null} [options.variant={}] - Variant-level metadata; null is treated as empty.
 * @param {number|string|null} [options.paramsB=null] - Parameter count, parsed with parseBillionsValue().
 * @param {string} [options.baseText=''] - Free text scanned for MoE keywords.
 * @returns {{isMoE: boolean, totalParamsB: (number|null), activeParamsB: (number|null),
 *   expertCount: (number|null), expertsActivePerToken: (number|null), paramsB: (number|null)}}
 */
function extractMoEMetadata({ model = {}, variant = {}, paramsB = null, baseText = '' } = {}) {
    // Destructuring defaults only cover `undefined`; guard explicit nulls so a
    // missing variant/model does not throw on property access.
    const variantFields = variant ?? {};
    const modelFields = model ?? {};

    // First alias (variant before model) whose value parses to a finite number.
    const firstParsed = (aliases, parse) => {
        for (const fields of [variantFields, modelFields]) {
            for (const alias of aliases) {
                const parsed = parse(fields[alias]);
                if (Number.isFinite(parsed)) return parsed;
            }
        }
        return null;
    };

    const totalParamsB = firstParsed(
        ['total_params_b', 'totalParamsB', 'total_params', 'totalParams'],
        parseBillionsValue
    );
    const activeParamsB = firstParsed(
        ['active_params_b', 'activeParamsB', 'active_params', 'activeParams'],
        parseBillionsValue
    );
    const expertCount = firstParsed(['expert_count', 'expertCount'], parsePositiveNumber);
    const expertsActivePerToken = firstParsed(
        ['experts_active_per_token', 'expertsActivePerToken', 'active_experts', 'activeExperts'],
        parsePositiveNumber
    );

    const text = String(baseText || '').toLowerCase();
    const isMoE = Boolean(
        variantFields.is_moe ||
        variantFields.isMoE ||
        modelFields.is_moe ||
        modelFields.isMoE ||
        totalParamsB !== null ||
        activeParamsB !== null ||
        (expertCount !== null && expertsActivePerToken !== null) ||
        text.includes('moe') ||
        text.includes('mixtral')
    );

    return {
        isMoE,
        totalParamsB,
        activeParamsB,
        expertCount,
        expertsActivePerToken,
        paramsB: parseBillionsValue(paramsB)
    };
}
|
|
184
|
+
|
|
185
|
+
/**
 * Resolve the parameter count that should drive speed estimation for a model.
 *
 * Every field is read under its camelCase and snake_case aliases (including
 * `params_b`, the key the scoring engine uses for variants); the first alias
 * whose value parses to a positive number wins.
 *
 * The model is treated as MoE when it carries an `isMoE`/`is_moe` flag or any
 * parseable MoE metadata (total/active params, expert count, active experts).
 * For MoE models the effective parameter count is chosen in this order:
 *   1. active params metadata (capped at total params) -> 'moe_active_metadata'
 *   2. total params * min(1, activeExperts / expertCount) (floored at 0.1) -> 'moe_derived_expert_ratio'
 *   3. total params                                     -> 'moe_fallback_total_params'
 *   4. the model's own params                           -> 'moe_fallback_model_params'
 *   5. 1                                                -> 'moe_fallback_default'
 * Non-MoE models use their own params (or 1 when unknown) -> 'dense_params'.
 *
 * @param {object|null} [model={}] - Model or variant metadata; null is treated as empty.
 * @returns {{isMoE: boolean, totalParamsB: (number|null), activeParamsB: (number|null),
 *   expertCount: (number|null), expertsActivePerToken: (number|null),
 *   effectiveParamsB: number, assumptionSource: string}}
 */
function resolveMoEParameterProfile(model = {}) {
    // The default parameter only covers `undefined`; tolerate an explicit null.
    const fields = model ?? {};

    // First alias whose value parses to a finite number, else null. Trying every
    // alias (rather than the first non-nullish one) keeps a malformed value under
    // one spelling from hiding a valid value under another.
    const firstParsed = (aliases, parse) => {
        for (const alias of aliases) {
            const parsed = parse(fields[alias]);
            if (Number.isFinite(parsed)) return parsed;
        }
        return null;
    };

    const denseParamsB = firstParsed(['paramsB', 'params_b'], parseBillionsValue);
    const totalParamsB = firstParsed(
        ['totalParamsB', 'total_params_b', 'total_params', 'totalParams'],
        parseBillionsValue
    );
    const activeParamsBRaw = firstParsed(
        ['activeParamsB', 'active_params_b', 'active_params', 'activeParams'],
        parseBillionsValue
    );
    const expertCount = firstParsed(['expertCount', 'expert_count'], parsePositiveNumber);
    const expertsActivePerToken = firstParsed(
        ['expertsActivePerToken', 'experts_active_per_token', 'activeExperts', 'active_experts'],
        parsePositiveNumber
    );

    // Active params can never exceed total params; cap when both are known.
    let activeParamsB = activeParamsBRaw;
    if (activeParamsBRaw !== null && totalParamsB !== null) {
        activeParamsB = Math.min(activeParamsBRaw, totalParamsB);
    }

    const hasMetadataSignal =
        totalParamsB !== null ||
        activeParamsB !== null ||
        expertCount !== null ||
        expertsActivePerToken !== null;
    const isMoE = Boolean(fields.isMoE || fields.is_moe || hasMetadataSignal);

    let effectiveParamsB = denseParamsB !== null ? denseParamsB : 1;
    let assumptionSource = 'dense_params';

    if (isMoE) {
        if (activeParamsB !== null) {
            effectiveParamsB = activeParamsB;
            assumptionSource = 'moe_active_metadata';
        } else if (totalParamsB !== null && expertCount !== null && expertsActivePerToken !== null) {
            // expertCount is guaranteed > 0 by parsePositiveNumber, so the division is safe.
            const activeRatio = Math.min(1, expertsActivePerToken / expertCount);
            effectiveParamsB = Math.max(0.1, totalParamsB * activeRatio);
            assumptionSource = 'moe_derived_expert_ratio';
        } else if (totalParamsB !== null) {
            effectiveParamsB = totalParamsB;
            assumptionSource = 'moe_fallback_total_params';
        } else if (denseParamsB !== null) {
            effectiveParamsB = denseParamsB;
            assumptionSource = 'moe_fallback_model_params';
        } else {
            effectiveParamsB = 1;
            assumptionSource = 'moe_fallback_default';
        }
    }

    // Final guard so callers can always divide by effectiveParamsB.
    const normalizedEffective = Number.isFinite(effectiveParamsB) && effectiveParamsB > 0 ? effectiveParamsB : 1;

    return {
        isMoE,
        totalParamsB,
        activeParamsB,
        expertCount,
        expertsActivePerToken,
        effectiveParamsB: normalizedEffective,
        assumptionSource
    };
}
|
|
252
|
+
|
|
253
|
+
/**
 * Estimate the throughput multiplier an MoE model gains over a dense model of
 * the same total size, after runtime-specific overheads.
 *
 * For MoE models:
 *   theoreticalSpeedup = clamp(denseParams / max(activeParams, 0.1), 1, 4)
 *   multiplier         = clamp(theoreticalSpeedup * overheadMultiplier, 1, maxEffectiveGain || 2.5)
 * For non-MoE models no multiplier is applied (all factors are 1).
 *
 * The dense parameter count is taken from the first available of: the
 * `denseParamsB` argument, `model.paramsB`, `model.params_b`, the profile's
 * total params, the profile's effective params, then 1.
 *
 * @param {object} [options]
 * @param {object|null} [options.model={}] - Model or variant metadata; null is treated as empty.
 * @param {string} [options.runtime='ollama'] - Runtime name or alias.
 * @param {number|string|null} [options.denseParamsB=null] - Total/dense parameter count override.
 * @param {object|null} [options.parameterProfile=null] - Precomputed result of
 *   resolveMoEParameterProfile(); computed from `model` when omitted.
 * @returns {{applied: boolean, runtime: string, runtimeProfile: object, denseParamsB: number,
 *   activeParamsB: number, theoreticalSpeedup: number, overheadMultiplier: number,
 *   multiplier: number, assumptionSource: string}}
 */
function estimateMoESpeedMultiplier({
    model = {},
    runtime = 'ollama',
    denseParamsB = null,
    parameterProfile = null
} = {}) {
    // Destructuring defaults only cover `undefined`; tolerate an explicit null model.
    const fields = model ?? {};
    const profile = parameterProfile || resolveMoEParameterProfile(fields);
    const runtimeProfile = getMoERuntimeProfile(runtime);

    // `params_b` is checked alongside `paramsB` because callers in the scoring
    // engine describe variants with the snake_case key.
    const denseParams =
        parseBillionsValue(denseParamsB) ??
        parseBillionsValue(fields.paramsB) ??
        parseBillionsValue(fields.params_b) ??
        profile.totalParamsB ??
        profile.effectiveParamsB ??
        1;
    const activeParams = profile.effectiveParamsB || denseParams;

    let theoreticalSpeedup = 1;
    let overheadMultiplier = 1;
    let multiplier = 1;

    if (profile.isMoE) {
        // Fewer active params per token => proportionally faster decode, capped at 4x.
        theoreticalSpeedup = clamp(denseParams / Math.max(activeParams, 0.1), 1, 4);
        overheadMultiplier = runtimeProfile.overheadMultiplier;
        // Never report a slowdown (< 1) and never exceed the runtime's ceiling.
        multiplier = clamp(theoreticalSpeedup * overheadMultiplier, 1, runtimeProfile.maxEffectiveGain || 2.5);
    }

    return {
        applied: Boolean(profile.isMoE),
        runtime: runtimeProfile.runtime,
        runtimeProfile,
        denseParamsB: denseParams,
        activeParamsB: activeParams,
        theoreticalSpeedup,
        overheadMultiplier,
        multiplier,
        assumptionSource: profile.assumptionSource
    };
}
|
|
301
|
+
|
|
302
|
+
module.exports = {
|
|
303
|
+
MOE_RUNTIME_PROFILES,
|
|
304
|
+
parseBillionsValue,
|
|
305
|
+
parsePositiveNumber,
|
|
306
|
+
normalizeMoERuntime,
|
|
307
|
+
getMoERuntimeProfile,
|
|
308
|
+
extractMoEMetadata,
|
|
309
|
+
resolveMoEParameterProfile,
|
|
310
|
+
estimateMoESpeedMultiplier
|
|
311
|
+
};
|
|
@@ -11,6 +11,11 @@
|
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
const { SCORING_ENGINE_WEIGHTS } = require('./scoring-config');
|
|
14
|
+
const {
|
|
15
|
+
normalizeMoERuntime,
|
|
16
|
+
resolveMoEParameterProfile,
|
|
17
|
+
estimateMoESpeedMultiplier
|
|
18
|
+
} = require('./moe-assumptions');
|
|
14
19
|
|
|
15
20
|
class ScoringEngine {
|
|
16
21
|
constructor(options = {}) {
|
|
@@ -303,14 +308,22 @@ class ScoringEngine {
|
|
|
303
308
|
const useCase = options.useCase || 'general';
|
|
304
309
|
const targetContext = options.targetContext || 8192;
|
|
305
310
|
const targetTPS = options.targetTPS || 20; // Target tokens per second
|
|
311
|
+
const runtime = normalizeMoERuntime(options.runtime || 'ollama');
|
|
306
312
|
|
|
307
313
|
const weights = this.weightPresets[useCase] || this.weightPresets.general;
|
|
308
314
|
|
|
309
315
|
// Calculate individual scores
|
|
310
316
|
const Q = this.calculateQualityScore(variant, useCase);
|
|
311
|
-
const S = this.calculateSpeedScore(variant, hardware, targetTPS);
|
|
317
|
+
const S = this.calculateSpeedScore(variant, hardware, targetTPS, runtime);
|
|
312
318
|
const F = this.calculateFitScore(variant, hardware);
|
|
313
319
|
const C = this.calculateContextScore(variant, targetContext);
|
|
320
|
+
const moeProfile = resolveMoEParameterProfile(variant);
|
|
321
|
+
const moeSpeed = estimateMoESpeedMultiplier({
|
|
322
|
+
model: variant,
|
|
323
|
+
runtime,
|
|
324
|
+
denseParamsB: variant.params_b || variant.paramsB || null,
|
|
325
|
+
parameterProfile: moeProfile
|
|
326
|
+
});
|
|
314
327
|
|
|
315
328
|
// Calculate weighted final score
|
|
316
329
|
const finalScore = Math.round(
|
|
@@ -334,8 +347,17 @@ class ScoringEngine {
|
|
|
334
347
|
family: this.extractFamily(variant.model_id || variant.modelId),
|
|
335
348
|
params: variant.params_b || variant.paramsB,
|
|
336
349
|
quant: variant.quant,
|
|
337
|
-
estimatedTPS: this.estimateTPS(variant, hardware),
|
|
338
|
-
estimatedSize: variant.size_gb || variant.sizeGB
|
|
350
|
+
estimatedTPS: this.estimateTPS(variant, hardware, runtime),
|
|
351
|
+
estimatedSize: variant.size_gb || variant.sizeGB,
|
|
352
|
+
runtime,
|
|
353
|
+
moe: {
|
|
354
|
+
isMoE: moeProfile.isMoE,
|
|
355
|
+
assumptionSource: moeProfile.assumptionSource,
|
|
356
|
+
activeParamsB: moeProfile.activeParamsB,
|
|
357
|
+
totalParamsB: moeProfile.totalParamsB,
|
|
358
|
+
speedMultiplier: moeSpeed.multiplier,
|
|
359
|
+
overheadMultiplier: moeSpeed.overheadMultiplier
|
|
360
|
+
}
|
|
339
361
|
}
|
|
340
362
|
};
|
|
341
363
|
}
|
|
@@ -368,7 +390,7 @@ class ScoringEngine {
|
|
|
368
390
|
const taskBonus = this.getTaskBonus(family, useCase);
|
|
369
391
|
|
|
370
392
|
// MoE bonus (mixture of experts models are often better quality/speed ratio)
|
|
371
|
-
const moeBonus = (variant.
|
|
393
|
+
const moeBonus = resolveMoEParameterProfile(variant).isMoE ? 5 : 0;
|
|
372
394
|
|
|
373
395
|
const score = baseScore + paramBonus - quantPenalty + taskBonus + moeBonus;
|
|
374
396
|
|
|
@@ -379,8 +401,8 @@ class ScoringEngine {
|
|
|
379
401
|
* Calculate Speed score (S)
|
|
380
402
|
* Based on estimated tokens per second vs target
|
|
381
403
|
*/
|
|
382
|
-
calculateSpeedScore(variant, hardware, targetTPS) {
|
|
383
|
-
const estimatedTPS = this.estimateTPS(variant, hardware);
|
|
404
|
+
calculateSpeedScore(variant, hardware, targetTPS, runtime = 'ollama') {
|
|
405
|
+
const estimatedTPS = this.estimateTPS(variant, hardware, runtime);
|
|
384
406
|
|
|
385
407
|
if (estimatedTPS >= targetTPS * 2) {
|
|
386
408
|
return 100; // 2x target = perfect score
|
|
@@ -459,10 +481,11 @@ class ScoringEngine {
|
|
|
459
481
|
* - Quantization adjustment
|
|
460
482
|
* - MoE efficiency bonus
|
|
461
483
|
*/
|
|
462
|
-
estimateTPS(variant, hardware) {
|
|
484
|
+
estimateTPS(variant, hardware, runtime = 'ollama') {
|
|
463
485
|
const params = variant.params_b || variant.paramsB || 7;
|
|
464
486
|
const quant = (variant.quant || 'Q4_K_M').toUpperCase();
|
|
465
|
-
const
|
|
487
|
+
const normalizedRuntime = normalizeMoERuntime(runtime);
|
|
488
|
+
const parameterProfile = resolveMoEParameterProfile(variant);
|
|
466
489
|
|
|
467
490
|
// Get backend speed coefficient (TPS for 7B Q4_K_M)
|
|
468
491
|
const backendKey = this.getBackendKey(hardware);
|
|
@@ -500,11 +523,13 @@ class ScoringEngine {
|
|
|
500
523
|
// Calculate base TPS
|
|
501
524
|
let tps = baseSpeed * sizeMult * quantMult;
|
|
502
525
|
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
526
|
+
const moeSpeed = estimateMoESpeedMultiplier({
|
|
527
|
+
model: variant,
|
|
528
|
+
runtime: normalizedRuntime,
|
|
529
|
+
denseParamsB: params,
|
|
530
|
+
parameterProfile
|
|
531
|
+
});
|
|
532
|
+
if (moeSpeed.applied) tps *= moeSpeed.multiplier;
|
|
508
533
|
|
|
509
534
|
// Apply minimum floor (can't go below 1 TPS)
|
|
510
535
|
return Math.max(1, Math.round(tps));
|