llm-checker 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +418 -0
- package/analyzer/compatibility.js +584 -0
- package/analyzer/performance.js +505 -0
- package/bin/CLAUDE.md +12 -0
- package/bin/enhanced_cli.js +3118 -0
- package/bin/test-deterministic.js +41 -0
- package/package.json +96 -0
- package/src/CLAUDE.md +12 -0
- package/src/ai/intelligent-selector.js +615 -0
- package/src/ai/model-selector.js +312 -0
- package/src/ai/multi-objective-selector.js +820 -0
- package/src/commands/check.js +58 -0
- package/src/data/CLAUDE.md +11 -0
- package/src/data/model-database.js +637 -0
- package/src/data/sync-manager.js +279 -0
- package/src/hardware/CLAUDE.md +12 -0
- package/src/hardware/backends/CLAUDE.md +11 -0
- package/src/hardware/backends/apple-silicon.js +318 -0
- package/src/hardware/backends/cpu-detector.js +490 -0
- package/src/hardware/backends/cuda-detector.js +417 -0
- package/src/hardware/backends/intel-detector.js +436 -0
- package/src/hardware/backends/rocm-detector.js +440 -0
- package/src/hardware/detector.js +573 -0
- package/src/hardware/pc-optimizer.js +635 -0
- package/src/hardware/specs.js +286 -0
- package/src/hardware/unified-detector.js +442 -0
- package/src/index.js +2289 -0
- package/src/models/CLAUDE.md +17 -0
- package/src/models/ai-check-selector.js +806 -0
- package/src/models/catalog.json +426 -0
- package/src/models/deterministic-selector.js +1145 -0
- package/src/models/expanded_database.js +1142 -0
- package/src/models/intelligent-selector.js +532 -0
- package/src/models/requirements.js +310 -0
- package/src/models/scoring-config.js +57 -0
- package/src/models/scoring-engine.js +715 -0
- package/src/ollama/.cache/README.md +33 -0
- package/src/ollama/CLAUDE.md +24 -0
- package/src/ollama/client.js +438 -0
- package/src/ollama/enhanced-client.js +113 -0
- package/src/ollama/enhanced-scraper.js +634 -0
- package/src/ollama/manager.js +357 -0
- package/src/ollama/native-scraper.js +776 -0
- package/src/plugins/CLAUDE.md +11 -0
- package/src/plugins/examples/custom_model_plugin.js +87 -0
- package/src/plugins/index.js +295 -0
- package/src/utils/CLAUDE.md +11 -0
- package/src/utils/config.js +359 -0
- package/src/utils/formatter.js +315 -0
- package/src/utils/logger.js +272 -0
- package/src/utils/model-classifier.js +167 -0
- package/src/utils/verbose-progress.js +266 -0
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scoring Engine - Multi-dimensional model scoring system
|
|
3
|
+
*
|
|
4
|
+
* Calculates scores based on:
|
|
5
|
+
* - Q (Quality): Model quality based on params, family, quantization
|
|
6
|
+
* - S (Speed): Estimated inference speed on target hardware
|
|
7
|
+
* - F (Fit): How well the model fits in available memory
|
|
8
|
+
* - C (Context): Context length capability
|
|
9
|
+
*
|
|
10
|
+
* FinalScore = Q × wQ + S × wS + F × wF + C × wC
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const { SCORING_ENGINE_WEIGHTS } = require('./scoring-config');
|
|
14
|
+
|
|
15
|
+
class ScoringEngine {
    /**
     * Build the lookup tables used by the scoring methods.
     *
     * @param {Object} [options] - Stored as `this.options`; not read by any
     *   method of this class.
     */
    constructor(options = {}) {
        // Weight presets from centralized config (keyed by use case)
        this.weightPresets = SCORING_ENGINE_WEIGHTS;

        // Model family quality rankings (0-100 base score)
        this.familyQuality = {
            // Frontier models
            'qwen2.5': 95,
            'qwen2': 90,
            'llama3.3': 95,
            'llama3.2': 92,
            'llama3.1': 90,
            'llama3': 88,
            'deepseek-v3': 96,
            'deepseek-v2.5': 94,
            'deepseek-coder-v2': 92,
            'deepseek-r1': 96,
            'gemma2': 90,
            'gemma': 82,
            'phi-4': 92,
            'phi-3.5': 88,
            'phi-3': 85,
            'phi-2': 75,
            'mistral-large': 94,
            'mistral': 85,
            'mixtral': 88,
            'command-r': 90,
            'command-r-plus': 93,

            // Coding specialists
            'qwen2.5-coder': 96,
            'codellama': 82,
            'starcoder2': 85,
            'deepseek-coder': 88,
            'codegemma': 80,
            'granite-code': 78,

            // Chat/instruct
            'yi': 85,
            'yi-coder': 88,
            'openchat': 78,
            'neural-chat': 75,
            'zephyr': 80,
            'openhermes': 82,
            'nous-hermes': 82,
            'dolphin': 80,
            'orca': 78,

            // Vision models
            'llava': 82,
            'llava-llama3': 85,
            'llava-phi3': 80,
            'bakllava': 78,
            'moondream': 75,

            // Embeddings
            'nomic-embed-text': 85,
            'mxbai-embed-large': 88,
            'all-minilm': 80,
            'snowflake-arctic-embed': 85,

            // Other notable models
            'solar': 82,
            'falcon': 75,
            'vicuna': 72,
            'wizardlm': 78,
            'aya': 85,
            'smollm': 70,
            'tinyllama': 65
        };

        // Quantization quality penalties (subtracted from base score)
        this.quantPenalties = {
            'FP16': 0,
            'F16': 0,
            'Q8_0': 2,
            'Q6_K': 4,
            'Q5_K_M': 6,
            'Q5_K_S': 7,
            'Q5_0': 8,
            'Q4_K_M': 10,
            'Q4_K_S': 11,
            'Q4_0': 12,
            'Q3_K_M': 16,
            'Q3_K_S': 18,
            'Q3_K_L': 15,
            'IQ4_XS': 11,
            'IQ4_NL': 10,
            'IQ3_XXS': 20,
            'IQ3_XS': 18,
            'IQ3_S': 17,
            'IQ2_XS': 25,
            'IQ2_XXS': 28,
            'Q2_K': 22,
            'Q2_K_S': 24
        };

        // Task-specific bonuses for model families
        this.taskBonuses = {
            coding: {
                'qwen2.5-coder': 15,
                'deepseek-coder': 12,
                'deepseek-coder-v2': 15,
                'codellama': 10,
                'starcoder2': 12,
                'codegemma': 8,
                'yi-coder': 10,
                'granite-code': 8
            },
            reasoning: {
                'deepseek-r1': 15,
                'qwen2.5': 10,
                'llama3.3': 10,
                'phi-4': 12,
                'command-r-plus': 10,
                'mistral-large': 10
            },
            chat: {
                'llama3.2': 10,
                'mistral': 8,
                'gemma2': 8,
                'openchat': 10,
                'neural-chat': 8,
                'dolphin': 8
            },
            vision: {
                'llava': 15,
                'llava-llama3': 18,
                'llava-phi3': 15,
                'bakllava': 12,
                'moondream': 10
            },
            embeddings: {
                'nomic-embed-text': 15,
                'mxbai-embed-large': 18,
                'all-minilm': 12,
                'snowflake-arctic-embed': 15
            },
            creative: {
                'mistral': 8,
                'mixtral': 10,
                'openhermes': 8,
                'dolphin': 10
            },
            multilingual: {
                'aya': 15,
                'qwen2.5': 10,
                'command-r': 12
            }
        };

        // Speed coefficients by backend (tokens/sec for 7B Q4_K_M as baseline)
        this.backendSpeed = {
            // NVIDIA
            'cuda_h100': 120,      // ~100-140 TPS for 7B Q4
            'cuda_a100': 90,       // ~80-100 TPS for 7B Q4
            'cuda_4090': 70,       // ~60-80 TPS for 7B Q4
            'cuda_4080': 55,       // ~50-60 TPS for 7B Q4
            'cuda_3090': 50,       // ~45-55 TPS for 7B Q4
            'cuda_3080': 40,       // ~35-45 TPS for 7B Q4
            'cuda_3070': 32,       // ~28-35 TPS for 7B Q4
            'cuda_3060': 25,       // ~20-28 TPS for 7B Q4
            'cuda_2080': 28,       // ~25-30 TPS for 7B Q4
            'cuda_default': 30,

            // AMD - slightly lower than equivalent NVIDIA
            'rocm_mi300': 100,
            'rocm_mi250': 70,
            'rocm_7900xtx': 55,
            'rocm_7900xt': 45,
            'rocm_7800xt': 38,
            'rocm_6900xt': 35,
            'rocm_default': 30,

            // Apple Silicon
            'metal_m4_ultra': 75,  // ~70-80 TPS for 7B Q4
            'metal_m4_max': 60,    // ~55-65 TPS for 7B Q4
            'metal_m4_pro': 45,    // ~40-50 TPS for 7B Q4
            'metal_m4': 35,        // ~30-40 TPS for 7B Q4
            'metal_m3_ultra': 65,
            'metal_m3_max': 50,
            'metal_m3_pro': 40,
            'metal_m3': 30,
            'metal_m2_ultra': 55,
            'metal_m2_max': 45,
            'metal_m2_pro': 35,
            'metal_m2': 28,
            'metal_m1_ultra': 45,
            'metal_m1_max': 38,
            'metal_m1_pro': 30,
            'metal_m1': 22,
            'metal_default': 30,

            // Intel Arc - limited real-world data
            'intel_arc_a770': 30,
            'intel_arc_a750': 25,
            'intel_arc_default': 20,

            // CPU - very conservative
            'cpu_avx512_amx': 12,  // Best case server CPU
            'cpu_avx512': 8,       // Good desktop CPU
            'cpu_avx2': 5,         // Most modern CPUs
            'cpu_neon': 4,         // Apple Silicon fallback
            'cpu_avx': 3,          // Older CPUs
            'cpu_default': 2       // Very old CPUs
        };

        // Quantization speed multipliers (relative to Q4_K_M baseline)
        this.quantSpeedMult = {
            'FP16': 0.5,           // Half speed of Q4
            'F16': 0.5,
            'Q8_0': 0.7,           // ~70% of Q4 speed
            'Q6_K': 0.85,          // ~85% of Q4 speed
            'Q5_K_M': 0.92,
            'Q5_K_S': 0.92,
            'Q5_0': 0.92,
            'Q4_K_M': 1.0,         // Baseline
            'Q4_K_S': 1.0,
            'Q4_0': 1.05,
            'Q3_K_M': 1.15,        // Faster but quality loss
            'Q3_K_S': 1.15,
            'Q3_K_L': 1.1,
            'IQ4_XS': 1.02,
            'IQ4_NL': 1.0,
            'IQ3_XXS': 1.2,
            'IQ3_XS': 1.18,
            'IQ3_S': 1.15,
            'IQ2_XS': 1.25,
            'IQ2_XXS': 1.28,
            'Q2_K': 1.22,
            'Q2_K_S': 1.25
        };

        this.options = options;
    }

    /**
     * Estimate model size from parameter count and quantization.
     *
     * @param {Object} variant - Reads `params_b`/`paramsB` (billions of params)
     *   and `quant` (defaults to 'Q4_K_M', matched case-insensitively).
     * @returns {number|null} Estimated size in GB, or null when the parameter
     *   count is missing or zero.
     */
    estimateSizeFromParams(variant) {
        const params = variant.params_b || variant.paramsB;
        if (!params) return null;

        const quant = (variant.quant || 'Q4_K_M').toUpperCase();

        // Ordered [substring, GB per 1B params]; first match wins.
        // 'Q4', 'Q3' and 'Q2' also match the IQ4_*/IQ3_*/IQ2_* names.
        const gbPerBillion = [
            ['FP16', 2],
            ['F16', 2],
            ['Q8', 1],
            ['Q6', 0.75],
            ['Q5', 0.6],
            ['Q4', 0.5],
            ['Q3', 0.4],
            ['Q2', 0.3]
        ];

        for (const [needle, factor] of gbPerBillion) {
            if (quant.includes(needle)) return params * factor;
        }

        return params * 0.5; // Unknown quantization: default to the Q4 estimate
    }

    /**
     * Get model size (actual or estimated).
     *
     * @param {Object} variant - Reads `size_gb`/`sizeGB`, falling back to
     *   estimateSizeFromParams when no positive size is present.
     * @returns {number|null} Size in GB, or null if it cannot be determined.
     */
    getModelSize(variant) {
        const size = variant.size_gb || variant.sizeGB;
        if (size && size > 0) return size;
        return this.estimateSizeFromParams(variant);
    }

    /**
     * Calculate overall score for a model variant.
     *
     * @param {Object} variant - Model variant data
     * @param {Object} hardware - Hardware info (reads `summary`, `cpu`, `backends`)
     * @param {Object} [options] - Scoring options
     * @param {string} [options.useCase='general'] - Selects weight preset and task bonuses
     * @param {number} [options.targetContext=8192] - Desired context length
     * @param {number} [options.targetTPS=20] - Desired tokens per second
     * @param {number} [options.headroom=2] - GB reserved for the system in the fit score
     * @returns {Object} `{ final, components, weights, meta }`; `final` is clamped to 0-100
     */
    score(variant, hardware, options = {}) {
        const useCase = options.useCase || 'general';
        const targetContext = options.targetContext || 8192;
        const targetTPS = options.targetTPS || 20;
        // ?? so that an explicit headroom of 0 is honoured
        const headroom = options.headroom ?? 2;

        const weights = this.weightPresets[useCase] || this.weightPresets.general;

        // Individual component scores
        const Q = this.calculateQualityScore(variant, useCase);
        const S = this.calculateSpeedScore(variant, hardware, targetTPS);
        const F = this.calculateFitScore(variant, hardware, headroom);
        const C = this.calculateContextScore(variant, targetContext);

        // Weighted final score
        const finalScore = Math.round(
            Q * weights.Q +
            S * weights.S +
            F * weights.F +
            C * weights.C
        );

        return {
            final: Math.min(100, Math.max(0, finalScore)),
            components: {
                quality: Math.round(Q),
                speed: Math.round(S),
                fit: Math.round(F),
                context: Math.round(C)
            },
            weights,
            meta: {
                useCase,
                family: this.extractFamily(variant.model_id || variant.modelId),
                params: variant.params_b || variant.paramsB,
                quant: variant.quant,
                estimatedTPS: this.estimateTPS(variant, hardware),
                // Same size the fit score used: actual if known, else estimated
                estimatedSize: this.getModelSize(variant)
            }
        };
    }

    /**
     * Calculate Quality score (Q).
     * base family score + parameter bonus - quantization penalty
     * + task bonus + MoE bonus, clamped to 0-100.
     *
     * @param {Object} variant - Reads model id, params, quant and MoE flag
     * @param {string} useCase - Key into the task bonus table
     * @returns {number} Quality score in [0, 100]
     */
    calculateQualityScore(variant, useCase) {
        const family = this.extractFamily(variant.model_id || variant.modelId);
        const params = variant.params_b || variant.paramsB || 7;
        const quant = (variant.quant || 'Q4_K_M').toUpperCase();

        const baseScore = this.getFamilyScore(family);

        // Parameter size bonus (larger models generally better, with diminishing returns)
        let paramBonus = 0;
        if (params >= 70) paramBonus = 15;
        else if (params >= 32) paramBonus = 12;
        else if (params >= 14) paramBonus = 8;
        else if (params >= 7) paramBonus = 5;
        else if (params >= 3) paramBonus = 2;

        // ?? rather than ||: FP16/F16 carry a legitimate penalty of 0, which
        // must not fall through to the unknown-quantization default of 10.
        const quantPenalty = this.quantPenalties[quant] ?? 10;

        const taskBonus = this.getTaskBonus(family, useCase);

        // Flat bonus for mixture-of-experts variants
        const moeBonus = (variant.is_moe || variant.isMoE) ? 5 : 0;

        const score = baseScore + paramBonus - quantPenalty + taskBonus + moeBonus;

        return Math.min(100, Math.max(0, score));
    }

    /**
     * Calculate Speed score (S) from estimated tokens per second vs target.
     *
     * @param {Object} variant - Model variant data
     * @param {Object} hardware - Hardware info
     * @param {number} targetTPS - Desired tokens per second
     * @returns {number} Speed score in [0, 100]
     */
    calculateSpeedScore(variant, hardware, targetTPS) {
        const estimatedTPS = this.estimateTPS(variant, hardware);

        if (estimatedTPS >= targetTPS * 2) {
            return 100; // 2x target = perfect score
        }
        if (estimatedTPS >= targetTPS) {
            // Linear scaling from 80-100 for 1x-2x target
            return 80 + (estimatedTPS - targetTPS) / targetTPS * 20;
        }
        if (estimatedTPS >= targetTPS * 0.5) {
            // Linear in the ratio: yields 65 at 0.5x target up to 80 at 1x target
            return 50 + (estimatedTPS / targetTPS) * 30;
        }
        // Below 50% of target: steep penalty (at most 25)
        return Math.max(0, (estimatedTPS / targetTPS) * 50);
    }

    /**
     * Calculate Fit score (F) from how much of the usable memory the model occupies.
     *
     * @param {Object} variant - Model variant data
     * @param {Object} hardware - Reads `summary.effectiveMemory` (GB, default 8)
     * @param {number} [headroom=2] - GB reserved for the system
     * @returns {number} Fit score in [0, 100]; 70 when the model size is unknown
     */
    calculateFitScore(variant, hardware, headroom = 2) {
        const modelSize = this.getModelSize(variant);

        // No size info available - give moderate score
        if (!modelSize) return 70;

        const availableMemory = hardware?.summary?.effectiveMemory || 8;
        const effectiveAvailable = availableMemory - headroom;

        // Nothing left after headroom: without this guard a negative divisor
        // makes `usage` negative and the model would score a perfect 100.
        if (effectiveAvailable <= 0) return 0;

        const usage = modelSize / effectiveAvailable;

        if (usage <= 0.7) {
            return 100; // Plenty of room
        }
        if (usage <= 0.85) {
            // Comfortable fit: 100 down to 90
            return 90 + (0.85 - usage) / 0.15 * 10;
        }
        if (usage <= 1.0) {
            // Tight fit: 90 down to 70
            return 70 + (1.0 - usage) / 0.15 * 20;
        }
        if (usage <= 1.2) {
            // May work with swapping: 50 down to 30
            return 50 - (usage - 1.0) * 100;
        }
        return 0; // Won't fit
    }

    /**
     * Calculate Context score (C) from context length capability vs target.
     *
     * @param {Object} variant - Reads `context_length`/`contextLength` (default 4096)
     * @param {number} targetContext - Desired context length
     * @returns {number} Context score in [0, 100]
     */
    calculateContextScore(variant, targetContext) {
        const contextLength = variant.context_length || variant.contextLength || 4096;

        if (contextLength >= targetContext * 2) {
            return 100; // Much more than needed
        }
        if (contextLength >= targetContext) {
            // Meets requirement
            return 85 + (contextLength - targetContext) / targetContext * 15;
        }
        if (contextLength >= targetContext * 0.5) {
            // Partially meets requirement
            return 50 + (contextLength / targetContext) * 35;
        }
        // Inadequate
        return (contextLength / targetContext) * 50;
    }

    /**
     * Estimate tokens per second.
     *
     * tps = backend baseline (7B Q4_K_M) x size multiplier x quant multiplier,
     * x1.8 for MoE variants, rounded and floored at 1.
     *
     * @param {Object} variant - Reads params (default 7), quant (default 'Q4_K_M'), MoE flag
     * @param {Object} hardware - Hardware info passed to getBackendKey
     * @returns {number} Integer tokens-per-second estimate, at least 1
     */
    estimateTPS(variant, hardware) {
        const params = variant.params_b || variant.paramsB || 7;
        const quant = (variant.quant || 'Q4_K_M').toUpperCase();
        const isMoE = variant.is_moe || variant.isMoE || false;

        // Backend speed coefficient (TPS for 7B Q4_K_M)
        const backendKey = this.getBackendKey(hardware);
        const baseSpeed = this.backendSpeed[backendKey] || this.backendSpeed.cpu_default;

        // Quantization multiplier (relative to Q4_K_M = 1.0)
        const quantMult = this.quantSpeedMult[quant] || 1.0;

        // Size scaling relative to the 7B baseline
        const sizeRatio = 7 / params;

        let sizeMult;
        if (params < 3) {
            // Very small models: damped speedup, capped at 2x baseline
            sizeMult = Math.min(2.0, 1 + (sizeRatio - 1) * 0.35);
        } else if (params < 7) {
            // Small models: damped, non-linear speedup
            sizeMult = 1 + (sizeRatio - 1) * 0.35;
        } else if (params <= 14) {
            // Medium models: linear in the size ratio
            sizeMult = sizeRatio;
        } else if (params <= 32) {
            // Large models: slight efficiency boost
            sizeMult = sizeRatio * 1.1;
        } else {
            // Very large models: slower than linear
            sizeMult = sizeRatio * 0.85;
        }

        let tps = baseSpeed * sizeMult * quantMult;

        // MoE variants get a flat 1.8x speedup
        if (isMoE) {
            tps *= 1.8;
        }

        // Apply minimum floor (can't go below 1 TPS)
        return Math.max(1, Math.round(tps));
    }

    /**
     * Get backend speed key from hardware info.
     *
     * @param {Object} hardware - Reads `summary.bestBackend`, `summary.gpuModel`,
     *   and `cpu.capabilities` (or `backends.cpu.info.capabilities`)
     * @returns {string} A key of `this.backendSpeed`; 'cpu_default' when no
     *   summary is available or nothing more specific matches
     */
    getBackendKey(hardware) {
        if (!hardware?.summary) return 'cpu_default';

        const backend = hardware.summary.bestBackend;
        const gpuModel = (hardware.summary.gpuModel || '').toLowerCase();

        // Per backend: ordered [substring, key] rules plus a fallback key.
        // Order matters: more specific substrings come before the generic
        // ones they contain (e.g. '7900 xtx' before '7900 xt', 'm4 max' before 'm4').
        const gpuBackends = {
            cuda: {
                fallback: 'cuda_default',
                rules: [
                    ['h100', 'cuda_h100'],
                    ['a100', 'cuda_a100'],
                    ['4090', 'cuda_4090'],
                    ['4080', 'cuda_4080'],
                    ['3090', 'cuda_3090'],
                    ['3080', 'cuda_3080'],
                    ['3070', 'cuda_3070'],
                    ['3060', 'cuda_3060'],
                    ['2080', 'cuda_2080']
                ]
            },
            rocm: {
                fallback: 'rocm_default',
                rules: [
                    ['mi300', 'rocm_mi300'],
                    ['mi250', 'rocm_mi250'],
                    ['7900 xtx', 'rocm_7900xtx'],
                    ['7900 xt', 'rocm_7900xt'],
                    ['7800', 'rocm_7800xt'],
                    ['6900', 'rocm_6900xt']
                ]
            },
            metal: {
                fallback: 'metal_default',
                rules: [
                    ['m4 ultra', 'metal_m4_ultra'],
                    ['m4 max', 'metal_m4_max'],
                    ['m4 pro', 'metal_m4_pro'],
                    ['m4', 'metal_m4'],
                    ['m3 ultra', 'metal_m3_ultra'],
                    ['m3 max', 'metal_m3_max'],
                    ['m3 pro', 'metal_m3_pro'],
                    ['m3', 'metal_m3'],
                    ['m2 ultra', 'metal_m2_ultra'],
                    ['m2 max', 'metal_m2_max'],
                    ['m2 pro', 'metal_m2_pro'],
                    ['m2', 'metal_m2'],
                    ['m1 ultra', 'metal_m1_ultra'],
                    ['m1 max', 'metal_m1_max'],
                    ['m1 pro', 'metal_m1_pro'],
                    ['m1', 'metal_m1']
                ]
            },
            intel: {
                fallback: 'intel_arc_default',
                rules: [
                    ['a770', 'intel_arc_a770'],
                    ['a750', 'intel_arc_a750']
                ]
            }
        };

        if (Object.prototype.hasOwnProperty.call(gpuBackends, backend)) {
            const { rules, fallback } = gpuBackends[backend];
            const hit = rules.find(([needle]) => gpuModel.includes(needle));
            return hit ? hit[1] : fallback;
        }

        // CPU backend: pick the best advertised instruction-set capability
        const cpu = hardware.cpu || hardware.backends?.cpu?.info;
        if (cpu?.capabilities) {
            if (cpu.capabilities.amx) return 'cpu_avx512_amx';
            if (cpu.capabilities.avx512) return 'cpu_avx512';
            if (cpu.capabilities.avx2) return 'cpu_avx2';
            if (cpu.capabilities.neon) return 'cpu_neon';
            if (cpu.capabilities.avx) return 'cpu_avx';
        }

        return 'cpu_default';
    }

    /**
     * Extract model family from model ID.
     *
     * Lower-cases the id, drops any namespace ("library/qwen2.5" -> "qwen2.5")
     * and tag ("qwen2.5:7b-q4" -> "qwen2.5"), then returns the longest known
     * family name contained in what remains.
     *
     * @param {string} modelId - Model identifier
     * @returns {string} Known family name, the stripped base name when no
     *   family matches, or 'unknown' when modelId is falsy
     */
    extractFamily(modelId) {
        if (!modelId) return 'unknown';

        const id = modelId.toLowerCase();

        // Remove namespace if present
        const name = id.includes('/') ? id.split('/').pop() : id;

        // Remove tag if present
        const base = name.split(':')[0];

        // Longest names first so 'qwen2.5-coder' wins over 'qwen2.5'
        const families = Object.keys(this.familyQuality).sort((a, b) => b.length - a.length);
        const match = families.find(family => base.includes(family));

        return match !== undefined ? match : base;
    }

    /**
     * Get family quality score.
     *
     * @param {string} family - Family name as returned by extractFamily
     * @returns {number} Score from the family table (exact match, then first
     *   substring match in either direction), or 70 for unknown/empty families
     */
    getFamilyScore(family) {
        const unknownScore = 70;

        // Every string contains '', so an empty family would otherwise
        // "partially match" the first table entry.
        if (!family) return unknownScore;

        // Direct match; typeof check ignores inherited Object.prototype members
        const direct = this.familyQuality[family];
        if (typeof direct === 'number' && direct) {
            return direct;
        }

        // Partial match
        for (const [key, score] of Object.entries(this.familyQuality)) {
            if (family.includes(key) || key.includes(family)) {
                return score;
            }
        }

        return unknownScore;
    }

    /**
     * Get task-specific bonus.
     *
     * @param {string} family - Family name as returned by extractFamily
     * @param {string} useCase - Key into the task bonus table
     * @returns {number} Bonus points for the family under this use case, 0 if none
     */
    getTaskBonus(family, useCase) {
        const bonuses = this.taskBonuses[useCase] || {};

        // Every string contains '', so an empty family must not partially match
        if (!family) return 0;

        // Direct match; typeof check ignores inherited Object.prototype members
        const direct = bonuses[family];
        if (typeof direct === 'number' && direct) {
            return direct;
        }

        // Partial match
        for (const [key, bonus] of Object.entries(bonuses)) {
            if (family.includes(key) || key.includes(family)) {
                return bonus;
            }
        }

        return 0;
    }

    /**
     * Score multiple variants and return them sorted by final score (descending).
     *
     * @param {Object[]} variants - Model variants to score
     * @param {Object} hardware - Hardware info
     * @param {Object} [options] - Forwarded to score()
     * @returns {Array<{variant: Object, score: Object}>} Sorted results
     */
    scoreAll(variants, hardware, options = {}) {
        const scored = variants.map(variant => ({
            variant,
            score: this.score(variant, hardware, options)
        }));

        scored.sort((a, b) => b.score.final - a.score.final);

        return scored;
    }

    /**
     * Drop variants that cannot fit in memory, then score the rest.
     *
     * A variant is kept when its size is unknown or at most 110% of
     * (effectiveMemory - headroom).
     *
     * @param {Object[]} variants - Model variants to consider
     * @param {Object} hardware - Reads `summary.effectiveMemory` (GB, default 8)
     * @param {Object} [options] - Forwarded to scoreAll(); `options.headroom`
     *   (GB, default 2) is also used for the filter
     * @returns {Array<{variant: Object, score: Object}>} Sorted results
     */
    filterAndScore(variants, hardware, options = {}) {
        const maxSize = hardware?.summary?.effectiveMemory || 8;
        // ?? so that an explicit headroom of 0 is honoured
        const headroom = options.headroom ?? 2;
        const effectiveMax = maxSize - headroom;

        const fitting = variants.filter(v => {
            const size = this.getModelSize(v);

            // No size info - include; it will receive the moderate fit score
            if (!size) return true;

            return size <= effectiveMax * 1.1; // Allow 10% overflow
        });

        return this.scoreAll(fitting, hardware, options);
    }

    /**
     * Group scored variants into recommendation categories by final score.
     *
     * @param {Array<{score: {final: number}}>} scoredVariants - Output of scoreAll()
     * @returns {Object} `{ excellent, recommended, acceptable, marginal, notRecommended }`
     */
    categorizeScores(scoredVariants) {
        const categories = {
            excellent: [],      // 85+
            recommended: [],    // 70-84
            acceptable: [],     // 55-69
            marginal: [],       // 40-54
            notRecommended: []  // <40
        };

        for (const item of scoredVariants) {
            const score = item.score.final;

            if (score >= 85) {
                categories.excellent.push(item);
            } else if (score >= 70) {
                categories.recommended.push(item);
            } else if (score >= 55) {
                categories.acceptable.push(item);
            } else if (score >= 40) {
                categories.marginal.push(item);
            } else {
                categories.notRecommended.push(item);
            }
        }

        return categories;
    }
}
|
|
714
|
+
|
|
715
|
+
module.exports = ScoringEngine;
|