llm-checker 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +418 -0
- package/analyzer/compatibility.js +584 -0
- package/analyzer/performance.js +505 -0
- package/bin/CLAUDE.md +12 -0
- package/bin/enhanced_cli.js +3118 -0
- package/bin/test-deterministic.js +41 -0
- package/package.json +96 -0
- package/src/CLAUDE.md +12 -0
- package/src/ai/intelligent-selector.js +615 -0
- package/src/ai/model-selector.js +312 -0
- package/src/ai/multi-objective-selector.js +820 -0
- package/src/commands/check.js +58 -0
- package/src/data/CLAUDE.md +11 -0
- package/src/data/model-database.js +637 -0
- package/src/data/sync-manager.js +279 -0
- package/src/hardware/CLAUDE.md +12 -0
- package/src/hardware/backends/CLAUDE.md +11 -0
- package/src/hardware/backends/apple-silicon.js +318 -0
- package/src/hardware/backends/cpu-detector.js +490 -0
- package/src/hardware/backends/cuda-detector.js +417 -0
- package/src/hardware/backends/intel-detector.js +436 -0
- package/src/hardware/backends/rocm-detector.js +440 -0
- package/src/hardware/detector.js +573 -0
- package/src/hardware/pc-optimizer.js +635 -0
- package/src/hardware/specs.js +286 -0
- package/src/hardware/unified-detector.js +442 -0
- package/src/index.js +2289 -0
- package/src/models/CLAUDE.md +17 -0
- package/src/models/ai-check-selector.js +806 -0
- package/src/models/catalog.json +426 -0
- package/src/models/deterministic-selector.js +1145 -0
- package/src/models/expanded_database.js +1142 -0
- package/src/models/intelligent-selector.js +532 -0
- package/src/models/requirements.js +310 -0
- package/src/models/scoring-config.js +57 -0
- package/src/models/scoring-engine.js +715 -0
- package/src/ollama/.cache/README.md +33 -0
- package/src/ollama/CLAUDE.md +24 -0
- package/src/ollama/client.js +438 -0
- package/src/ollama/enhanced-client.js +113 -0
- package/src/ollama/enhanced-scraper.js +634 -0
- package/src/ollama/manager.js +357 -0
- package/src/ollama/native-scraper.js +776 -0
- package/src/plugins/CLAUDE.md +11 -0
- package/src/plugins/examples/custom_model_plugin.js +87 -0
- package/src/plugins/index.js +295 -0
- package/src/utils/CLAUDE.md +11 -0
- package/src/utils/config.js +359 -0
- package/src/utils/formatter.js +315 -0
- package/src/utils/logger.js +272 -0
- package/src/utils/model-classifier.js +167 -0
- package/src/utils/verbose-progress.js +266 -0
|
@@ -0,0 +1,635 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PC Hardware Optimizer for Windows/Linux
|
|
3
|
+
*
|
|
4
|
+
* Implements practical backend detection and optimization logic for:
|
|
5
|
+
* - NVIDIA (CUDA/Vulkan/DirectML)
|
|
6
|
+
* - AMD (ROCm/Vulkan/DirectML)
|
|
7
|
+
* - Intel Arc/iGPU (SYCL/OpenVINO/DirectML/Vulkan)
|
|
8
|
+
* - CPU-only (BLAS accelerated)
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
const { spawn } = require('child_process');
|
|
12
|
+
const si = require('systeminformation');
|
|
13
|
+
|
|
14
|
+
class PCOptimizer {
  constructor() {
    // Baseline backend throughput coefficients (rough tokens/sec for a
    // 1B-parameter model; predictInferenceSpeed divides by model size).
    this.backendCoefficients = {
      cuda: 220,       // NVIDIA CUDA/cuBLAS
      rocm: 180,       // AMD ROCm/HIP
      sycl: 140,       // Intel oneAPI/SYCL
      vulkan: 120,     // Vulkan (cross-platform)
      directml: 100,   // DirectML (Windows)
      openvino: 90,    // Intel OpenVINO
      cpu_avx512: 70,  // CPU with AVX512
      cpu_avx2: 60,    // CPU with AVX2
      cpu_basic: 40    // Basic CPU
    };

    // Quantization speed multipliers (relative to Q6_K = 1.0).
    this.quantFactors = {
      'Q8_0': 0.85,  // Highest quality, slower
      'Q6_K': 1.0,   // Balanced
      'Q5_K_M': 1.1, // Good speed/quality
      'Q4_K_M': 1.2, // Fast inference
      'Q3_K': 1.35,  // Very fast, lower quality
      'Q2_K': 1.5    // Fastest, lowest quality
    };

    // Approximate bytes per parameter for each quantization format.
    this.bytesPerParam = {
      'FP16': 2.0,
      'Q8_0': 1.0,
      'Q6_K': 0.75,
      'Q5_K_M': 0.63,
      'Q4_K_M': 0.50,
      'Q3_K': 0.375,
      'Q2_K': 0.25
    };

    // KV-cache size in GB per 1k tokens of context, keyed by a
    // model-size bucket (billions of parameters).
    this.kvCacheEstimates = {
      7: 0.3,   // 7B models
      13: 0.6,  // 13B models
      33: 1.4,  // 30-40B models
      70: 2.6   // 70B+ models
    };
  }

  /**
   * Detect PC hardware capabilities and available acceleration backends.
   *
   * Queries systeminformation for GPU/CPU/memory/OS data in parallel,
   * then classifies the hardware and probes for backend toolchains.
   *
   * @returns {Promise<object>} { gpu, cpu, memory, os, backends, timestamp }
   * @throws {Error} if any systeminformation query fails
   */
  async detectPCCapabilities() {
    try {
      const [graphics, cpu, memory, osInfo] = await Promise.all([
        si.graphics(),
        si.cpu(),
        si.mem(),
        si.osInfo()
      ]);

      const gpu = graphics.controllers?.[0] || {};
      const vendor = (gpu.vendor || '').toLowerCase();
      const model = (gpu.model || '').toLowerCase();

      // Enhanced VRAM detection. Primary assumption: gpu.vram is in MB
      // (systeminformation's usual unit) — TODO confirm across drivers.
      let vramGB = Math.round((gpu.vram || 0) / 1024);
      if (vramGB === 0 && gpu.vram) {
        // Some drivers report bytes or MB; use magnitude as a heuristic.
        if (gpu.vram > 100000) {
          vramGB = Math.round(gpu.vram / (1024 * 1024 * 1024)); // bytes to GB
        } else if (gpu.vram > 1000) {
          vramGB = Math.round(gpu.vram / 1024); // MB to GB
        }
      }

      // Detect GPU type and capabilities
      const gpuInfo = this.classifyGPU(vendor, model, vramGB);

      // Detect available backends
      const availableBackends = await this.detectAvailableBackends(osInfo.platform);

      // Get CPU capabilities
      const cpuInfo = this.analyzeCPUCapabilities(cpu);

      return {
        gpu: gpuInfo,
        cpu: cpuInfo,
        memory: {
          total: Math.round(memory.total / (1024 ** 3)),
          available: Math.round(memory.available / (1024 ** 3))
        },
        os: {
          platform: osInfo.platform,
          distro: osInfo.distro
        },
        backends: availableBackends,
        timestamp: Date.now()
      };
    } catch (error) {
      throw new Error(`PC capability detection failed: ${error.message}`);
    }
  }

  /**
   * Classify GPU vendor, performance tier, and rough compute capability
   * from vendor/model strings and detected VRAM.
   *
   * @param {string} vendor - lowercased GPU vendor string
   * @param {string} model - lowercased GPU model string
   * @param {number} vramGB - detected VRAM in GB (0 if unknown)
   * @returns {object} { vendor, model, tier, vramGB, isIntegrated,
   *                     computeUnits, memoryBandwidth, estimatedTFLOPS }
   */
  classifyGPU(vendor, model, vramGB) {
    const isNvidia = vendor.includes('nvidia') || model.includes('nvidia') || model.includes('geforce') || model.includes('rtx') || model.includes('gtx');
    const isAMD = vendor.includes('amd') || vendor.includes('ati') || model.includes('radeon') || model.includes('rx ');
    const isIntel = vendor.includes('intel') || model.includes('intel') || model.includes('arc') || model.includes('iris') || model.includes('uhd');

    let tier = 'entry';
    let type = 'unknown';
    let computeUnits = 0;
    let memoryBandwidth = 50; // GB/s default

    // NOTE(review): the per-model computeUnits / bandwidth numbers below
    // are hand-entered spec approximations, not measured values.
    if (isNvidia) {
      type = 'nvidia';
      if (model.includes('rtx 5090')) { tier = 'flagship'; computeUnits = 21760; memoryBandwidth = 1792; }
      else if (model.includes('rtx 5080')) { tier = 'high_end'; computeUnits = 10752; memoryBandwidth = 960; }
      else if (model.includes('rtx 5070')) { tier = 'upper_mid'; computeUnits = 6144; memoryBandwidth = 504; }
      else if (model.includes('rtx 4090')) { tier = 'flagship'; computeUnits = 16384; memoryBandwidth = 1008; }
      else if (model.includes('rtx 4080')) { tier = 'high_end'; computeUnits = 9728; memoryBandwidth = 716; }
      else if (model.includes('rtx 4070')) { tier = 'upper_mid'; computeUnits = 5888; memoryBandwidth = 448; }
      else if (model.includes('rtx 4060')) { tier = 'mid_range'; computeUnits = 3072; memoryBandwidth = 272; }
      else if (model.includes('rtx 30')) { tier = 'upper_mid'; computeUnits = 6000; memoryBandwidth = 500; }
      else if (model.includes('rtx 20')) { tier = 'mid_range'; computeUnits = 2000; memoryBandwidth = 300; }
      else if (model.includes('gtx 16')) { tier = 'budget'; computeUnits = 1500; memoryBandwidth = 200; }
      else if (vramGB >= 8) { tier = 'mid_range'; computeUnits = 2000; memoryBandwidth = 300; }
      else if (vramGB >= 4) { tier = 'budget'; computeUnits = 1000; memoryBandwidth = 150; }
    } else if (isAMD) {
      type = 'amd';
      if (model.includes('rx 7900')) { tier = 'high_end'; computeUnits = 6000; memoryBandwidth = 960; }
      else if (model.includes('rx 7800')) { tier = 'upper_mid'; computeUnits = 3840; memoryBandwidth = 624; }
      else if (model.includes('rx 7700')) { tier = 'mid_range'; computeUnits = 2560; memoryBandwidth = 432; }
      else if (model.includes('rx 7600')) { tier = 'budget'; computeUnits = 2048; memoryBandwidth = 288; }
      else if (model.includes('rx 6000')) { tier = 'mid_range'; computeUnits = 2000; memoryBandwidth = 400; }
      else if (vramGB >= 16) { tier = 'upper_mid'; computeUnits = 3000; memoryBandwidth = 500; }
      else if (vramGB >= 8) { tier = 'mid_range'; computeUnits = 2000; memoryBandwidth = 300; }
      else if (vramGB >= 4) { tier = 'budget'; computeUnits = 1000; memoryBandwidth = 200; }
    } else if (isIntel) {
      type = 'intel';
      if (model.includes('arc a770')) { tier = 'mid_range'; computeUnits = 512; memoryBandwidth = 560; }
      else if (model.includes('arc a750')) { tier = 'mid_range'; computeUnits = 448; memoryBandwidth = 512; }
      else if (model.includes('arc a580')) { tier = 'budget'; computeUnits = 384; memoryBandwidth = 448; }
      else if (model.includes('iris xe')) { tier = 'igpu'; computeUnits = 96; memoryBandwidth = 68; }
      else if (model.includes('uhd')) { tier = 'igpu'; computeUnits = 32; memoryBandwidth = 47; }
      else if (vramGB >= 8) { tier = 'mid_range'; computeUnits = 400; memoryBandwidth = 400; }
      else { tier = 'igpu'; computeUnits = 64; memoryBandwidth = 50; }
    }

    // Determine if integrated GPU (either classified as iGPU tier, or no
    // dedicated VRAM plus an integrated-sounding model string).
    const isIntegrated = tier === 'igpu' ||
      (vramGB === 0 && (model.includes('intel') || model.includes('integrated') ||
        model.includes('iris') || model.includes('uhd') || model.includes('vega')));

    return {
      vendor: type,
      model: model || 'Unknown GPU',
      tier,
      vramGB,
      isIntegrated,
      computeUnits,
      memoryBandwidth,
      estimatedTFLOPS: this.estimateGPUTFLOPS(type, tier, computeUnits)
    };
  }

  /**
   * Estimate GPU FP16 TFLOPS from compute-unit count using a per-vendor,
   * per-tier multiplier. Result is rounded to one decimal place.
   *
   * @param {string} type - 'nvidia' | 'amd' | 'intel' | 'unknown'
   * @param {string} tier - performance tier from classifyGPU
   * @param {number} computeUnits - vendor-specific unit count
   * @returns {number} estimated TFLOPS
   */
  estimateGPUTFLOPS(type, tier, computeUnits) {
    const multipliers = {
      nvidia: {
        flagship: 0.010,  // RTX 4090 ~165 TFLOPS
        high_end: 0.008,  // RTX 4080 ~120 TFLOPS
        upper_mid: 0.006, // RTX 4070 ~65 TFLOPS
        mid_range: 0.004, // RTX 4060 ~32 TFLOPS
        budget: 0.002,    // GTX 1660 ~15 TFLOPS
        igpu: 0.001
      },
      amd: {
        flagship: 0.008, // Similar to NVIDIA but slightly lower
        high_end: 0.006,
        upper_mid: 0.005,
        mid_range: 0.003,
        budget: 0.002,
        igpu: 0.001
      },
      intel: {
        mid_range: 0.003, // Arc A770 ~8 TFLOPS
        budget: 0.002,    // Arc A580 ~6 TFLOPS
        igpu: 0.001       // Iris Xe ~2 TFLOPS
      }
    };

    const multiplier = multipliers[type]?.[tier] || 0.001;
    return Math.round(computeUnits * multiplier * 10) / 10;
  }

  /**
   * Analyze CPU capabilities relevant to LLM inference.
   *
   * @param {object} cpu - systeminformation si.cpu() result (brand, model,
   *                       physicalCores, cores, speed)
   * @returns {object} { brand, model, cores, threads, baseSpeed, tier,
   *                     features: {avx512, avx2, vnni}, estimatedGFLOPS }
   */
  analyzeCPUCapabilities(cpu) {
    const brand = (cpu.brand || '').toLowerCase();
    const model = (cpu.model || '').toLowerCase();
    const cores = cpu.physicalCores || cpu.cores || 1;
    const threads = cpu.cores || cores;
    const baseSpeed = cpu.speed || 2.0;

    // Instruction-set detection is heuristic: it keys off marketing
    // strings, not CPUID. Assumes 12th-gen+ Intel implies AVX512 —
    // NOTE(review): this is not universally true; confirm if precision matters.
    const hasAVX512 = brand.includes('intel') &&
      (model.includes('12th') || model.includes('13th') || model.includes('14th') || model.includes('15th'));
    const hasAVX2 = brand.includes('intel') || brand.includes('amd');
    const hasVNNI = hasAVX512; // VNNI usually comes with AVX512

    // Estimate CPU tier for LLM inference
    let tier = 'entry';
    if (cores >= 16 && baseSpeed >= 3.0) tier = 'high_end';
    else if (cores >= 8 && baseSpeed >= 2.5) tier = 'mid_range';
    else if (cores >= 4 && baseSpeed >= 2.0) tier = 'budget';

    return {
      brand: brand,
      model: cpu.model || 'Unknown CPU',
      cores,
      threads,
      baseSpeed,
      tier,
      features: {
        avx512: hasAVX512,
        avx2: hasAVX2,
        vnni: hasVNNI
      },
      // Rough GFLOPS: cores * GHz * SIMD-width factor.
      estimatedGFLOPS: cores * baseSpeed * (hasAVX512 ? 64 : hasAVX2 ? 32 : 16)
    };
  }

  /**
   * Probe the system for available acceleration backends by checking for
   * each toolchain's CLI tool. BLAS is always assumed present; Vulkan and
   * DirectML are assumed from platform rather than probed.
   *
   * @param {string} platform - process platform string ('linux', 'win32', ...)
   * @returns {Promise<object>} flags { cuda, rocm, sycl, vulkan, directml, openvino, blas }
   */
  async detectAvailableBackends(platform) {
    const backends = {
      cuda: false,
      rocm: false,
      sycl: false,
      vulkan: false,
      directml: false,
      openvino: false,
      blas: true // Always assume basic BLAS
    };

    try {
      // Check for NVIDIA CUDA via nvidia-smi
      try {
        await this.runCommand('nvidia-smi', ['--query-gpu=name', '--format=csv,noheader']);
        backends.cuda = true;
      } catch (e) {
        // CUDA not available
      }

      // Check for AMD ROCm (Linux mainly)
      if (platform === 'linux') {
        try {
          await this.runCommand('rocm-smi', ['--showid']);
          backends.rocm = true;
        } catch (e) {
          // ROCm not available
        }
      }

      // Check for Intel SYCL/oneAPI
      try {
        await this.runCommand('sycl-ls', []);
        backends.sycl = true;
      } catch (e) {
        // Try alternative oneAPI detection
        try {
          await this.runCommand('oneapi-cli', ['--version']);
          backends.sycl = true;
        } catch (e2) {
          // SYCL not available
        }
      }

      // DirectML available on Windows
      if (platform === 'win32') {
        backends.directml = true;
      }

      // Vulkan: no real probe yet — assume modern systems have it.
      // (Previously wrapped in a dead try/catch; a loader check such as
      // `vulkaninfo` could be added here for real detection.)
      backends.vulkan = true;

      // OpenVINO detection via its benchmark tool
      try {
        await this.runCommand('benchmark_app', ['--help']);
        backends.openvino = true;
      } catch (e) {
        // OpenVINO not available
      }

    } catch (error) {
      console.warn('Backend detection failed:', error.message);
    }

    return backends;
  }

  /**
   * Choose the best backend for the detected hardware, in priority order:
   * vendor-native (CUDA/ROCm/SYCL) > DirectML (Windows dGPU) > Vulkan
   * (dGPU) > OpenVINO (Intel) > CPU (AVX512 > AVX2 > basic).
   *
   * @param {object} hardware - result of detectPCCapabilities()
   * @returns {{backend: string, reason: string}}
   */
  chooseOptimalBackend(hardware) {
    const { gpu, backends, os } = hardware;

    // Priority order for different GPU vendors
    if (gpu.vendor === 'nvidia' && backends.cuda) {
      return { backend: 'cuda', reason: 'NVIDIA GPU with CUDA support' };
    }

    if (gpu.vendor === 'amd' && backends.rocm && os.platform === 'linux') {
      return { backend: 'rocm', reason: 'AMD GPU with ROCm support (Linux)' };
    }

    if (gpu.vendor === 'intel' && backends.sycl) {
      return { backend: 'sycl', reason: 'Intel GPU with oneAPI/SYCL support' };
    }

    // Fallback options for Windows
    if (os.platform === 'win32' && backends.directml && !gpu.isIntegrated) {
      return { backend: 'directml', reason: 'Windows DirectML fallback' };
    }

    // Vulkan as cross-platform fallback
    if (backends.vulkan && !gpu.isIntegrated) {
      return { backend: 'vulkan', reason: 'Vulkan cross-platform acceleration' };
    }

    // Intel OpenVINO for Intel hardware
    if (gpu.vendor === 'intel' && backends.openvino) {
      return { backend: 'openvino', reason: 'Intel OpenVINO optimization' };
    }

    // CPU fallback
    if (hardware.cpu.features.avx512) {
      return { backend: 'cpu_avx512', reason: 'CPU with AVX512 acceleration' };
    } else if (hardware.cpu.features.avx2) {
      return { backend: 'cpu_avx2', reason: 'CPU with AVX2 acceleration' };
    } else {
      return { backend: 'cpu_basic', reason: 'Basic CPU inference' };
    }
  }

  /**
   * Pick the best quantization and GPU-offload strategy for a model.
   *
   * Tries quantizations from highest to lowest quality; for each, prefers
   * full-GPU, then partial offload, then CPU-only. Falls back to Q2_K
   * with a warning if nothing fits.
   *
   * @param {number} modelSizeB - model size in billions of parameters
   * @param {object} hardware - result of detectPCCapabilities()
   * @param {number} [contextLength=4096] - context window in tokens
   * @returns {object} { quantization, gpuLayers, memoryUsage, strategy, warning? }
   */
  pickQuantAndOffload(modelSizeB, hardware, contextLength = 4096) {
    const { gpu, memory } = hardware;
    const vramGB = gpu.vramGB || 0;
    const ramGB = memory.total;

    // KV cache estimation: bucket model size, then scale by context length.
    const kvSizeKey = modelSizeB <= 8 ? 7 : modelSizeB <= 15 ? 13 : modelSizeB <= 40 ? 33 : 70;
    const kvCacheGB = this.kvCacheEstimates[kvSizeKey] * (contextLength / 1000);

    // Available memory (with safety margins)
    const usableVRAM = vramGB * 0.9; // 90% usable VRAM
    const usableRAM = ramGB * 0.8;   // 80% usable RAM

    // Try quantizations from best to worst
    const quantOptions = ['Q6_K', 'Q5_K_M', 'Q4_K_M', 'Q3_K', 'Q2_K'];

    for (const quant of quantOptions) {
      const modelSizeGB = modelSizeB * this.bytesPerParam[quant];
      const totalNeed = modelSizeGB + kvCacheGB;

      // Check if it fits fully in VRAM
      if (totalNeed <= usableVRAM && !gpu.isIntegrated) {
        return {
          quantization: quant,
          gpuLayers: -1, // Full GPU
          memoryUsage: {
            vram: totalNeed,
            ram: 0,
            total: totalNeed
          },
          strategy: 'full_gpu'
        };
      }

      // Calculate partial offload if VRAM is insufficient but available
      if (usableVRAM > 0 && !gpu.isIntegrated) {
        const vramRatio = Math.min(0.85, usableVRAM / totalNeed);
        if (vramRatio > 0.15) { // Minimum 15% on GPU to be worthwhile
          const ramNeed = totalNeed - usableVRAM;
          if (ramNeed <= usableRAM) {
            // Estimate layers (rough heuristic: 80 layers for most models)
            const estimatedLayers = Math.max(16, Math.floor(80 * vramRatio));
            return {
              quantization: quant,
              gpuLayers: estimatedLayers,
              memoryUsage: {
                vram: usableVRAM,
                ram: ramNeed,
                total: totalNeed
              },
              strategy: 'partial_offload'
            };
          }
        }
      }

      // Check if it fits in RAM (CPU-only)
      if (totalNeed <= usableRAM) {
        return {
          quantization: quant,
          gpuLayers: 0, // CPU only
          memoryUsage: {
            vram: 0,
            ram: totalNeed,
            total: totalNeed
          },
          strategy: 'cpu_only'
        };
      }
    }

    // If nothing fits, return most aggressive quantization
    const modelSizeGB = modelSizeB * this.bytesPerParam['Q2_K'];
    return {
      quantization: 'Q2_K',
      gpuLayers: 0,
      memoryUsage: {
        vram: 0,
        ram: modelSizeGB + kvCacheGB,
        total: modelSizeGB + kvCacheGB
      },
      strategy: 'aggressive_quant',
      warning: 'Model may not fit comfortably in available memory'
    };
  }

  /**
   * Predict tokens/sec for a model + config on the given hardware.
   *
   * throughput = backendBase / modelSize * quantFactor * offloadFactor
   * * vendor adjustment; floored at 1 and rounded to 2 decimals.
   *
   * @param {number} modelSizeB - model size in billions of parameters
   * @param {object} config - result of pickQuantAndOffload()
   * @param {object} hardware - result of detectPCCapabilities()
   * @returns {number} estimated tokens/sec
   */
  predictInferenceSpeed(modelSizeB, config, hardware) {
    const { quantization, gpuLayers, strategy } = config;
    const backend = this.chooseOptimalBackend(hardware).backend;

    // Base throughput from backend
    const baseK = this.backendCoefficients[backend] || this.backendCoefficients.cpu_basic;

    // Scale by model size (inverse relationship)
    let throughput = baseK / modelSizeB;

    // Quantization speed factor
    const quantFactor = this.quantFactors[quantization] || 1.0;
    throughput *= quantFactor;

    // Offload strategy factor
    let offloadFactor = 1.0;
    if (strategy === 'partial_offload') {
      // Estimate offload efficiency based on GPU layers ratio
      const totalLayers = 80; // Rough estimate
      const gpuRatio = gpuLayers / totalLayers;
      offloadFactor = 0.5 + (gpuRatio * 0.4); // 0.5 to 0.9 range
    } else if (strategy === 'cpu_only') {
      offloadFactor = 0.6; // CPU penalty compared to GPU
    } else if (strategy === 'aggressive_quant') {
      offloadFactor = 0.4; // Heavy memory pressure penalty
    }

    throughput *= offloadFactor;

    // Hardware-specific adjustments
    if (hardware.gpu.vendor === 'nvidia' && backend === 'cuda') {
      throughput *= 1.1; // NVIDIA optimization bonus
    } else if (hardware.gpu.vendor === 'amd' && backend === 'rocm') {
      throughput *= 0.9; // AMD slight penalty vs NVIDIA
    } else if (hardware.gpu.isIntegrated) {
      throughput *= 0.7; // Integrated GPU penalty
    }

    return Math.max(1, Math.round(throughput * 100) / 100);
  }

  /**
   * Generate hardware-specific model recommendations: categorize the
   * machine, pick 3 model sizes for that category, and attach backend,
   * offload config, speed estimate, and example commands to each.
   *
   * @param {object} hardware - result of detectPCCapabilities()
   * @returns {object} { capability, backend, recommendations }
   */
  generateRecommendations(hardware) {
    const recommendations = [];
    const { gpu, memory, cpu } = hardware;

    // Categorize hardware capability (VRAM first, then system RAM)
    let capability = 'basic';
    if (gpu.vramGB >= 24) capability = 'high_end';
    else if (gpu.vramGB >= 12) capability = 'enthusiast';
    else if (gpu.vramGB >= 6) capability = 'gaming';
    else if (memory.total >= 32) capability = 'workstation';
    else if (memory.total >= 16) capability = 'standard';

    // Model size recommendations based on capability
    const recommendations_map = {
      high_end: [
        { model: '70B', quant: 'Q4_K_M', reason: 'Large model with good quality' },
        { model: '33B', quant: 'Q5_K_M', reason: 'Excellent balance of size and quality' },
        { model: '13B', quant: 'Q6_K', reason: 'High quality medium model' }
      ],
      enthusiast: [
        { model: '33B', quant: 'Q4_K_M', reason: 'Large model fits in VRAM' },
        { model: '13B', quant: 'Q5_K_M', reason: 'Optimal size for your GPU' },
        { model: '7B', quant: 'Q6_K', reason: 'Fast high-quality option' }
      ],
      gaming: [
        { model: '13B', quant: 'Q4_K_M', reason: 'Good balance for gaming GPU' },
        { model: '7B', quant: 'Q5_K_M', reason: 'Recommended for your VRAM' },
        { model: '3B', quant: 'Q6_K', reason: 'Fast inference option' }
      ],
      workstation: [
        { model: '33B', quant: 'Q4_K_M', reason: 'CPU offload with large RAM' },
        { model: '13B', quant: 'Q5_K_M', reason: 'Good CPU performance' },
        { model: '7B', quant: 'Q6_K', reason: 'Reliable CPU inference' }
      ],
      standard: [
        { model: '13B', quant: 'Q4_K_M', reason: 'Fits in 16GB RAM' },
        { model: '7B', quant: 'Q5_K_M', reason: 'Optimal for your system' },
        { model: '3B', quant: 'Q6_K', reason: 'Fast and efficient' }
      ],
      basic: [
        { model: '7B', quant: 'Q4_K_M', reason: 'Recommended for limited hardware' },
        { model: '3B', quant: 'Q5_K_M', reason: 'Good performance on basic systems' },
        { model: '1B', quant: 'Q6_K', reason: 'Ultra-fast option' }
      ]
    };

    const recs = recommendations_map[capability] || recommendations_map.basic;

    // Add backend and command suggestions
    const optimalBackend = this.chooseOptimalBackend(hardware);

    recs.forEach(rec => {
      // parseFloat('70B') -> 70: leading digits of the size label.
      const modelSizeB = parseFloat(rec.model);
      const config = this.pickQuantAndOffload(modelSizeB, hardware);
      const speed = this.predictInferenceSpeed(modelSizeB, config, hardware);

      recommendations.push({
        ...rec,
        backend: optimalBackend.backend,
        config,
        estimatedSpeed: speed,
        command: this.generateCommand(rec.model, config, optimalBackend.backend)
      });
    });

    return {
      capability,
      backend: optimalBackend,
      recommendations: recommendations.slice(0, 5) // Top 5
    };
  }

  /**
   * Generate example CLI commands (llama.cpp and Ollama styles) for a
   * model size and offload configuration.
   *
   * @param {string} modelSize - size label, e.g. '7B'
   * @param {object} config - result of pickQuantAndOffload()
   * @param {string} backend - chosen backend name (currently unused in output)
   * @returns {{llamacpp: string, ollama: string, description: string}}
   */
  generateCommand(modelSize, config, backend) {
    const { quantization, gpuLayers } = config;

    // llama.cpp style command
    let cmd = 'llama-cli -m model.gguf';

    if (gpuLayers === -1) {
      cmd += ' -ngl -1'; // Full GPU
    } else if (gpuLayers > 0) {
      cmd += ` -ngl ${gpuLayers}`; // Partial offload
    }
    // No -ngl flag for CPU-only

    cmd += ' -t 8 -c 4096'; // 8 threads, 4k context

    return {
      llamacpp: cmd,
      ollama: `ollama run model:${modelSize.toLowerCase()}-${quantization.toLowerCase().replace('_', '-')}`,
      description: `${modelSize} model with ${quantization} quantization (${config.strategy})`
    };
  }

  /**
   * Run a shell command with a timeout, resolving with trimmed stdout.
   *
   * Fixed: the promise is now guarded against settling twice — previously
   * a timeout reject could be followed by a second reject/resolve when the
   * killed process emitted 'close'.
   *
   * @param {string} command - executable name
   * @param {string[]} args - argument list
   * @param {number} [timeout=5000] - kill-and-reject deadline in ms
   * @returns {Promise<string>} trimmed stdout on exit code 0
   */
  async runCommand(command, args, timeout = 5000) {
    return new Promise((resolve, reject) => {
      const proc = spawn(command, args, { stdio: 'pipe' });
      let output = '';
      let error = '';
      let settled = false; // ensure the promise settles exactly once

      const timer = setTimeout(() => {
        if (settled) return;
        settled = true;
        proc.kill();
        reject(new Error('Command timeout'));
      }, timeout);

      proc.stdout.on('data', (data) => output += data);
      proc.stderr.on('data', (data) => error += data);

      proc.on('close', (code) => {
        clearTimeout(timer);
        if (settled) return;
        settled = true;
        if (code === 0) {
          resolve(output.trim());
        } else {
          reject(new Error(`Command failed: ${error || 'Unknown error'}`));
        }
      });

      proc.on('error', (err) => {
        clearTimeout(timer);
        if (settled) return;
        settled = true;
        reject(err);
      });
    });
  }
}
|
|
634
|
+
|
|
635
|
+
module.exports = PCOptimizer;
|