llm-checker 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/LICENSE +21 -0
  2. package/README.md +418 -0
  3. package/analyzer/compatibility.js +584 -0
  4. package/analyzer/performance.js +505 -0
  5. package/bin/CLAUDE.md +12 -0
  6. package/bin/enhanced_cli.js +3118 -0
  7. package/bin/test-deterministic.js +41 -0
  8. package/package.json +96 -0
  9. package/src/CLAUDE.md +12 -0
  10. package/src/ai/intelligent-selector.js +615 -0
  11. package/src/ai/model-selector.js +312 -0
  12. package/src/ai/multi-objective-selector.js +820 -0
  13. package/src/commands/check.js +58 -0
  14. package/src/data/CLAUDE.md +11 -0
  15. package/src/data/model-database.js +637 -0
  16. package/src/data/sync-manager.js +279 -0
  17. package/src/hardware/CLAUDE.md +12 -0
  18. package/src/hardware/backends/CLAUDE.md +11 -0
  19. package/src/hardware/backends/apple-silicon.js +318 -0
  20. package/src/hardware/backends/cpu-detector.js +490 -0
  21. package/src/hardware/backends/cuda-detector.js +417 -0
  22. package/src/hardware/backends/intel-detector.js +436 -0
  23. package/src/hardware/backends/rocm-detector.js +440 -0
  24. package/src/hardware/detector.js +573 -0
  25. package/src/hardware/pc-optimizer.js +635 -0
  26. package/src/hardware/specs.js +286 -0
  27. package/src/hardware/unified-detector.js +442 -0
  28. package/src/index.js +2289 -0
  29. package/src/models/CLAUDE.md +17 -0
  30. package/src/models/ai-check-selector.js +806 -0
  31. package/src/models/catalog.json +426 -0
  32. package/src/models/deterministic-selector.js +1145 -0
  33. package/src/models/expanded_database.js +1142 -0
  34. package/src/models/intelligent-selector.js +532 -0
  35. package/src/models/requirements.js +310 -0
  36. package/src/models/scoring-config.js +57 -0
  37. package/src/models/scoring-engine.js +715 -0
  38. package/src/ollama/.cache/README.md +33 -0
  39. package/src/ollama/CLAUDE.md +24 -0
  40. package/src/ollama/client.js +438 -0
  41. package/src/ollama/enhanced-client.js +113 -0
  42. package/src/ollama/enhanced-scraper.js +634 -0
  43. package/src/ollama/manager.js +357 -0
  44. package/src/ollama/native-scraper.js +776 -0
  45. package/src/plugins/CLAUDE.md +11 -0
  46. package/src/plugins/examples/custom_model_plugin.js +87 -0
  47. package/src/plugins/index.js +295 -0
  48. package/src/utils/CLAUDE.md +11 -0
  49. package/src/utils/config.js +359 -0
  50. package/src/utils/formatter.js +315 -0
  51. package/src/utils/logger.js +272 -0
  52. package/src/utils/model-classifier.js +167 -0
  53. package/src/utils/verbose-progress.js +266 -0
@@ -0,0 +1,635 @@
1
+ /**
2
+ * PC Hardware Optimizer for Windows/Linux
3
+ *
4
+ * Implements practical backend detection and optimization logic for:
5
+ * - NVIDIA (CUDA/Vulkan/DirectML)
6
+ * - AMD (ROCm/Vulkan/DirectML)
7
+ * - Intel Arc/iGPU (SYCL/OpenVINO/DirectML/Vulkan)
8
+ * - CPU-only (BLAS accelerated)
9
+ */
10
+
11
+ const { spawn } = require('child_process');
12
+ const si = require('systeminformation');
13
+
14
/**
 * PC Hardware Optimizer for Windows/Linux.
 *
 * Detects GPU/CPU capabilities, probes for available inference backends
 * (CUDA / ROCm / SYCL / Vulkan / DirectML / OpenVINO / CPU-BLAS), chooses a
 * quantization + GPU-offload strategy that fits the machine's memory, and
 * produces model recommendations with rough inference-speed estimates.
 */
class PCOptimizer {
  constructor() {
    // Backend performance coefficients (tokens/sec baseline; scaled down
    // by model size in predictInferenceSpeed).
    this.backendCoefficients = {
      cuda: 220, // NVIDIA CUDA/cuBLAS
      rocm: 180, // AMD ROCm/HIP
      sycl: 140, // Intel oneAPI/SYCL
      vulkan: 120, // Vulkan (cross-platform)
      directml: 100, // DirectML (Windows)
      openvino: 90, // Intel OpenVINO
      cpu_avx512: 70, // CPU with AVX512
      cpu_avx2: 60, // CPU with AVX2
      cpu_basic: 40 // Basic CPU
    };

    // Quantization factors (speed multiplier relative to Q6_K).
    this.quantFactors = {
      'Q8_0': 0.85, // Highest quality, slower
      'Q6_K': 1.0, // Balanced
      'Q5_K_M': 1.1, // Good speed/quality
      'Q4_K_M': 1.2, // Fast inference
      'Q3_K': 1.35, // Very fast, lower quality
      'Q2_K': 1.5 // Fastest, lowest quality
    };

    // Approximate bytes per parameter for each quantization format.
    this.bytesPerParam = {
      'FP16': 2.0,
      'Q8_0': 1.0,
      'Q6_K': 0.75,
      'Q5_K_M': 0.63,
      'Q4_K_M': 0.50,
      'Q3_K': 0.375,
      'Q2_K': 0.25
    };

    // KV cache estimate in GB per 1k tokens, keyed by model-size bucket
    // (billions of parameters).
    this.kvCacheEstimates = {
      7: 0.3, // 7B models
      13: 0.6, // 13B models
      33: 1.4, // 30-40B models
      70: 2.6 // 70B+ models
    };
  }

  /**
   * Detect PC hardware capabilities and available backends.
   *
   * @returns {Promise<object>} snapshot with gpu, cpu, memory (GB), os,
   *   backends flags and a timestamp.
   * @throws {Error} wrapping any systeminformation query failure.
   */
  async detectPCCapabilities() {
    try {
      const [graphics, cpu, memory, osInfo] = await Promise.all([
        si.graphics(),
        si.cpu(),
        si.mem(),
        si.osInfo()
      ]);

      const gpu = graphics.controllers?.[0] || {};
      const vendor = (gpu.vendor || '').toLowerCase();
      const model = (gpu.model || '').toLowerCase();

      // systeminformation usually reports VRAM in MB, but some drivers report
      // raw bytes; normalize either form to whole GB (see normalizeVramGB).
      const vramGB = this.normalizeVramGB(gpu.vram);

      const gpuInfo = this.classifyGPU(vendor, model, vramGB);
      const availableBackends = await this.detectAvailableBackends(osInfo.platform);
      const cpuInfo = this.analyzeCPUCapabilities(cpu);

      return {
        gpu: gpuInfo,
        cpu: cpuInfo,
        memory: {
          total: Math.round(memory.total / (1024 ** 3)),
          available: Math.round(memory.available / (1024 ** 3))
        },
        os: {
          platform: osInfo.platform,
          distro: osInfo.distro
        },
        backends: availableBackends,
        timestamp: Date.now()
      };
    } catch (error) {
      throw new Error(`PC capability detection failed: ${error.message}`);
    }
  }

  /**
   * Normalize a raw VRAM figure to whole gigabytes.
   *
   * Bug fix: the previous inline logic divided by 1024 first (assuming MB)
   * and only fell back to a bytes heuristic when the result was 0 — which
   * never happened for byte-sized inputs, yielding absurd GB values. Here
   * the unit is decided by magnitude before any division.
   *
   * @param {number|null|undefined} raw - VRAM as reported by the driver
   *   (MB on most platforms, bytes on some).
   * @returns {number} VRAM in GB, 0 when unknown.
   */
  normalizeVramGB(raw) {
    if (!raw || raw <= 0) return 0;
    // Anything above ~1e6 cannot plausibly be MB (would be >1000 GB),
    // so treat it as bytes.
    if (raw > 1e6) return Math.round(raw / (1024 ** 3));
    return Math.round(raw / 1024); // MB -> GB
  }

  /**
   * Classify GPU vendor, performance tier and capability estimates from the
   * (lowercased) vendor/model strings and detected VRAM.
   *
   * @param {string} vendor - lowercased vendor string.
   * @param {string} model - lowercased model string.
   * @param {number} vramGB - detected VRAM in GB (0 when unknown).
   * @returns {object} vendor/model/tier/vramGB/isIntegrated/computeUnits/
   *   memoryBandwidth/estimatedTFLOPS.
   */
  classifyGPU(vendor, model, vramGB) {
    const isNvidia = vendor.includes('nvidia') || model.includes('nvidia') || model.includes('geforce') || model.includes('rtx') || model.includes('gtx');
    const isAMD = vendor.includes('amd') || vendor.includes('ati') || model.includes('radeon') || model.includes('rx ');
    const isIntel = vendor.includes('intel') || model.includes('intel') || model.includes('arc') || model.includes('iris') || model.includes('uhd');

    let tier = 'entry';
    let type = 'unknown';
    let computeUnits = 0; // CUDA cores / stream processors / Xe cores
    let memoryBandwidth = 50; // GB/s default

    if (isNvidia) {
      type = 'nvidia';
      // Most-specific model substrings first; VRAM-based fallbacks last.
      if (model.includes('rtx 5090')) { tier = 'flagship'; computeUnits = 21760; memoryBandwidth = 1792; }
      else if (model.includes('rtx 5080')) { tier = 'high_end'; computeUnits = 10752; memoryBandwidth = 960; }
      else if (model.includes('rtx 5070')) { tier = 'upper_mid'; computeUnits = 6144; memoryBandwidth = 504; }
      else if (model.includes('rtx 4090')) { tier = 'flagship'; computeUnits = 16384; memoryBandwidth = 1008; }
      else if (model.includes('rtx 4080')) { tier = 'high_end'; computeUnits = 9728; memoryBandwidth = 716; }
      else if (model.includes('rtx 4070')) { tier = 'upper_mid'; computeUnits = 5888; memoryBandwidth = 448; }
      else if (model.includes('rtx 4060')) { tier = 'mid_range'; computeUnits = 3072; memoryBandwidth = 272; }
      else if (model.includes('rtx 30')) { tier = 'upper_mid'; computeUnits = 6000; memoryBandwidth = 500; }
      else if (model.includes('rtx 20')) { tier = 'mid_range'; computeUnits = 2000; memoryBandwidth = 300; }
      else if (model.includes('gtx 16')) { tier = 'budget'; computeUnits = 1500; memoryBandwidth = 200; }
      else if (vramGB >= 8) { tier = 'mid_range'; computeUnits = 2000; memoryBandwidth = 300; }
      else if (vramGB >= 4) { tier = 'budget'; computeUnits = 1000; memoryBandwidth = 150; }
    } else if (isAMD) {
      type = 'amd';
      if (model.includes('rx 7900')) { tier = 'high_end'; computeUnits = 6000; memoryBandwidth = 960; }
      else if (model.includes('rx 7800')) { tier = 'upper_mid'; computeUnits = 3840; memoryBandwidth = 624; }
      else if (model.includes('rx 7700')) { tier = 'mid_range'; computeUnits = 2560; memoryBandwidth = 432; }
      else if (model.includes('rx 7600')) { tier = 'budget'; computeUnits = 2048; memoryBandwidth = 288; }
      else if (model.includes('rx 6000')) { tier = 'mid_range'; computeUnits = 2000; memoryBandwidth = 400; }
      else if (vramGB >= 16) { tier = 'upper_mid'; computeUnits = 3000; memoryBandwidth = 500; }
      else if (vramGB >= 8) { tier = 'mid_range'; computeUnits = 2000; memoryBandwidth = 300; }
      else if (vramGB >= 4) { tier = 'budget'; computeUnits = 1000; memoryBandwidth = 200; }
    } else if (isIntel) {
      type = 'intel';
      if (model.includes('arc a770')) { tier = 'mid_range'; computeUnits = 512; memoryBandwidth = 560; }
      else if (model.includes('arc a750')) { tier = 'mid_range'; computeUnits = 448; memoryBandwidth = 512; }
      else if (model.includes('arc a580')) { tier = 'budget'; computeUnits = 384; memoryBandwidth = 448; }
      else if (model.includes('iris xe')) { tier = 'igpu'; computeUnits = 96; memoryBandwidth = 68; }
      else if (model.includes('uhd')) { tier = 'igpu'; computeUnits = 32; memoryBandwidth = 47; }
      else if (vramGB >= 8) { tier = 'mid_range'; computeUnits = 400; memoryBandwidth = 400; }
      else { tier = 'igpu'; computeUnits = 64; memoryBandwidth = 50; }
    }

    // iGPUs either classified as such above, or report 0 dedicated VRAM with
    // a tell-tale model substring (e.g. AMD "vega" APU graphics).
    const isIntegrated = tier === 'igpu' ||
      (vramGB === 0 && (model.includes('intel') || model.includes('integrated') ||
       model.includes('iris') || model.includes('uhd') || model.includes('vega')));

    return {
      vendor: type,
      model: model || 'Unknown GPU',
      tier,
      vramGB,
      isIntegrated,
      computeUnits,
      memoryBandwidth,
      estimatedTFLOPS: this.estimateGPUTFLOPS(type, tier, computeUnits)
    };
  }

  /**
   * Estimate GPU FP16 TFLOPS from compute-unit count and a per-vendor,
   * per-tier multiplier (rounded to one decimal).
   *
   * @param {string} type - 'nvidia' | 'amd' | 'intel' | 'unknown'.
   * @param {string} tier - performance tier from classifyGPU.
   * @param {number} computeUnits - vendor-specific compute-unit count.
   * @returns {number} estimated FP16 TFLOPS.
   */
  estimateGPUTFLOPS(type, tier, computeUnits) {
    const multipliers = {
      nvidia: {
        flagship: 0.010, // RTX 4090 ~165 TFLOPS
        high_end: 0.008, // RTX 4080 ~120 TFLOPS
        upper_mid: 0.006, // RTX 4070 ~65 TFLOPS
        mid_range: 0.004, // RTX 4060 ~32 TFLOPS
        budget: 0.002, // GTX 1660 ~15 TFLOPS
        igpu: 0.001
      },
      amd: {
        flagship: 0.008, // Similar to NVIDIA but slightly lower
        high_end: 0.006,
        upper_mid: 0.005,
        mid_range: 0.003,
        budget: 0.002,
        igpu: 0.001
      },
      intel: {
        mid_range: 0.003, // Arc A770 ~8 TFLOPS
        budget: 0.002, // Arc A580 ~6 TFLOPS
        igpu: 0.001 // Iris Xe ~2 TFLOPS
      }
    };

    const multiplier = multipliers[type]?.[tier] || 0.001;
    return Math.round(computeUnits * multiplier * 10) / 10;
  }

  /**
   * Analyze CPU capabilities relevant to LLM inference.
   *
   * @param {object} cpu - systeminformation si.cpu() result.
   * @returns {object} brand/model/cores/threads/baseSpeed/tier/features/
   *   estimatedGFLOPS.
   */
  analyzeCPUCapabilities(cpu) {
    const brand = (cpu.brand || '').toLowerCase();
    const model = (cpu.model || '').toLowerCase();
    const cores = cpu.physicalCores || cpu.cores || 1;
    const threads = cpu.cores || cores;
    const baseSpeed = cpu.speed || 2.0; // GHz

    // NOTE(review): generation-string heuristic — assumes 12th-gen-or-newer
    // Intel implies AVX-512, but consumer Alder Lake+ parts actually disable
    // AVX-512. Behavior kept for compatibility; confirm against real CPUID
    // feature flags before relying on this.
    const hasAVX512 = brand.includes('intel') &&
      (model.includes('12th') || model.includes('13th') || model.includes('14th') || model.includes('15th'));
    const hasAVX2 = brand.includes('intel') || brand.includes('amd');
    const hasVNNI = hasAVX512; // VNNI usually comes with AVX512

    // Rough tier bucketing by core count and base clock.
    let tier = 'entry';
    if (cores >= 16 && baseSpeed >= 3.0) tier = 'high_end';
    else if (cores >= 8 && baseSpeed >= 2.5) tier = 'mid_range';
    else if (cores >= 4 && baseSpeed >= 2.0) tier = 'budget';

    return {
      brand: brand,
      model: cpu.model || 'Unknown CPU',
      cores,
      threads,
      baseSpeed,
      tier,
      features: {
        avx512: hasAVX512,
        avx2: hasAVX2,
        vnni: hasVNNI
      },
      // FLOPs-per-cycle factor widens with the vector ISA (16/32/64).
      estimatedGFLOPS: cores * baseSpeed * (hasAVX512 ? 64 : hasAVX2 ? 32 : 16)
    };
  }

  /**
   * Probe the system for available inference backends by attempting the
   * vendor CLI tools (nvidia-smi, rocm-smi, sycl-ls, ...). Each probe
   * failure simply leaves that backend flag false.
   *
   * @param {string} platform - Node-style platform id ('linux', 'win32', ...).
   * @returns {Promise<object>} map of backend name -> boolean availability.
   */
  async detectAvailableBackends(platform) {
    const backends = {
      cuda: false,
      rocm: false,
      sycl: false,
      vulkan: false,
      directml: false,
      openvino: false,
      blas: true // Always assume basic BLAS
    };

    try {
      // NVIDIA CUDA: presence of a working nvidia-smi.
      try {
        await this.runCommand('nvidia-smi', ['--query-gpu=name', '--format=csv,noheader']);
        backends.cuda = true;
      } catch (e) {
        // CUDA not available
      }

      // AMD ROCm (Linux mainly).
      if (platform === 'linux') {
        try {
          await this.runCommand('rocm-smi', ['--showid']);
          backends.rocm = true;
        } catch (e) {
          // ROCm not available
        }
      }

      // Intel SYCL/oneAPI: try sycl-ls, fall back to oneapi-cli.
      try {
        await this.runCommand('sycl-ls', []);
        backends.sycl = true;
      } catch (e) {
        try {
          await this.runCommand('oneapi-cli', ['--version']);
          backends.sycl = true;
        } catch (e2) {
          // SYCL not available
        }
      }

      // DirectML ships with Windows.
      if (platform === 'win32') {
        backends.directml = true;
      }

      // Vulkan: no real probe yet — assume modern systems have it.
      // TODO: replace with a vulkaninfo check.
      backends.vulkan = true;

      // OpenVINO: its benchmark_app tool on PATH.
      try {
        await this.runCommand('benchmark_app', ['--help']);
        backends.openvino = true;
      } catch (e) {
        // OpenVINO not available
      }

    } catch (error) {
      console.warn('Backend detection failed:', error.message);
    }

    return backends;
  }

  /**
   * Choose the best backend for the detected hardware, in vendor-native
   * priority order (CUDA > ROCm > SYCL), then platform fallbacks
   * (DirectML, Vulkan, OpenVINO), then CPU ISA tiers.
   *
   * @param {object} hardware - result shape of detectPCCapabilities().
   * @returns {{backend: string, reason: string}}
   */
  chooseOptimalBackend(hardware) {
    const { gpu, backends, os } = hardware;

    if (gpu.vendor === 'nvidia' && backends.cuda) {
      return { backend: 'cuda', reason: 'NVIDIA GPU with CUDA support' };
    }

    if (gpu.vendor === 'amd' && backends.rocm && os.platform === 'linux') {
      return { backend: 'rocm', reason: 'AMD GPU with ROCm support (Linux)' };
    }

    if (gpu.vendor === 'intel' && backends.sycl) {
      return { backend: 'sycl', reason: 'Intel GPU with oneAPI/SYCL support' };
    }

    // Windows fallback for discrete GPUs without a native stack.
    if (os.platform === 'win32' && backends.directml && !gpu.isIntegrated) {
      return { backend: 'directml', reason: 'Windows DirectML fallback' };
    }

    // Cross-platform fallback for discrete GPUs.
    if (backends.vulkan && !gpu.isIntegrated) {
      return { backend: 'vulkan', reason: 'Vulkan cross-platform acceleration' };
    }

    if (gpu.vendor === 'intel' && backends.openvino) {
      return { backend: 'openvino', reason: 'Intel OpenVINO optimization' };
    }

    // CPU fallback, best available vector ISA first.
    if (hardware.cpu.features.avx512) {
      return { backend: 'cpu_avx512', reason: 'CPU with AVX512 acceleration' };
    } else if (hardware.cpu.features.avx2) {
      return { backend: 'cpu_avx2', reason: 'CPU with AVX2 acceleration' };
    } else {
      return { backend: 'cpu_basic', reason: 'Basic CPU inference' };
    }
  }

  /**
   * Pick a quantization and GPU-offload strategy that fits the machine.
   *
   * Quantizations are tried quality-first (Q6_K down to Q2_K); for each one
   * we prefer full-GPU residency, then partial offload (>=15% of the model
   * on GPU), then CPU-only RAM residency. If nothing fits, the most
   * aggressive quantization is returned with a warning.
   *
   * @param {number} modelSizeB - model size in billions of parameters.
   * @param {object} hardware - result shape of detectPCCapabilities().
   * @param {number} [contextLength=4096] - context window in tokens
   *   (drives the KV-cache estimate).
   * @returns {object} { quantization, gpuLayers, memoryUsage, strategy,
   *   warning? } where gpuLayers is -1 (all), 0 (none) or a layer count.
   */
  pickQuantAndOffload(modelSizeB, hardware, contextLength = 4096) {
    const { gpu, memory } = hardware;
    const vramGB = gpu.vramGB || 0;
    const ramGB = memory.total;

    // KV cache scales with context length; bucket by model size.
    const kvSizeKey = modelSizeB <= 8 ? 7 : modelSizeB <= 15 ? 13 : modelSizeB <= 40 ? 33 : 70;
    const kvCacheGB = this.kvCacheEstimates[kvSizeKey] * (contextLength / 1000);

    // Safety margins: leave headroom for the OS/driver.
    const usableVRAM = vramGB * 0.9; // 90% usable VRAM
    const usableRAM = ramGB * 0.8; // 80% usable RAM

    const quantOptions = ['Q6_K', 'Q5_K_M', 'Q4_K_M', 'Q3_K', 'Q2_K'];

    for (const quant of quantOptions) {
      const modelSizeGB = modelSizeB * this.bytesPerParam[quant];
      const totalNeed = modelSizeGB + kvCacheGB;

      // Fully resident in VRAM (discrete GPUs only).
      if (totalNeed <= usableVRAM && !gpu.isIntegrated) {
        return {
          quantization: quant,
          gpuLayers: -1, // Full GPU
          memoryUsage: {
            vram: totalNeed,
            ram: 0,
            total: totalNeed
          },
          strategy: 'full_gpu'
        };
      }

      // Partial offload when some VRAM exists but not enough.
      if (usableVRAM > 0 && !gpu.isIntegrated) {
        const vramRatio = Math.min(0.85, usableVRAM / totalNeed);
        if (vramRatio > 0.15) { // Minimum 15% on GPU to be worthwhile
          const ramNeed = totalNeed - usableVRAM;
          if (ramNeed <= usableRAM) {
            // Rough heuristic: assume ~80 transformer layers.
            const estimatedLayers = Math.max(16, Math.floor(80 * vramRatio));
            return {
              quantization: quant,
              gpuLayers: estimatedLayers,
              memoryUsage: {
                vram: usableVRAM,
                ram: ramNeed,
                total: totalNeed
              },
              strategy: 'partial_offload'
            };
          }
        }
      }

      // CPU-only RAM residency.
      if (totalNeed <= usableRAM) {
        return {
          quantization: quant,
          gpuLayers: 0, // CPU only
          memoryUsage: {
            vram: 0,
            ram: totalNeed,
            total: totalNeed
          },
          strategy: 'cpu_only'
        };
      }
    }

    // Nothing fit: fall back to the most aggressive quantization.
    const modelSizeGB = modelSizeB * this.bytesPerParam['Q2_K'];
    return {
      quantization: 'Q2_K',
      gpuLayers: 0,
      memoryUsage: {
        vram: 0,
        ram: modelSizeGB + kvCacheGB,
        total: modelSizeGB + kvCacheGB
      },
      strategy: 'aggressive_quant',
      warning: 'Model may not fit comfortably in available memory'
    };
  }

  /**
   * Predict tokens/sec for a given model + configuration on this hardware.
   *
   * Heuristic model: backend baseline coefficient, scaled inversely by model
   * size, times quantization and offload-strategy factors, with small
   * vendor-specific adjustments. Floor of 1 t/s; two decimals.
   *
   * @param {number} modelSizeB - model size in billions of parameters.
   * @param {object} config - result of pickQuantAndOffload().
   * @param {object} hardware - result shape of detectPCCapabilities().
   * @returns {number} estimated tokens/sec.
   */
  predictInferenceSpeed(modelSizeB, config, hardware) {
    const { quantization, gpuLayers, strategy } = config;
    const backend = this.chooseOptimalBackend(hardware).backend;

    const baseK = this.backendCoefficients[backend] || this.backendCoefficients.cpu_basic;

    // Throughput falls roughly linearly with parameter count.
    let throughput = baseK / modelSizeB;

    const quantFactor = this.quantFactors[quantization] || 1.0;
    throughput *= quantFactor;

    // Penalize configurations that keep weights off the GPU.
    let offloadFactor = 1.0;
    if (strategy === 'partial_offload') {
      const totalLayers = 80; // Rough estimate
      const gpuRatio = gpuLayers / totalLayers;
      offloadFactor = 0.5 + (gpuRatio * 0.4); // 0.5 to 0.9 range
    } else if (strategy === 'cpu_only') {
      offloadFactor = 0.6; // CPU penalty compared to GPU
    } else if (strategy === 'aggressive_quant') {
      offloadFactor = 0.4; // Heavy memory pressure penalty
    }

    throughput *= offloadFactor;

    // Vendor-specific adjustments.
    if (hardware.gpu.vendor === 'nvidia' && backend === 'cuda') {
      throughput *= 1.1; // NVIDIA optimization bonus
    } else if (hardware.gpu.vendor === 'amd' && backend === 'rocm') {
      throughput *= 0.9; // AMD slight penalty vs NVIDIA
    } else if (hardware.gpu.isIntegrated) {
      throughput *= 0.7; // Integrated GPU penalty
    }

    return Math.max(1, Math.round(throughput * 100) / 100);
  }

  /**
   * Generate up to five hardware-appropriate model recommendations, each
   * annotated with backend, quant/offload config, estimated speed and
   * ready-to-run commands.
   *
   * @param {object} hardware - result shape of detectPCCapabilities().
   * @returns {{capability: string, backend: object, recommendations: object[]}}
   */
  generateRecommendations(hardware) {
    const recommendations = [];
    const { gpu, memory, cpu } = hardware;

    // Bucket the machine: VRAM first, then total RAM.
    let capability = 'basic';
    if (gpu.vramGB >= 24) capability = 'high_end';
    else if (gpu.vramGB >= 12) capability = 'enthusiast';
    else if (gpu.vramGB >= 6) capability = 'gaming';
    else if (memory.total >= 32) capability = 'workstation';
    else if (memory.total >= 16) capability = 'standard';

    // Suggested model sizes / quants per capability bucket.
    const recommendationsByCapability = {
      high_end: [
        { model: '70B', quant: 'Q4_K_M', reason: 'Large model with good quality' },
        { model: '33B', quant: 'Q5_K_M', reason: 'Excellent balance of size and quality' },
        { model: '13B', quant: 'Q6_K', reason: 'High quality medium model' }
      ],
      enthusiast: [
        { model: '33B', quant: 'Q4_K_M', reason: 'Large model fits in VRAM' },
        { model: '13B', quant: 'Q5_K_M', reason: 'Optimal size for your GPU' },
        { model: '7B', quant: 'Q6_K', reason: 'Fast high-quality option' }
      ],
      gaming: [
        { model: '13B', quant: 'Q4_K_M', reason: 'Good balance for gaming GPU' },
        { model: '7B', quant: 'Q5_K_M', reason: 'Recommended for your VRAM' },
        { model: '3B', quant: 'Q6_K', reason: 'Fast inference option' }
      ],
      workstation: [
        { model: '33B', quant: 'Q4_K_M', reason: 'CPU offload with large RAM' },
        { model: '13B', quant: 'Q5_K_M', reason: 'Good CPU performance' },
        { model: '7B', quant: 'Q6_K', reason: 'Reliable CPU inference' }
      ],
      standard: [
        { model: '13B', quant: 'Q4_K_M', reason: 'Fits in 16GB RAM' },
        { model: '7B', quant: 'Q5_K_M', reason: 'Optimal for your system' },
        { model: '3B', quant: 'Q6_K', reason: 'Fast and efficient' }
      ],
      basic: [
        { model: '7B', quant: 'Q4_K_M', reason: 'Recommended for limited hardware' },
        { model: '3B', quant: 'Q5_K_M', reason: 'Good performance on basic systems' },
        { model: '1B', quant: 'Q6_K', reason: 'Ultra-fast option' }
      ]
    };

    const recs = recommendationsByCapability[capability] || recommendationsByCapability.basic;

    const optimalBackend = this.chooseOptimalBackend(hardware);

    recs.forEach(rec => {
      const modelSizeB = parseFloat(rec.model); // '70B' -> 70
      const config = this.pickQuantAndOffload(modelSizeB, hardware);
      const speed = this.predictInferenceSpeed(modelSizeB, config, hardware);

      recommendations.push({
        ...rec,
        backend: optimalBackend.backend,
        config,
        estimatedSpeed: speed,
        command: this.generateCommand(rec.model, config, optimalBackend.backend)
      });
    });

    return {
      capability,
      backend: optimalBackend,
      recommendations: recommendations.slice(0, 5) // Top 5
    };
  }

  /**
   * Build example CLI invocations (llama.cpp and Ollama styles) for a
   * recommended configuration.
   *
   * @param {string} modelSize - label such as '7B' or '13B'.
   * @param {object} config - result of pickQuantAndOffload().
   * @param {string} backend - chosen backend id (currently informational).
   * @returns {{llamacpp: string, ollama: string, description: string}}
   */
  generateCommand(modelSize, config, backend) {
    const { quantization, gpuLayers } = config;

    // llama.cpp style command
    let cmd = 'llama-cli -m model.gguf';

    if (gpuLayers === -1) {
      cmd += ' -ngl -1'; // Full GPU
    } else if (gpuLayers > 0) {
      cmd += ` -ngl ${gpuLayers}`; // Partial offload
    }
    // No -ngl flag for CPU-only

    cmd += ' -t 8 -c 4096'; // 8 threads, 4k context

    return {
      llamacpp: cmd,
      ollama: `ollama run model:${modelSize.toLowerCase()}-${quantization.toLowerCase().replace('_', '-')}`,
      description: `${modelSize} model with ${quantization} quantization (${config.strategy})`
    };
  }

  /**
   * Run a shell command with a timeout, resolving with trimmed stdout.
   *
   * Bug fix: the previous version could settle the Promise twice — the
   * timeout rejected, then proc.kill() triggered 'close' which
   * rejected/resolved again. A single-settle guard now ensures exactly one
   * outcome and that the timer is always cleared.
   *
   * @param {string} command - executable name.
   * @param {string[]} args - argument vector.
   * @param {number} [timeout=5000] - kill-and-reject deadline in ms.
   * @returns {Promise<string>} trimmed stdout on exit code 0.
   */
  async runCommand(command, args, timeout = 5000) {
    return new Promise((resolve, reject) => {
      const proc = spawn(command, args, { stdio: 'pipe' });
      let output = '';
      let error = '';
      let settled = false;

      // Settle at most once and always clear the timer.
      const finish = (fn, value) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        fn(value);
      };

      const timer = setTimeout(() => {
        proc.kill();
        finish(reject, new Error('Command timeout'));
      }, timeout);

      proc.stdout.on('data', (data) => { output += data; });
      proc.stderr.on('data', (data) => { error += data; });

      proc.on('close', (code) => {
        if (code === 0) {
          finish(resolve, output.trim());
        } else {
          finish(reject, new Error(`Command failed: ${error || 'Unknown error'}`));
        }
      });

      proc.on('error', (err) => {
        finish(reject, err);
      });
    });
  }
}
634
// CommonJS export: consumers instantiate PCOptimizer directly.
module.exports = PCOptimizer;