llm-checker 3.4.2 → 3.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -10
- package/analyzer/performance.js +40 -94
- package/bin/enhanced_cli.js +320 -254
- package/bin/mcp-server.mjs +0 -0
- package/package.json +1 -1
- package/src/models/ai-check-selector.js +2 -2
- package/src/models/deterministic-selector.js +1 -0
- package/src/models/expanded_database.js +10 -83
- package/src/ollama/client.js +29 -4
- package/src/ui/cli-theme.js +733 -0
- package/src/ui/interactive-panel.js +599 -0
- package/src/utils/fetch.js +17 -0
- package/src/utils/token-speed-estimator.js +207 -0
- package/src/ollama/gpu-placement-planner.js +0 -496
package/src/models/deterministic-selector.js
CHANGED
|
@@ -9,6 +9,7 @@ const fs = require('fs');
|
|
|
9
9
|
const path = require('path');
|
|
10
10
|
const os = require('os');
|
|
11
11
|
const { spawn } = require('child_process');
|
|
12
|
+
const fetch = require('../utils/fetch');
|
|
12
13
|
const { DETERMINISTIC_WEIGHTS } = require('./scoring-config');
|
|
13
14
|
const {
|
|
14
15
|
parseBillionsValue: parseMoEBillionsValue,
|
package/src/models/expanded_database.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
const { estimateTokenSpeedFromHardware } = require('../utils/token-speed-estimator');
|
|
2
|
+
|
|
1
3
|
class ExpandedModelsDatabase {
|
|
2
4
|
constructor() {
|
|
3
5
|
this.models = this.initializeExpandedModels();
|
|
@@ -958,88 +960,12 @@ class ExpandedModelsDatabase {
|
|
|
958
960
|
}
|
|
959
961
|
|
|
960
962
|
calculateRealisticTokensPerSecond(model, hardware) {
|
|
961
|
-
// Extract model parameters from name or size
|
|
962
963
|
const modelParams = this.extractModelParams(model);
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
const baseSpeed = hardware.cpu?.speed || 2.4;
|
|
969
|
-
const vramGB = hardware.gpu?.vram || 0;
|
|
970
|
-
|
|
971
|
-
// Check hardware type
|
|
972
|
-
const isAppleSilicon = process.platform === 'darwin' && (
|
|
973
|
-
gpuModel.toLowerCase().includes('apple') ||
|
|
974
|
-
gpuModel.toLowerCase().includes('m1') ||
|
|
975
|
-
gpuModel.toLowerCase().includes('m2') ||
|
|
976
|
-
gpuModel.toLowerCase().includes('m3') ||
|
|
977
|
-
gpuModel.toLowerCase().includes('m4')
|
|
978
|
-
);
|
|
979
|
-
const isIntegratedGPU = /iris.*xe|iris.*graphics|uhd.*graphics|vega.*integrated|radeon.*graphics/i.test(gpuModel);
|
|
980
|
-
const hasDedicatedGPU = vramGB > 0 && !isIntegratedGPU && !isAppleSilicon;
|
|
981
|
-
|
|
982
|
-
let tokensPerSecond;
|
|
983
|
-
|
|
984
|
-
if (isAppleSilicon) {
|
|
985
|
-
// Apple Silicon unified memory - more optimistic but realistic
|
|
986
|
-
let baseTPS = 25;
|
|
987
|
-
if (gpuModel.toLowerCase().includes('m4 pro')) baseTPS = 35;
|
|
988
|
-
else if (gpuModel.toLowerCase().includes('m4')) baseTPS = 30;
|
|
989
|
-
else if (gpuModel.toLowerCase().includes('m3 pro')) baseTPS = 30;
|
|
990
|
-
else if (gpuModel.toLowerCase().includes('m3')) baseTPS = 25;
|
|
991
|
-
else if (gpuModel.toLowerCase().includes('m2 pro')) baseTPS = 28;
|
|
992
|
-
else if (gpuModel.toLowerCase().includes('m2')) baseTPS = 22;
|
|
993
|
-
else if (gpuModel.toLowerCase().includes('m1 pro')) baseTPS = 25;
|
|
994
|
-
else if (gpuModel.toLowerCase().includes('m1')) baseTPS = 20;
|
|
995
|
-
|
|
996
|
-
// Scale by model size (Apple Silicon handles larger models better)
|
|
997
|
-
tokensPerSecond = Math.max(8, Math.round(baseTPS / Math.max(0.8, modelParams)));
|
|
998
|
-
|
|
999
|
-
} else if (hasDedicatedGPU) {
|
|
1000
|
-
// Dedicated GPU - much better performance
|
|
1001
|
-
let gpuTPS = 30;
|
|
1002
|
-
if (gpuModel.toLowerCase().includes('gb10') ||
|
|
1003
|
-
gpuModel.toLowerCase().includes('grace blackwell') ||
|
|
1004
|
-
gpuModel.toLowerCase().includes('dgx spark')) gpuTPS = 90;
|
|
1005
|
-
else if (gpuModel.toLowerCase().includes('h100')) gpuTPS = 120;
|
|
1006
|
-
else if (gpuModel.toLowerCase().includes('a100')) gpuTPS = 95;
|
|
1007
|
-
else if (gpuModel.toLowerCase().includes('rtx 50')) gpuTPS = 65;
|
|
1008
|
-
else if (gpuModel.toLowerCase().includes('rtx 40')) gpuTPS = 50;
|
|
1009
|
-
else if (gpuModel.toLowerCase().includes('rtx 30')) gpuTPS = 40;
|
|
1010
|
-
else if (gpuModel.toLowerCase().includes('rtx 20')) gpuTPS = 30;
|
|
1011
|
-
else if (gpuModel.toLowerCase().includes('p100')) gpuTPS = 32;
|
|
1012
|
-
else if (vramGB >= 16) gpuTPS = 45;
|
|
1013
|
-
else if (vramGB >= 8) gpuTPS = 35;
|
|
1014
|
-
else if (vramGB >= 4) gpuTPS = 25;
|
|
1015
|
-
|
|
1016
|
-
// Scale by model size for GPU
|
|
1017
|
-
tokensPerSecond = Math.max(10, Math.round(gpuTPS / Math.max(0.5, modelParams)));
|
|
1018
|
-
|
|
1019
|
-
} else {
|
|
1020
|
-
// CPU-only or integrated GPU - most realistic and conservative
|
|
1021
|
-
const hasAVX512 = cpuModel.toLowerCase().includes('intel') &&
|
|
1022
|
-
(cpuModel.includes('12th') || cpuModel.includes('13th') || cpuModel.includes('14th'));
|
|
1023
|
-
const hasAVX2 = cpuModel.toLowerCase().includes('intel') || cpuModel.toLowerCase().includes('amd');
|
|
1024
|
-
|
|
1025
|
-
// Base CPU coefficient - much more conservative
|
|
1026
|
-
let cpuK = 1.8; // Conservative baseline
|
|
1027
|
-
if (hasAVX512) cpuK = 2.6;
|
|
1028
|
-
else if (hasAVX2) cpuK = 2.2;
|
|
1029
|
-
|
|
1030
|
-
// iGPU boost (small)
|
|
1031
|
-
const iGpuMultiplier = isIntegratedGPU ? 1.3 : 1.0;
|
|
1032
|
-
|
|
1033
|
-
// Calculate with realistic threading limits
|
|
1034
|
-
const effectiveThreads = Math.min(cores, 8); // Diminishing returns after 8 threads
|
|
1035
|
-
const baseTPS = (cpuK * baseSpeed * effectiveThreads * iGpuMultiplier) / Math.max(1.5, modelParams);
|
|
1036
|
-
|
|
1037
|
-
// Apply realistic CPU limits
|
|
1038
|
-
const maxCPUTPS = hasAVX512 ? 25 : (isIntegratedGPU ? 20 : 15);
|
|
1039
|
-
tokensPerSecond = Math.max(2, Math.min(maxCPUTPS, Math.round(baseTPS)));
|
|
1040
|
-
}
|
|
1041
|
-
|
|
1042
|
-
return tokensPerSecond;
|
|
964
|
+
const speedProfile = estimateTokenSpeedFromHardware(hardware, {
|
|
965
|
+
modelSizeB: modelParams,
|
|
966
|
+
modelName: model.name
|
|
967
|
+
});
|
|
968
|
+
return speedProfile.tokensPerSecond;
|
|
1043
969
|
}
|
|
1044
970
|
|
|
1045
971
|
extractModelParams(model) {
|
|
@@ -1047,11 +973,12 @@ class ExpandedModelsDatabase {
|
|
|
1047
973
|
const name = model.name.toLowerCase();
|
|
1048
974
|
|
|
1049
975
|
// Look for patterns like "7b", "3.8b", "0.5b", etc.
|
|
1050
|
-
const paramMatch = name.match(/(\d+\.?\d*)[bm](?:\s|$)/);
|
|
976
|
+
const paramMatch = name.match(/(\d+\.?\d*)\s*([bm])(?:\s|$)/);
|
|
1051
977
|
if (paramMatch) {
|
|
1052
978
|
const value = parseFloat(paramMatch[1]);
|
|
979
|
+
const unit = paramMatch[2].toLowerCase();
|
|
1053
980
|
// Convert millions to billions if needed
|
|
1054
|
-
return
|
|
981
|
+
return unit === 'm' ? value / 1000 : value;
|
|
1055
982
|
}
|
|
1056
983
|
|
|
1057
984
|
// Fallback to size-based estimation
|
package/src/ollama/client.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
const fetch = require('
|
|
1
|
+
const fetch = require('../utils/fetch');
|
|
2
2
|
|
|
3
3
|
class OllamaClient {
|
|
4
4
|
constructor(baseURL = null) {
|
|
@@ -377,6 +377,29 @@ class OllamaClient {
|
|
|
377
377
|
}
|
|
378
378
|
}
|
|
379
379
|
|
|
380
|
+
calculateTokensPerSecond(data, totalTimeMs) {
|
|
381
|
+
const evalCount = Number(data?.eval_count) || 0;
|
|
382
|
+
const evalDurationNs = Number(data?.eval_duration) || 0;
|
|
383
|
+
const totalSeconds = Math.max(0, Number(totalTimeMs) || 0) / 1000;
|
|
384
|
+
|
|
385
|
+
const evalTokensPerSecond = evalDurationNs > 0 && evalCount > 0
|
|
386
|
+
? (evalCount / (evalDurationNs / 1_000_000_000))
|
|
387
|
+
: 0;
|
|
388
|
+
|
|
389
|
+
const endToEndTokensPerSecond = totalSeconds > 0 && evalCount > 0
|
|
390
|
+
? (evalCount / totalSeconds)
|
|
391
|
+
: 0;
|
|
392
|
+
|
|
393
|
+
// Prefer eval-only throughput when available because it excludes load/setup overhead.
|
|
394
|
+
const preferred = evalTokensPerSecond > 0 ? evalTokensPerSecond : endToEndTokensPerSecond;
|
|
395
|
+
|
|
396
|
+
return {
|
|
397
|
+
tokensPerSecond: Math.round(preferred * 10) / 10,
|
|
398
|
+
evalTokensPerSecond: Math.round(evalTokensPerSecond * 10) / 10,
|
|
399
|
+
endToEndTokensPerSecond: Math.round(endToEndTokensPerSecond * 10) / 10
|
|
400
|
+
};
|
|
401
|
+
}
|
|
402
|
+
|
|
380
403
|
async testModelPerformance(modelName, testPrompt = "Hello, how are you?") {
|
|
381
404
|
const availability = await this.checkOllamaAvailability();
|
|
382
405
|
if (!availability.available) {
|
|
@@ -413,13 +436,15 @@ class OllamaClient {
|
|
|
413
436
|
const endTime = Date.now();
|
|
414
437
|
|
|
415
438
|
const totalTime = endTime - startTime;
|
|
416
|
-
const tokensGenerated = data.eval_count ||
|
|
417
|
-
const
|
|
439
|
+
const tokensGenerated = Number(data.eval_count) || 0;
|
|
440
|
+
const speed = this.calculateTokensPerSecond(data, totalTime);
|
|
418
441
|
|
|
419
442
|
return {
|
|
420
443
|
success: true,
|
|
421
444
|
responseTime: totalTime,
|
|
422
|
-
tokensPerSecond,
|
|
445
|
+
tokensPerSecond: speed.tokensPerSecond,
|
|
446
|
+
evalTokensPerSecond: speed.evalTokensPerSecond,
|
|
447
|
+
endToEndTokensPerSecond: speed.endToEndTokensPerSecond,
|
|
423
448
|
tokensGenerated,
|
|
424
449
|
loadTime: data.load_duration ? Math.round(data.load_duration / 1000000) : null,
|
|
425
450
|
evalTime: data.eval_duration ? Math.round(data.eval_duration / 1000000) : null,
|