llm-checker 3.4.2 → 3.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ const fs = require('fs');
9
9
  const path = require('path');
10
10
  const os = require('os');
11
11
  const { spawn } = require('child_process');
12
+ const fetch = require('../utils/fetch');
12
13
  const { DETERMINISTIC_WEIGHTS } = require('./scoring-config');
13
14
  const {
14
15
  parseBillionsValue: parseMoEBillionsValue,
@@ -1,3 +1,5 @@
1
+ const { estimateTokenSpeedFromHardware } = require('../utils/token-speed-estimator');
2
+
1
3
  class ExpandedModelsDatabase {
2
4
  constructor() {
3
5
  this.models = this.initializeExpandedModels();
@@ -958,88 +960,12 @@ class ExpandedModelsDatabase {
958
960
  }
959
961
 
960
962
  calculateRealisticTokensPerSecond(model, hardware) {
961
- // Extract model parameters from name or size
962
963
  const modelParams = this.extractModelParams(model);
963
-
964
- // Get hardware specifics
965
- const cpuModel = hardware.cpu?.brand || hardware.cpu?.model || '';
966
- const gpuModel = hardware.gpu?.model || '';
967
- const cores = hardware.cpu?.physicalCores || hardware.cpu?.cores || 1;
968
- const baseSpeed = hardware.cpu?.speed || 2.4;
969
- const vramGB = hardware.gpu?.vram || 0;
970
-
971
- // Check hardware type
972
- const isAppleSilicon = process.platform === 'darwin' && (
973
- gpuModel.toLowerCase().includes('apple') ||
974
- gpuModel.toLowerCase().includes('m1') ||
975
- gpuModel.toLowerCase().includes('m2') ||
976
- gpuModel.toLowerCase().includes('m3') ||
977
- gpuModel.toLowerCase().includes('m4')
978
- );
979
- const isIntegratedGPU = /iris.*xe|iris.*graphics|uhd.*graphics|vega.*integrated|radeon.*graphics/i.test(gpuModel);
980
- const hasDedicatedGPU = vramGB > 0 && !isIntegratedGPU && !isAppleSilicon;
981
-
982
- let tokensPerSecond;
983
-
984
- if (isAppleSilicon) {
985
- // Apple Silicon unified memory - more optimistic but realistic
986
- let baseTPS = 25;
987
- if (gpuModel.toLowerCase().includes('m4 pro')) baseTPS = 35;
988
- else if (gpuModel.toLowerCase().includes('m4')) baseTPS = 30;
989
- else if (gpuModel.toLowerCase().includes('m3 pro')) baseTPS = 30;
990
- else if (gpuModel.toLowerCase().includes('m3')) baseTPS = 25;
991
- else if (gpuModel.toLowerCase().includes('m2 pro')) baseTPS = 28;
992
- else if (gpuModel.toLowerCase().includes('m2')) baseTPS = 22;
993
- else if (gpuModel.toLowerCase().includes('m1 pro')) baseTPS = 25;
994
- else if (gpuModel.toLowerCase().includes('m1')) baseTPS = 20;
995
-
996
- // Scale by model size (Apple Silicon handles larger models better)
997
- tokensPerSecond = Math.max(8, Math.round(baseTPS / Math.max(0.8, modelParams)));
998
-
999
- } else if (hasDedicatedGPU) {
1000
- // Dedicated GPU - much better performance
1001
- let gpuTPS = 30;
1002
- if (gpuModel.toLowerCase().includes('gb10') ||
1003
- gpuModel.toLowerCase().includes('grace blackwell') ||
1004
- gpuModel.toLowerCase().includes('dgx spark')) gpuTPS = 90;
1005
- else if (gpuModel.toLowerCase().includes('h100')) gpuTPS = 120;
1006
- else if (gpuModel.toLowerCase().includes('a100')) gpuTPS = 95;
1007
- else if (gpuModel.toLowerCase().includes('rtx 50')) gpuTPS = 65;
1008
- else if (gpuModel.toLowerCase().includes('rtx 40')) gpuTPS = 50;
1009
- else if (gpuModel.toLowerCase().includes('rtx 30')) gpuTPS = 40;
1010
- else if (gpuModel.toLowerCase().includes('rtx 20')) gpuTPS = 30;
1011
- else if (gpuModel.toLowerCase().includes('p100')) gpuTPS = 32;
1012
- else if (vramGB >= 16) gpuTPS = 45;
1013
- else if (vramGB >= 8) gpuTPS = 35;
1014
- else if (vramGB >= 4) gpuTPS = 25;
1015
-
1016
- // Scale by model size for GPU
1017
- tokensPerSecond = Math.max(10, Math.round(gpuTPS / Math.max(0.5, modelParams)));
1018
-
1019
- } else {
1020
- // CPU-only or integrated GPU - most realistic and conservative
1021
- const hasAVX512 = cpuModel.toLowerCase().includes('intel') &&
1022
- (cpuModel.includes('12th') || cpuModel.includes('13th') || cpuModel.includes('14th'));
1023
- const hasAVX2 = cpuModel.toLowerCase().includes('intel') || cpuModel.toLowerCase().includes('amd');
1024
-
1025
- // Base CPU coefficient - much more conservative
1026
- let cpuK = 1.8; // Conservative baseline
1027
- if (hasAVX512) cpuK = 2.6;
1028
- else if (hasAVX2) cpuK = 2.2;
1029
-
1030
- // iGPU boost (small)
1031
- const iGpuMultiplier = isIntegratedGPU ? 1.3 : 1.0;
1032
-
1033
- // Calculate with realistic threading limits
1034
- const effectiveThreads = Math.min(cores, 8); // Diminishing returns after 8 threads
1035
- const baseTPS = (cpuK * baseSpeed * effectiveThreads * iGpuMultiplier) / Math.max(1.5, modelParams);
1036
-
1037
- // Apply realistic CPU limits
1038
- const maxCPUTPS = hasAVX512 ? 25 : (isIntegratedGPU ? 20 : 15);
1039
- tokensPerSecond = Math.max(2, Math.min(maxCPUTPS, Math.round(baseTPS)));
1040
- }
1041
-
1042
- return tokensPerSecond;
964
+ const speedProfile = estimateTokenSpeedFromHardware(hardware, {
965
+ modelSizeB: modelParams,
966
+ modelName: model.name
967
+ });
968
+ return speedProfile.tokensPerSecond;
1043
969
  }
1044
970
 
1045
971
  extractModelParams(model) {
@@ -1047,11 +973,12 @@ class ExpandedModelsDatabase {
1047
973
  const name = model.name.toLowerCase();
1048
974
 
1049
975
  // Look for patterns like "7b", "3.8b", "0.5b", etc.
1050
- const paramMatch = name.match(/(\d+\.?\d*)[bm](?:\s|$)/);
976
+ const paramMatch = name.match(/(\d+\.?\d*)\s*([bm])(?:\s|$)/);
1051
977
  if (paramMatch) {
1052
978
  const value = parseFloat(paramMatch[1]);
979
+ const unit = paramMatch[2].toLowerCase();
1053
980
  // Convert millions to billions if needed
1054
- return paramMatch[1].includes('m') ? value / 1000 : value;
981
+ return unit === 'm' ? value / 1000 : value;
1055
982
  }
1056
983
 
1057
984
  // Fallback to size-based estimation
@@ -1,4 +1,4 @@
1
- const fetch = require('node-fetch');
1
+ const fetch = require('../utils/fetch');
2
2
 
3
3
  class OllamaClient {
4
4
  constructor(baseURL = null) {
@@ -377,6 +377,29 @@ class OllamaClient {
377
377
  }
378
378
  }
379
379
 
380
+ calculateTokensPerSecond(data, totalTimeMs) {
381
+ const evalCount = Number(data?.eval_count) || 0;
382
+ const evalDurationNs = Number(data?.eval_duration) || 0;
383
+ const totalSeconds = Math.max(0, Number(totalTimeMs) || 0) / 1000;
384
+
385
+ const evalTokensPerSecond = evalDurationNs > 0 && evalCount > 0
386
+ ? (evalCount / (evalDurationNs / 1_000_000_000))
387
+ : 0;
388
+
389
+ const endToEndTokensPerSecond = totalSeconds > 0 && evalCount > 0
390
+ ? (evalCount / totalSeconds)
391
+ : 0;
392
+
393
+ // Prefer eval-only throughput when available because it excludes load/setup overhead.
394
+ const preferred = evalTokensPerSecond > 0 ? evalTokensPerSecond : endToEndTokensPerSecond;
395
+
396
+ return {
397
+ tokensPerSecond: Math.round(preferred * 10) / 10,
398
+ evalTokensPerSecond: Math.round(evalTokensPerSecond * 10) / 10,
399
+ endToEndTokensPerSecond: Math.round(endToEndTokensPerSecond * 10) / 10
400
+ };
401
+ }
402
+
380
403
  async testModelPerformance(modelName, testPrompt = "Hello, how are you?") {
381
404
  const availability = await this.checkOllamaAvailability();
382
405
  if (!availability.available) {
@@ -413,13 +436,15 @@ class OllamaClient {
413
436
  const endTime = Date.now();
414
437
 
415
438
  const totalTime = endTime - startTime;
416
- const tokensGenerated = data.eval_count || 50;
417
- const tokensPerSecond = Math.round((tokensGenerated / (totalTime / 1000)) * 10) / 10;
439
+ const tokensGenerated = Number(data.eval_count) || 0;
440
+ const speed = this.calculateTokensPerSecond(data, totalTime);
418
441
 
419
442
  return {
420
443
  success: true,
421
444
  responseTime: totalTime,
422
- tokensPerSecond,
445
+ tokensPerSecond: speed.tokensPerSecond,
446
+ evalTokensPerSecond: speed.evalTokensPerSecond,
447
+ endToEndTokensPerSecond: speed.endToEndTokensPerSecond,
423
448
  tokensGenerated,
424
449
  loadTime: data.load_duration ? Math.round(data.load_duration / 1000000) : null,
425
450
  evalTime: data.eval_duration ? Math.round(data.eval_duration / 1000000) : null,