llm-checker 3.5.14 → 3.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/analyzer/compatibility.js +5 -0
- package/analyzer/performance.js +5 -4
- package/bin/cli.js +5 -39
- package/bin/enhanced_cli.js +88 -19
- package/bin/mcp-server.mjs +266 -101
- package/package.json +7 -7
- package/src/ai/multi-objective-selector.js +118 -11
- package/src/calibration/calibration-manager.js +4 -1
- package/src/data/model-database.js +39 -5
- package/src/data/sync-manager.js +32 -18
- package/src/hardware/backends/apple-silicon.js +5 -1
- package/src/hardware/backends/cuda-detector.js +47 -19
- package/src/hardware/backends/intel-detector.js +6 -2
- package/src/hardware/backends/rocm-detector.js +6 -2
- package/src/hardware/detector.js +57 -30
- package/src/hardware/unified-detector.js +129 -25
- package/src/models/ai-check-selector.js +36 -5
- package/src/models/deterministic-selector.js +163 -15
- package/src/models/expanded_database.js +9 -5
- package/src/models/intelligent-selector.js +87 -1
- package/src/models/requirements.js +16 -11
- package/src/models/scoring-core.js +341 -0
- package/src/models/scoring-engine.js +9 -2
- package/src/ollama/capacity-planner.js +15 -2
- package/src/ollama/client.js +70 -30
- package/src/ollama/enhanced-client.js +20 -2
- package/src/ollama/manager.js +14 -2
- package/src/policy/cli-policy.js +8 -2
- package/src/policy/policy-engine.js +2 -1
- package/src/provenance/model-provenance.js +4 -1
- package/src/ui/cli-theme.js +57 -7
- package/src/ui/interactive-panel.js +176 -20
|
@@ -13,6 +13,21 @@ const si = require('systeminformation');
|
|
|
13
13
|
const { execSync } = require('child_process');
|
|
14
14
|
const { normalizePlatform } = require('../utils/platform');
|
|
15
15
|
|
|
16
|
+
// Curated lookup for recent GPUs whose PCI device id is not yet resolved to a
// model name by the distro pci.ids database (so lspci / systeminformation
// report them as a bare "Device <id>"). Keys are lower-case 4-hex PCI device
// ids. Mapping the device id lets us (a) give the card a real name and
// (b) collapse the multiple raw views of the SAME card that different
// detection sources produce into one inventory entry. Unknown ids degrade
// gracefully to a stable `pci:<id>` match key, so this table only needs the
// newest cards.
//
// The table and every entry are frozen: this is shared, read-only reference
// data, and an accidental write during a detection pass would otherwise leak
// into every later lookup.
const PCI_GPU_MAP = Object.freeze({
    // NVIDIA Blackwell (RTX 50 series, desktop)
    '2f04': Object.freeze({ family: 'rtx5070', type: 'dedicated', name: 'NVIDIA GeForce RTX 5070' }),
    '2c02': Object.freeze({ family: 'rtx5080', type: 'dedicated', name: 'NVIDIA GeForce RTX 5080' }),
    '2b85': Object.freeze({ family: 'rtx5090', type: 'dedicated', name: 'NVIDIA GeForce RTX 5090' }),
    // AMD Raphael / Granite Ridge desktop iGPU (Ryzen 7000/9000 non-G)
    '13c0': Object.freeze({ family: 'amd-raphael-igpu', type: 'integrated', name: 'AMD Radeon Graphics (Raphael)' })
});
|
|
30
|
+
|
|
16
31
|
class UnifiedDetector {
|
|
17
32
|
constructor() {
|
|
18
33
|
this.backends = {
|
|
@@ -613,14 +628,19 @@ class UnifiedDetector {
|
|
|
613
628
|
|
|
614
629
|
const normalized = controllers
|
|
615
630
|
.map((controller) => {
|
|
616
|
-
|
|
631
|
+
let name = String(controller?.model || controller?.name || '').replace(/\s+/g, ' ').trim();
|
|
617
632
|
if (!name || name.toLowerCase() === 'unknown') return null;
|
|
618
633
|
if (this.isRemoteDisplayModel(name)) return null;
|
|
619
634
|
|
|
620
635
|
const nameLower = name.toLowerCase();
|
|
621
636
|
if (nameLower.includes('microsoft basic') || nameLower.includes('standard vga')) return null;
|
|
622
637
|
|
|
623
|
-
|
|
638
|
+
// Resolve recent cards that the runtime could only report as a bare
|
|
639
|
+
// "Device <id>" so they get a real name and correct integrated flag.
|
|
640
|
+
const mapped = this.resolveMappedGpu(name) || this.resolveMappedGpu(controller?.deviceId);
|
|
641
|
+
if (mapped) name = mapped.name;
|
|
642
|
+
|
|
643
|
+
const isIntegrated = mapped ? mapped.type === 'integrated' : this.isIntegratedGPUModel(name);
|
|
624
644
|
let vram = isIntegrated
|
|
625
645
|
? this.estimateIntegratedFallbackMemory(controller, memoryInfo)
|
|
626
646
|
: this.normalizeFallbackVRAM(controller?.vram || controller?.memoryTotal || controller?.memory || 0);
|
|
@@ -711,30 +731,55 @@ class UnifiedDetector {
|
|
|
711
731
|
|
|
712
732
|
if (!isNvidia && !isAMD && !isIntel) continue;
|
|
713
733
|
|
|
714
|
-
const
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
.trim();
|
|
734
|
+
const vendorLabel = isNvidia ? 'NVIDIA' : (isAMD ? 'AMD' : 'Intel');
|
|
735
|
+
const pciId = this.extractPciDeviceId(line);
|
|
736
|
+
const mapped = this.resolveMappedGpu(line);
|
|
718
737
|
|
|
738
|
+
// Prefer the resolved model name inside a trailing "[Model] [vvvv:dddd]"
|
|
739
|
+
// pair (e.g. "[GeForce RTX 4060]"). Otherwise clean the raw lspci line
|
|
740
|
+
// down to a readable device string instead of using the whole line.
|
|
719
741
|
const bracketName = line.match(/\[(?![0-9a-f]{4}:[0-9a-f]{4}\])([^\]]+)\]\s*\[[0-9a-f]{4}:[0-9a-f]{4}\]/i);
|
|
720
|
-
|
|
721
|
-
|
|
742
|
+
let name = (bracketName?.[1] || '').replace(/\s+/g, ' ').trim();
|
|
743
|
+
|
|
744
|
+
if (!name) {
|
|
745
|
+
name = line
|
|
746
|
+
.replace(/^[0-9a-f]{2,4}:[0-9a-f]{2}\.[0-9a-f]\s+/i, '') // PCI address
|
|
747
|
+
.replace(/^(?:vga compatible|3d|display)\s+controller\s+\[[0-9a-f]{4}\]:\s*/i, '') // class prefix
|
|
748
|
+
.replace(/\s*\[[0-9a-f]{4}:[0-9a-f]{4}\]/i, '') // [vvvv:dddd]
|
|
749
|
+
.replace(/\s*\(rev\s+[0-9a-f]+\)\s*$/i, '') // (rev xx)
|
|
750
|
+
.replace(/\b(?:corporation|corp\.?|inc\.?|advanced micro devices,?)\b/gi, '')
|
|
751
|
+
.replace(/\[amd\/ati\]/gi, '')
|
|
752
|
+
.replace(/\s+/g, ' ')
|
|
753
|
+
.trim();
|
|
754
|
+
}
|
|
755
|
+
|
|
756
|
+
// If the card could not be resolved to a real model, give it a stable,
|
|
757
|
+
// readable name that carries the PCI id so it dedupes across sources.
|
|
758
|
+
const meaningful = name.replace(/\b(?:nvidia|amd|ati|intel|device|graphics|gpu|controller)\b/gi, '').replace(/[^a-z0-9]/gi, '').trim();
|
|
759
|
+
if (mapped) {
|
|
760
|
+
name = mapped.name;
|
|
761
|
+
} else if (!meaningful) {
|
|
762
|
+
name = pciId ? `${vendorLabel} Device ${pciId.toUpperCase()}` : `${vendorLabel} GPU`;
|
|
763
|
+
}
|
|
722
764
|
|
|
723
|
-
const isIntegrated =
|
|
765
|
+
const isIntegrated = mapped
|
|
766
|
+
? mapped.type === 'integrated'
|
|
767
|
+
: (this.isIntegratedGPUModel(name) || (isIntel && !/\barc\b/i.test(name)));
|
|
724
768
|
let vram = this.estimateFallbackVRAM(name);
|
|
725
769
|
if (isIntegrated) {
|
|
726
770
|
vram = 0;
|
|
727
771
|
}
|
|
728
772
|
|
|
729
|
-
const dedupeKey = `${
|
|
773
|
+
const dedupeKey = `${this.getGpuMatchKey(name)}|${isIntegrated ? 'i' : 'd'}`;
|
|
730
774
|
if (seen.has(dedupeKey)) continue;
|
|
731
775
|
seen.add(dedupeKey);
|
|
732
776
|
|
|
733
777
|
results.push({
|
|
734
778
|
name,
|
|
735
|
-
vendor:
|
|
779
|
+
vendor: vendorLabel,
|
|
736
780
|
type: isIntegrated ? 'integrated' : 'dedicated',
|
|
737
781
|
memory: { total: vram },
|
|
782
|
+
pciId: pciId || null,
|
|
738
783
|
source: 'lspci'
|
|
739
784
|
});
|
|
740
785
|
}
|
|
@@ -746,22 +791,27 @@ class UnifiedDetector {
|
|
|
746
791
|
const num = Number(value);
|
|
747
792
|
if (!Number.isFinite(num) || num <= 0) return 0;
|
|
748
793
|
|
|
749
|
-
//
|
|
750
|
-
|
|
751
|
-
|
|
794
|
+
// Unit inference by magnitude, kept consistent with
|
|
795
|
+
// HardwareDetector.normalizeVRAM so both detection paths agree:
|
|
796
|
+
//
|
|
797
|
+
// > 1e6 -> raw bytes.
|
|
798
|
+
// >= 1024 -> megabytes (systeminformation reporting range).
|
|
799
|
+
// 1 <= v <= 256 -> already gigabytes. The previous "1..80 means GB"
|
|
800
|
+
// band silently returned 0 for legitimate large GB
|
|
801
|
+
// values, so normalizeFallbackVRAM(192) was 0 — the
|
|
802
|
+
// 192 GB box in issue #88 collapsed to nothing. A
|
|
803
|
+
// single GPU realistically tops out around 192 GB.
|
|
804
|
+
// 257 <= v < 1024 -> sub-gigabyte framebuffer in MB -> rounds to 0/1 GB.
|
|
805
|
+
if (num > 1_000_000) {
|
|
806
|
+
return Math.max(0, Math.round(num / (1024 * 1024 * 1024))); // bytes -> GB
|
|
752
807
|
}
|
|
753
|
-
|
|
754
|
-
// MB -> GB
|
|
755
808
|
if (num >= 1024) {
|
|
756
|
-
return Math.round(num / 1024);
|
|
809
|
+
return Math.max(0, Math.round(num / 1024)); // MB -> GB
|
|
757
810
|
}
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
if (num >= 1 && num <= 80) {
|
|
761
|
-
return Math.round(num);
|
|
811
|
+
if (num <= 256) {
|
|
812
|
+
return Math.round(num); // already GB (plausible single-GPU range)
|
|
762
813
|
}
|
|
763
|
-
|
|
764
|
-
return 0;
|
|
814
|
+
return Math.max(0, Math.round(num / 1024)); // 257..1023 MB -> GB
|
|
765
815
|
}
|
|
766
816
|
|
|
767
817
|
isIntegratedGPUModel(model) {
|
|
@@ -799,6 +849,21 @@ class UnifiedDetector {
|
|
|
799
849
|
if (lower.includes('rx 6900') || lower.includes('rx 6800')) return 16;
|
|
800
850
|
if (lower.includes('rx 6700')) return 12;
|
|
801
851
|
|
|
852
|
+
// NVIDIA workstation / datacenter (Blackwell / Ada / Hopper / Ampere).
|
|
853
|
+
// Matched BEFORE the consumer RTX entries and the generic fallbacks so a
|
|
854
|
+
// high-VRAM professional card is not collapsed to a consumer-tier value or
|
|
855
|
+
// 0 (issue #88: dual "RTX PRO 6000" must reach ~192GB total, not ~16GB).
|
|
856
|
+
if (lower.includes('rtx pro 6000') || lower.includes('rtx 6000 blackwell')) return 96;
|
|
857
|
+
if (lower.includes('rtx 6000 ada') || lower.includes('rtx 5000 ada')) return 48;
|
|
858
|
+
if (lower.includes('rtx a6000') || lower.includes('a6000')) return 48;
|
|
859
|
+
if (lower.includes('rtx a5000') || lower.includes('a5000')) return 24;
|
|
860
|
+
if (lower.includes('l40s') || lower.includes('l40')) return 48;
|
|
861
|
+
if (lower.includes('h200')) return 141;
|
|
862
|
+
if (lower.includes('h100')) return 80;
|
|
863
|
+
if (lower.includes('a100') && (lower.includes('40gb') || /a100[\s-]?(?:pcie[\s-]?)?40\b/.test(lower))) return 40;
|
|
864
|
+
if (lower.includes('a100')) return 80; // A100 defaults to the 80GB SKU
|
|
865
|
+
if (lower.includes('a40')) return 48;
|
|
866
|
+
|
|
802
867
|
if (lower.includes('rtx 5090')) return 32;
|
|
803
868
|
if (lower.includes('rtx 4090') || lower.includes('rtx 3090')) return 24;
|
|
804
869
|
if (lower.includes('rtx 5080') || lower.includes('rtx 4080')) return 16;
|
|
@@ -817,6 +882,15 @@ class UnifiedDetector {
|
|
|
817
882
|
return `${familyMatch[1]}${familyMatch[2]}`;
|
|
818
883
|
}
|
|
819
884
|
|
|
885
|
+
// Different detection sources describe an unresolved card in different
|
|
886
|
+
// ways for the SAME hardware, e.g. systeminformation "Device 2f04" and
|
|
887
|
+
// lspci "...Device [10de:2f04]". Key on the PCI device id (mapped to a
|
|
888
|
+
// canonical family when known) so those collapse to one inventory entry.
|
|
889
|
+
const pciId = this.extractPciDeviceId(name);
|
|
890
|
+
if (pciId) {
|
|
891
|
+
return (PCI_GPU_MAP[pciId] && PCI_GPU_MAP[pciId].family) || `pci:${pciId}`;
|
|
892
|
+
}
|
|
893
|
+
|
|
820
894
|
const concise = lower
|
|
821
895
|
.replace(/nvidia|amd|ati|intel|corporation|geforce|radeon|graphics/g, '')
|
|
822
896
|
.replace(/\s+/g, ' ')
|
|
@@ -825,6 +899,26 @@ class UnifiedDetector {
|
|
|
825
899
|
return concise || lower;
|
|
826
900
|
}
|
|
827
901
|
|
|
902
|
+
/**
|
|
903
|
+
* Extract a 4-hex PCI device id from a GPU name/description, handling both the
|
|
904
|
+
* lspci "[vendor:device]" form and the bare "Device <id>" form that
|
|
905
|
+
* systeminformation emits for cards it cannot name. Returns null when none.
|
|
906
|
+
*/
|
|
907
|
+
extractPciDeviceId(text) {
|
|
908
|
+
const value = String(text || '');
|
|
909
|
+
const bracket = value.match(/\[[0-9a-f]{4}:([0-9a-f]{4})\]/i);
|
|
910
|
+
if (bracket) return bracket[1].toLowerCase();
|
|
911
|
+
const bare = value.match(/\bdevice\s+([0-9a-f]{4})\b/i);
|
|
912
|
+
if (bare) return bare[1].toLowerCase();
|
|
913
|
+
return null;
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
/** Look up a curated mapping for a recent card by PCI device id (or null). */
|
|
917
|
+
resolveMappedGpu(text) {
|
|
918
|
+
const pciId = this.extractPciDeviceId(text);
|
|
919
|
+
return pciId && PCI_GPU_MAP[pciId] ? { pciId, ...PCI_GPU_MAP[pciId] } : null;
|
|
920
|
+
}
|
|
921
|
+
|
|
828
922
|
/**
|
|
829
923
|
* Generate hardware fingerprint for benchmarks
|
|
830
924
|
*/
|
|
@@ -879,9 +973,19 @@ class UnifiedDetector {
|
|
|
879
973
|
summary.bestBackend === 'metal' ||
|
|
880
974
|
(summary.hasIntegratedGPU && !summary.hasDedicatedGPU && summary.integratedSharedMemory > 0)
|
|
881
975
|
) {
|
|
882
|
-
|
|
976
|
+
const effectiveMemory = Number(summary.effectiveMemory);
|
|
977
|
+
if (!Number.isFinite(effectiveMemory) || effectiveMemory <= 0) return false;
|
|
978
|
+
return sizeGB <= (effectiveMemory - 2);
|
|
883
979
|
} else {
|
|
884
|
-
const
|
|
980
|
+
const totalVRAM = Number(summary.totalVRAM);
|
|
981
|
+
if (!Number.isFinite(totalVRAM) || totalVRAM <= 0) return false;
|
|
982
|
+
|
|
983
|
+
// Guard the per-GPU divisor: gpuCount can be 0 when the summary was
|
|
984
|
+
// built without resolved GPU memory, which previously produced
|
|
985
|
+
// Infinity (totalVRAM / 0) and made any model "fit".
|
|
986
|
+
const gpuCount = Math.max(1, Number(summary.gpuCount) || 0);
|
|
987
|
+
const availableVRAM = useMultiGPU ? totalVRAM : (totalVRAM / gpuCount);
|
|
988
|
+
if (!Number.isFinite(availableVRAM) || availableVRAM <= 0) return false;
|
|
885
989
|
return sizeGB <= (availableVRAM - 2);
|
|
886
990
|
}
|
|
887
991
|
}
|
|
@@ -62,6 +62,25 @@ Respond with JSON only, no additional text.`;
|
|
|
62
62
|
/**
|
|
63
63
|
* Main AI-Check function
|
|
64
64
|
*/
|
|
65
|
+
/** Normalize the --models option (array, or comma/space-separated string) to a list. */
|
|
66
|
+
parseModelFilter(models) {
|
|
67
|
+
if (!models) return [];
|
|
68
|
+
const list = Array.isArray(models) ? models : String(models).split(/[,\s]+/);
|
|
69
|
+
return list.map((m) => String(m).trim().toLowerCase()).filter(Boolean);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/** True when an Ollama DB model matches a user-supplied name fragment. */
|
|
73
|
+
modelMatchesFilter(model, needle) {
|
|
74
|
+
const identifier = String(model?.model_identifier || '').toLowerCase();
|
|
75
|
+
const name = String(model?.model_name || '').toLowerCase();
|
|
76
|
+
return (
|
|
77
|
+
identifier === needle ||
|
|
78
|
+
name === needle ||
|
|
79
|
+
identifier.includes(needle) ||
|
|
80
|
+
name.includes(needle)
|
|
81
|
+
);
|
|
82
|
+
}
|
|
83
|
+
|
|
65
84
|
async aiCheck(options = {}) {
|
|
66
85
|
const {
|
|
67
86
|
category = 'general',
|
|
@@ -90,11 +109,23 @@ Respond with JSON only, no additional text.`;
|
|
|
90
109
|
const budget = hardware.gpu.unified ? hardware.usableMemGB :
|
|
91
110
|
(hardware.gpu.vramGB || hardware.usableMemGB);
|
|
92
111
|
|
|
93
|
-
//
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
112
|
+
// Optional explicit model filter (--models qwen2.5,llama3.1). When present
|
|
113
|
+
// it overrides the category filter: the user asked for specific models.
|
|
114
|
+
const modelFilter = this.parseModelFilter(options.models);
|
|
115
|
+
let categoryModels;
|
|
116
|
+
if (modelFilter.length > 0) {
|
|
117
|
+
categoryModels = allOllamaModels.filter((model) =>
|
|
118
|
+
modelFilter.some((needle) => this.modelMatchesFilter(model, needle))
|
|
119
|
+
);
|
|
120
|
+
if (!silent) {
|
|
121
|
+
console.log(chalk.cyan('│') + ` Restricted to ${categoryModels.length} model(s) matching --models`);
|
|
122
|
+
}
|
|
123
|
+
} else {
|
|
124
|
+
// Filter models by category first
|
|
125
|
+
categoryModels = this.filterOllamaModelsByCategory(allOllamaModels, category);
|
|
126
|
+
if (!silent) {
|
|
127
|
+
console.log(chalk.cyan('│') + ` ${categoryModels.length} models match ${category} category`);
|
|
128
|
+
}
|
|
98
129
|
}
|
|
99
130
|
|
|
100
131
|
// Evaluate each model using deterministic scoring
|
|
@@ -1556,10 +1556,21 @@ class DeterministicModelSelector {
|
|
|
1556
1556
|
const S = speedEstimate.score;
|
|
1557
1557
|
const F = this.calculateFitScore(requiredGB, budget);
|
|
1558
1558
|
const C = this.calculateContextScore(model, targetCtx);
|
|
1559
|
+
const capacityAdjustment = this.calculateHighCapacitySizeAdjustment(
|
|
1560
|
+
hardware,
|
|
1561
|
+
model,
|
|
1562
|
+
budget,
|
|
1563
|
+
category,
|
|
1564
|
+
optimizeFor
|
|
1565
|
+
);
|
|
1559
1566
|
|
|
1560
1567
|
// 4. Calculate final weighted score
|
|
1561
1568
|
const weights = this.getScoringWeights(category, optimizeFor);
|
|
1562
|
-
const
|
|
1569
|
+
const weightedScore = Q * weights[0] + S * weights[1] + F * weights[2] + C * weights[3];
|
|
1570
|
+
const score = Math.max(
|
|
1571
|
+
0,
|
|
1572
|
+
Math.min(100, Math.round((weightedScore + capacityAdjustment.score) * 10) / 10)
|
|
1573
|
+
);
|
|
1563
1574
|
|
|
1564
1575
|
// 5. Build rationale
|
|
1565
1576
|
const rationale = this.buildRationale(
|
|
@@ -1572,7 +1583,8 @@ class DeterministicModelSelector {
|
|
|
1572
1583
|
Q,
|
|
1573
1584
|
S,
|
|
1574
1585
|
memoryEstimate,
|
|
1575
|
-
speedEstimate
|
|
1586
|
+
speedEstimate,
|
|
1587
|
+
capacityAdjustment
|
|
1576
1588
|
);
|
|
1577
1589
|
|
|
1578
1590
|
return {
|
|
@@ -1599,7 +1611,8 @@ class DeterministicModelSelector {
|
|
|
1599
1611
|
runtime: speedEstimate.runtime,
|
|
1600
1612
|
moe: speedEstimate.moe
|
|
1601
1613
|
},
|
|
1602
|
-
components: { Q, S, F, C }
|
|
1614
|
+
components: { Q, S, F, C, H: capacityAdjustment.score },
|
|
1615
|
+
optimizeFor
|
|
1603
1616
|
};
|
|
1604
1617
|
}
|
|
1605
1618
|
|
|
@@ -1858,6 +1871,9 @@ class DeterministicModelSelector {
|
|
|
1858
1871
|
if (hardware.cpu.cores >= 8) base *= 1.1;
|
|
1859
1872
|
if (hardware.acceleration.supports_metal || hardware.acceleration.supports_cuda) base *= 1.2;
|
|
1860
1873
|
|
|
1874
|
+
const acceleratorScale = this.calculateAcceleratorSpeedScale(hardware, backend);
|
|
1875
|
+
base *= acceleratorScale.multiplier;
|
|
1876
|
+
|
|
1861
1877
|
const normalizedRuntime = normalizeMoERuntime(runtime);
|
|
1862
1878
|
const moe = estimateMoESpeedMultiplier({
|
|
1863
1879
|
model,
|
|
@@ -1880,7 +1896,46 @@ class DeterministicModelSelector {
|
|
|
1880
1896
|
estimatedTPS,
|
|
1881
1897
|
score,
|
|
1882
1898
|
runtime: normalizedRuntime,
|
|
1883
|
-
moe
|
|
1899
|
+
moe,
|
|
1900
|
+
acceleratorScale
|
|
1901
|
+
};
|
|
1902
|
+
}
|
|
1903
|
+
|
|
1904
|
+
calculateAcceleratorSpeedScale(hardware = {}, backend = 'cpu_x86') {
|
|
1905
|
+
if (backend !== 'cuda' && backend !== 'metal') {
|
|
1906
|
+
return { multiplier: 1, reason: null };
|
|
1907
|
+
}
|
|
1908
|
+
|
|
1909
|
+
const gpu = hardware.gpu || {};
|
|
1910
|
+
const memory = hardware.memory || {};
|
|
1911
|
+
const toFiniteNumber = (value, fallback = 0) => {
|
|
1912
|
+
const parsed = Number(value);
|
|
1913
|
+
return Number.isFinite(parsed) ? parsed : fallback;
|
|
1914
|
+
};
|
|
1915
|
+
const vramGB = toFiniteNumber(gpu.vramGB ?? gpu.vram ?? gpu.totalVRAM, 0);
|
|
1916
|
+
const ramGB = toFiniteNumber(memory.totalGB ?? memory.total, 0);
|
|
1917
|
+
const acceleratorMemoryGB = backend === 'metal' && Boolean(gpu.unified)
|
|
1918
|
+
? Math.max(vramGB, ramGB)
|
|
1919
|
+
: vramGB;
|
|
1920
|
+
const gpuCount = Math.max(1, toFiniteNumber(gpu.gpuCount ?? gpu.count, 1));
|
|
1921
|
+
|
|
1922
|
+
let multiplier = 1;
|
|
1923
|
+
if (acceleratorMemoryGB >= 160) multiplier *= 3.2;
|
|
1924
|
+
else if (acceleratorMemoryGB >= 96) multiplier *= 2.6;
|
|
1925
|
+
else if (acceleratorMemoryGB >= 80) multiplier *= 2.2;
|
|
1926
|
+
else if (acceleratorMemoryGB >= 48) multiplier *= 1.7;
|
|
1927
|
+
else if (acceleratorMemoryGB >= 24) multiplier *= 1.15;
|
|
1928
|
+
|
|
1929
|
+
if (backend === 'cuda' && gpuCount > 1) {
|
|
1930
|
+
multiplier *= Math.min(1.8, 1 + ((gpuCount - 1) * 0.25));
|
|
1931
|
+
}
|
|
1932
|
+
|
|
1933
|
+
const rounded = Math.round(multiplier * 100) / 100;
|
|
1934
|
+
return {
|
|
1935
|
+
multiplier: rounded,
|
|
1936
|
+
reason: rounded > 1
|
|
1937
|
+
? `${backend.toUpperCase()} capacity x${rounded}`
|
|
1938
|
+
: null
|
|
1884
1939
|
};
|
|
1885
1940
|
}
|
|
1886
1941
|
|
|
@@ -1888,13 +1943,79 @@ class DeterministicModelSelector {
|
|
|
1888
1943
|
const ratio = requiredGB / budgetGB;
|
|
1889
1944
|
if (ratio <= 0.9) return 100;
|
|
1890
1945
|
if (ratio <= 1.0) return 70;
|
|
1891
|
-
return 0; //
|
|
1946
|
+
return 0; // Unreachable in practice: evaluateModel drops requiredGB > budget.
|
|
1892
1947
|
}
|
|
1893
1948
|
|
|
1894
1949
|
calculateContextScore(model, targetCtx) {
|
|
1895
|
-
|
|
1896
|
-
if (
|
|
1897
|
-
|
|
1950
|
+
const ctxMax = Number(model?.ctxMax) || 0;
|
|
1951
|
+
if (ctxMax >= targetCtx) return 100;
|
|
1952
|
+
if (ctxMax >= targetCtx * 0.5) return 70;
|
|
1953
|
+
// Context is NOT pre-filtered: a model that cannot serve the requested
|
|
1954
|
+
// context still scores here (0 for this component) and stays eligible,
|
|
1955
|
+
// weighted down rather than excluded.
|
|
1956
|
+
return 0;
|
|
1957
|
+
}
|
|
1958
|
+
|
|
1959
|
+
getHighCapacitySizeTarget(budgetGB, hardware = {}) {
|
|
1960
|
+
if (!Number.isFinite(budgetGB) || budgetGB < 32) return null;
|
|
1961
|
+
|
|
1962
|
+
const isMultiGPU = Boolean(hardware?.gpu?.isMultiGPU);
|
|
1963
|
+
if (budgetGB >= 128) return { minParamsB: 30, sweetSpotParamsB: 70 };
|
|
1964
|
+
if (budgetGB >= 80) return { minParamsB: 30, sweetSpotParamsB: 70 };
|
|
1965
|
+
if (budgetGB >= 48) return { minParamsB: 20, sweetSpotParamsB: 34 };
|
|
1966
|
+
if (budgetGB >= 32 && isMultiGPU) return { minParamsB: 30, sweetSpotParamsB: 30 };
|
|
1967
|
+
if (budgetGB >= 32) return { minParamsB: 13, sweetSpotParamsB: 30 };
|
|
1968
|
+
return null;
|
|
1969
|
+
}
|
|
1970
|
+
|
|
1971
|
+
calculateHighCapacitySizeAdjustment(hardware, model, budgetGB, category, optimizeFor = 'balanced') {
|
|
1972
|
+
const objective = this.normalizeOptimizationObjective(optimizeFor);
|
|
1973
|
+
if (objective === 'speed' || category === 'embeddings') {
|
|
1974
|
+
return { score: 0, reason: null };
|
|
1975
|
+
}
|
|
1976
|
+
|
|
1977
|
+
const normalizedHardware = this.normalizeHardwareProfile(hardware || {});
|
|
1978
|
+
const tier = this.mapHardwareTier(normalizedHardware);
|
|
1979
|
+
const highCapacityTiers = new Set(['very_high', 'ultra_high', 'extreme', 'flagship']);
|
|
1980
|
+
const target = this.getHighCapacitySizeTarget(budgetGB, normalizedHardware);
|
|
1981
|
+
const hasHighCapacitySignal =
|
|
1982
|
+
Boolean(target) ||
|
|
1983
|
+
highCapacityTiers.has(tier) ||
|
|
1984
|
+
Number(normalizedHardware?.gpu?.vramGB || 0) >= 48;
|
|
1985
|
+
|
|
1986
|
+
if (!hasHighCapacitySignal || !target) {
|
|
1987
|
+
return { score: 0, reason: null };
|
|
1988
|
+
}
|
|
1989
|
+
|
|
1990
|
+
const params = this.parseBillionsValue(model?.paramsB);
|
|
1991
|
+
if (!Number.isFinite(params) || params <= 0) {
|
|
1992
|
+
return { score: 0, reason: null };
|
|
1993
|
+
}
|
|
1994
|
+
|
|
1995
|
+
const categoryMultiplier = category === 'multimodal' ? 0.6 : 1;
|
|
1996
|
+
if (params < target.minParamsB) {
|
|
1997
|
+
const deficitRatio = (target.minParamsB - params) / target.minParamsB;
|
|
1998
|
+
const penalty = -Math.min(24, deficitRatio * 24) * categoryMultiplier;
|
|
1999
|
+
const roundedPenalty = Math.round(penalty * 10) / 10;
|
|
2000
|
+
return {
|
|
2001
|
+
score: roundedPenalty,
|
|
2002
|
+
reason: `below ${target.minParamsB}B high-capacity floor`
|
|
2003
|
+
};
|
|
2004
|
+
}
|
|
2005
|
+
|
|
2006
|
+
const distanceRatio = Math.min(
|
|
2007
|
+
1,
|
|
2008
|
+
Math.abs(params - target.sweetSpotParamsB) / target.sweetSpotParamsB
|
|
2009
|
+
);
|
|
2010
|
+
const bonus = Math.max(0, 12 * (1 - distanceRatio)) * categoryMultiplier;
|
|
2011
|
+
const roundedBonus = Math.round(bonus * 10) / 10;
|
|
2012
|
+
|
|
2013
|
+
return {
|
|
2014
|
+
score: roundedBonus,
|
|
2015
|
+
reason: roundedBonus > 0
|
|
2016
|
+
? `${target.sweetSpotParamsB}B high-capacity target`
|
|
2017
|
+
: null
|
|
2018
|
+
};
|
|
1898
2019
|
}
|
|
1899
2020
|
|
|
1900
2021
|
estimatePracticalMaxParamsForBudget(budgetGB) {
|
|
@@ -1994,7 +2115,19 @@ class DeterministicModelSelector {
|
|
|
1994
2115
|
return highCapacityPromoted;
|
|
1995
2116
|
}
|
|
1996
2117
|
|
|
1997
|
-
buildRationale(
|
|
2118
|
+
buildRationale(
|
|
2119
|
+
hardware,
|
|
2120
|
+
model,
|
|
2121
|
+
quant,
|
|
2122
|
+
requiredGB,
|
|
2123
|
+
budget,
|
|
2124
|
+
category,
|
|
2125
|
+
Q,
|
|
2126
|
+
S,
|
|
2127
|
+
memoryEstimate = null,
|
|
2128
|
+
speedEstimate = null,
|
|
2129
|
+
capacityAdjustment = null
|
|
2130
|
+
) {
|
|
1998
2131
|
const parts = [];
|
|
1999
2132
|
|
|
2000
2133
|
// Memory fit
|
|
@@ -2027,6 +2160,14 @@ class DeterministicModelSelector {
|
|
|
2027
2160
|
const multiplier = Number(speedEstimate.moe.multiplier || 1).toFixed(2);
|
|
2028
2161
|
parts.push(`MoE speed x${multiplier} (${runtimeLabel})`);
|
|
2029
2162
|
}
|
|
2163
|
+
|
|
2164
|
+
if (speedEstimate?.acceleratorScale?.multiplier > 1) {
|
|
2165
|
+
parts.push(speedEstimate.acceleratorScale.reason);
|
|
2166
|
+
}
|
|
2167
|
+
|
|
2168
|
+
if (capacityAdjustment?.reason) {
|
|
2169
|
+
parts.push(capacityAdjustment.reason);
|
|
2170
|
+
}
|
|
2030
2171
|
|
|
2031
2172
|
// Size sweet spot
|
|
2032
2173
|
if (model.paramsB >= 7 && model.paramsB <= 13) {
|
|
@@ -2114,14 +2255,21 @@ class DeterministicModelSelector {
|
|
|
2114
2255
|
|
|
2115
2256
|
updateCandidateWithMeasuredSpeed(candidate, measuredTPS, category) {
|
|
2116
2257
|
const normalizedS = this.normalizeTPSToScore(measuredTPS, category);
|
|
2117
|
-
|
|
2118
|
-
//
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2258
|
+
|
|
2259
|
+
// Re-score with the measured speed using the SAME weighting source as
|
|
2260
|
+
// evaluateModel: getScoringWeights honours the user's optimizeFor profile and
|
|
2261
|
+
// falls back to the general weights for categories (e.g. 'talking') that have
|
|
2262
|
+
// no entry in DETERMINISTIC_WEIGHTS — indexing this.categoryWeights[category]
|
|
2263
|
+
// directly threw a TypeError for those. We also re-add the stored capacity
|
|
2264
|
+
// adjustment (H) and clamp, so a probed score stays comparable to a
|
|
2265
|
+
// non-probed one instead of being silently lower.
|
|
2266
|
+
const weights = this.getScoringWeights(category, candidate.optimizeFor || 'balanced');
|
|
2267
|
+
const { Q, F, C, H = 0 } = candidate.components;
|
|
2268
|
+
|
|
2122
2269
|
candidate.estTPS = measuredTPS;
|
|
2123
2270
|
candidate.components.S = normalizedS;
|
|
2124
|
-
|
|
2271
|
+
const weighted = Q * weights[0] + normalizedS * weights[1] + F * weights[2] + C * weights[3];
|
|
2272
|
+
candidate.score = Math.max(0, Math.min(100, Math.round((weighted + H) * 10) / 10));
|
|
2125
2273
|
}
|
|
2126
2274
|
|
|
2127
2275
|
normalizeTPSToScore(tps, category) {
|
|
@@ -1007,18 +1007,22 @@ class ExpandedModelsDatabase {
|
|
|
1007
1007
|
}
|
|
1008
1008
|
|
|
1009
1009
|
estimateMemoryUsage(model) {
|
|
1010
|
-
|
|
1010
|
+
// Derive footprint from parameter count, not by stripping the unit off the
|
|
1011
|
+
// size string and treating the bare number as gigabytes — that read a 774M
|
|
1012
|
+
// model ("774M") as ~774 GB and a 22M model as ~22 GB. ~0.7 GB per 1B params
|
|
1013
|
+
// is a reasonable quantized-runtime footprint baseline.
|
|
1014
|
+
const sizeGB = this.extractModelParams(model) * 0.7;
|
|
1011
1015
|
|
|
1012
1016
|
// Rough estimates including model loading overhead
|
|
1013
1017
|
return {
|
|
1014
|
-
minimal: Math.round(sizeGB * 1.2), // With quantization
|
|
1015
|
-
typical: Math.round(sizeGB * 1.5), // Standard loading
|
|
1016
|
-
maximum: Math.round(sizeGB * 2.0) // With full context
|
|
1018
|
+
minimal: Math.max(1, Math.round(sizeGB * 1.2)), // With quantization
|
|
1019
|
+
typical: Math.max(1, Math.round(sizeGB * 1.5)), // Standard loading
|
|
1020
|
+
maximum: Math.max(1, Math.round(sizeGB * 2.0)) // With full context
|
|
1017
1021
|
};
|
|
1018
1022
|
}
|
|
1019
1023
|
|
|
1020
1024
|
estimatePowerConsumption(model, hardware) {
|
|
1021
|
-
const sizeGB =
|
|
1025
|
+
const sizeGB = this.extractModelParams(model) * 0.7;
|
|
1022
1026
|
const tier = this.getHardwareTier(hardware);
|
|
1023
1027
|
|
|
1024
1028
|
const basePower = {
|