llm-checker 3.4.2 → 3.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -10
- package/analyzer/performance.js +40 -94
- package/bin/enhanced_cli.js +320 -254
- package/bin/mcp-server.mjs +0 -0
- package/package.json +1 -1
- package/src/models/ai-check-selector.js +2 -2
- package/src/models/deterministic-selector.js +1 -0
- package/src/models/expanded_database.js +10 -83
- package/src/ollama/client.js +29 -4
- package/src/ui/cli-theme.js +733 -0
- package/src/ui/interactive-panel.js +599 -0
- package/src/utils/fetch.js +17 -0
- package/src/utils/token-speed-estimator.js +207 -0
- package/src/ollama/gpu-placement-planner.js +0 -496
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
/**
 * Coerce a value into a finite number.
 *
 * Accepts finite numbers as-is and numeric strings (whitespace-trimmed);
 * everything else — NaN, Infinity, empty/unparseable strings, null,
 * undefined, objects — yields `fallback`.
 *
 * @param {*} value - Candidate value to coerce.
 * @param {number} [fallback=0] - Returned when coercion fails.
 * @returns {number} A finite number, or `fallback`.
 */
function toNumber(value, fallback = 0) {
  if (typeof value === 'number') {
    // Reject NaN/Infinity; only finite numbers pass through untouched.
    return Number.isFinite(value) ? value : fallback;
  }
  if (typeof value !== 'string') {
    return fallback;
  }
  const text = value.trim();
  if (text === '') {
    return fallback;
  }
  const numeric = Number(text);
  return Number.isFinite(numeric) ? numeric : fallback;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Normalize a raw model-size input (billions of parameters) to a usable value.
 *
 * Non-numeric input defaults to 1B, and the result is floored at 0.7B so
 * downstream size-scaling math never divides by a vanishing model size.
 *
 * @param {*} rawModelSizeB - Model size in billions of parameters.
 * @returns {number} Model size in billions, always >= 0.7.
 */
function normalizeModelSizeB(rawModelSizeB) {
  return Math.max(0.7, toNumber(rawModelSizeB, 1));
}
|
|
17
|
+
|
|
18
|
+
/**
 * Heuristically detect an Apple Silicon (M1–M4) machine from hardware strings.
 *
 * Matches "apple silicon", a standalone "m1".."m4" token, or an explicit
 * "apple mN" substring (covers cases like "Apple M1," where punctuation
 * defeats the standalone-token regex).
 *
 * @param {string} architecture - CPU architecture string (e.g. "arm64").
 * @param {string} cpuModel - CPU brand/model string.
 * @param {string} gpuModel - GPU model string.
 * @returns {boolean} True when the combined strings indicate Apple Silicon.
 */
function detectAppleSilicon(architecture, cpuModel, gpuModel) {
  const haystack = `${architecture} ${cpuModel} ${gpuModel}`.toLowerCase();
  if (haystack.includes('apple silicon')) {
    return true;
  }
  if (/(^|\s)m[1-4](\s|$)/.test(haystack)) {
    return true;
  }
  return ['apple m1', 'apple m2', 'apple m3', 'apple m4'].some(
    (tag) => haystack.includes(tag)
  );
}
|
|
29
|
+
|
|
30
|
+
// Marker patterns for integrated/iGPU parts (Intel Iris/UHD, AMD Vega/Radeon
// Graphics, or anything explicitly labelled "integrated").
const INTEGRATED_GPU_PATTERN = /iris.*xe|iris.*graphics|uhd.*graphics|vega.*integrated|radeon.*graphics|intel.*integrated|integrated/i;

/**
 * Decide whether a GPU model string describes an integrated GPU.
 *
 * @param {string} [gpuModel=''] - GPU model string.
 * @returns {boolean} True when the model name matches known iGPU markers.
 */
function isIntegratedGPU(gpuModel = '') {
  return INTEGRATED_GPU_PATTERN.test(gpuModel);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Baseline tokens/sec for a 7B model on a given Apple Silicon chip.
 *
 * Tiers are ordered most-specific first so e.g. "m4 ultra" wins over the
 * bare "m4" pattern; unrecognized chips fall back to 24 t/s.
 *
 * @param {string} cpuModel - CPU brand/model string.
 * @param {string} gpuModel - GPU model string.
 * @returns {number} Estimated tokens/sec for a 7B model.
 */
function getAppleSiliconBaseline(cpuModel, gpuModel) {
  const haystack = `${cpuModel} ${gpuModel}`.toLowerCase();
  // [pattern, tokens/sec for a 7B model] — keep most-specific variants first.
  const tiers = [
    [/m4 ultra/, 95],
    [/m4 max/, 65],
    [/m4 pro/, 43],
    [/m4/, 30],
    [/m3 ultra/, 88],
    [/m3 max/, 58],
    [/m3 pro/, 34],
    [/m3/, 27],
    [/m2 ultra/, 80],
    [/m2 max/, 52],
    [/m2 pro/, 32],
    [/m2/, 24],
    [/m1 ultra/, 72],
    [/m1 max/, 48],
    [/m1 pro/, 30],
    [/m1/, 22]
  ];

  const hit = tiers.find(([pattern]) => pattern.test(haystack));
  return hit ? hit[1] : 24;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Baseline tokens/sec for a 7B model on a dedicated GPU.
 *
 * Known GPU families are matched by name first; otherwise the estimate is
 * tiered by VRAM capacity, bottoming out at 14 t/s.
 *
 * @param {string} gpuModel - GPU model string (may be empty).
 * @param {number} vramGB - GPU memory in GB, used when no name matches.
 * @returns {number} Estimated tokens/sec for a 7B model.
 */
function getDedicatedGPUBaseline(gpuModel, vramGB) {
  const haystack = (gpuModel || '').toLowerCase();
  // [pattern, tokens/sec for a 7B model] — named families take precedence.
  const namedTiers = [
    [/h100/, 170],
    [/a100/, 130],
    [/rtx 50|rtx50|blackwell/, 105],
    [/rtx 4090/, 80],
    [/rtx 4080/, 65],
    [/rtx 3090/, 62],
    [/rtx 3080/, 50],
    [/rtx 3070/, 42],
    [/rtx 3060/, 34],
    [/rtx 20/, 26],
    [/rx 79|7900|7800/, 52],
    [/rx 69|6800/, 42]
  ];

  const named = namedTiers.find(([pattern]) => pattern.test(haystack));
  if (named) {
    return named[1];
  }

  // Unknown model: fall back to a VRAM-based estimate.
  const vramTiers = [
    [24, 60],
    [16, 48],
    [12, 40],
    [8, 30],
    [4, 18]
  ];
  for (const [minVRAM, tps] of vramTiers) {
    if (vramGB >= minVRAM) {
      return tps;
    }
  }
  return 14;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Heuristic AVX-512 detection from a CPU model string.
 *
 * True when the string mentions "avx512" outright, or names an Intel
 * 12th/13th/14th-gen part. NOTE(review): this is a name-based heuristic
 * only — some of those Intel generations ship with AVX-512 fused off, so
 * treat the result as a best guess, not a capability probe.
 *
 * @param {string} [cpuModel=''] - CPU brand/model string.
 * @returns {boolean} True when AVX-512 support is assumed.
 */
function hasAVX512(cpuModel = '') {
  const signal = cpuModel.toLowerCase();
  if (signal.includes('avx512')) {
    return true;
  }
  if (!signal.includes('intel')) {
    return false;
  }
  return ['12th', '13th', '14th'].some((generation) => signal.includes(generation));
}
|
|
100
|
+
|
|
101
|
+
/**
 * Heuristic AVX2 detection from a CPU model string.
 *
 * Optimistically assumes any Intel or AMD part supports AVX2 (true for
 * virtually all desktop/server silicon since ~2013), or an explicit "avx2"
 * mention.
 *
 * @param {string} [cpuModel=''] - CPU brand/model string.
 * @returns {boolean} True when AVX2 support is assumed.
 */
function hasAVX2(cpuModel = '') {
  const signal = cpuModel.toLowerCase();
  return ['avx2', 'intel', 'amd'].some((marker) => signal.includes(marker));
}
|
|
105
|
+
|
|
106
|
+
/**
 * Baseline tokens/sec for a 7B model running on CPU (optionally iGPU-assisted).
 *
 * The estimate is additive: a floor of 4 t/s, plus per-core and per-GHz
 * bonuses, plus a SIMD bonus (AVX-512 > AVX2) and a small integrated-GPU
 * assist bonus. The result is clamped to [3, 18] (or [3, 22] with iGPU
 * assist).
 *
 * @param {string} cpuModel - CPU brand/model string (for SIMD heuristics).
 * @param {number} cores - Physical core count.
 * @param {number} baseSpeedGHz - Base clock in GHz (floored at 1.5 here).
 * @param {boolean} integratedGpuAssist - True when an iGPU can offload work.
 * @returns {number} Estimated tokens/sec for a 7B model.
 */
function getCPUBaseline(cpuModel, cores, baseSpeedGHz, integratedGpuAssist) {
  const coreBonus = cores * 0.6;
  const clockBonus = (Math.max(1.5, baseSpeedGHz) - 2.0) * 2.5;

  // AVX-512 beats AVX2; only one SIMD bonus applies.
  let simdBonus = 0;
  if (hasAVX512(cpuModel)) {
    simdBonus = 3;
  } else if (hasAVX2(cpuModel)) {
    simdBonus = 1.5;
  }

  const assistBonus = integratedGpuAssist ? 1 : 0;
  const ceiling = integratedGpuAssist ? 22 : 18;

  const baseline = 4 + coreBonus + clockBonus + simdBonus + assistBonus;
  return Math.min(ceiling, Math.max(3, baseline));
}
|
|
117
|
+
|
|
118
|
+
/**
 * Scale factor translating a 7B baseline speed to another model size.
 *
 * Uses a sub-linear power law (exponent 0.72) of the 7B-to-size ratio, so a
 * 14B model runs a bit faster than half the 7B speed. Clamped to
 * [0.18, 2.2] to keep tiny/huge models within plausible bounds.
 *
 * @param {number} modelSizeB - Model size in billions of parameters (> 0).
 * @returns {number} Multiplier applied to the 7B baseline tokens/sec.
 */
function calculateSizeScale(modelSizeB) {
  const raw = (7 / modelSizeB) ** 0.72;
  if (raw < 0.18) {
    return 0.18;
  }
  if (raw > 2.2) {
    return 2.2;
  }
  return raw;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Penalty/bonus factor for how comfortably the model fits in memory.
 *
 * Estimates the working set as 0.75 GB per billion parameters plus 2 GB of
 * overhead, then maps the available/required ratio onto discrete tiers:
 * ample headroom earns a small bonus (1.05), tight fits are penalized down
 * to 0.45.
 *
 * @param {number} modelSizeB - Model size in billions of parameters.
 * @param {number} availableInferenceMemoryGB - Memory usable for inference, GB.
 * @returns {number} Multiplier in {1.05, 1.0, 0.85, 0.65, 0.45}.
 */
function calculateMemoryFactor(modelSizeB, availableInferenceMemoryGB) {
  const workingSetGB = (modelSizeB * 0.75) + 2;
  const headroom = availableInferenceMemoryGB / Math.max(1, workingSetGB);

  // [minimum headroom ratio, factor] — evaluated top-down.
  const tiers = [
    [1.2, 1.05],
    [1.0, 1.0],
    [0.75, 0.85],
    [0.6, 0.65]
  ];
  for (const [threshold, factor] of tiers) {
    if (headroom >= threshold) {
      return factor;
    }
  }
  return 0.45;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Estimate generation speed (tokens/sec) for a model on the given hardware.
 *
 * Classifies the machine as Apple Silicon ("metal"), dedicated GPU ("gpu"),
 * integrated GPU ("integrated"), or plain CPU ("cpu"), picks a 7B baseline
 * for that class, then scales it by model size and by how well the model
 * fits in the memory available for inference.
 *
 * NOTE(review): memory/VRAM fields are read through several alternative key
 * names (`total`, `totalGB`, `memory_gb`, `vram`, …) and are assumed to be
 * in GB — confirm against the hardware-detection callers.
 *
 * @param {object} [hardware={}] - Detected hardware (cpu/gpu/memory shapes).
 * @param {object} [options={}] - `{ modelSizeB }`: model size in billions.
 * @returns {{tokensPerSecond: number, backend: string, baselineTPS7B: number,
 *           sizeScale: number, memoryFactor: number, modelSizeB: number}}
 *          Rounded estimate plus the intermediate factors for transparency.
 */
function estimateTokenSpeedFromHardware(hardware = {}, options = {}) {
  const cpu = hardware.cpu ?? {};
  const gpu = hardware.gpu ?? {};

  const cpuModel = String(cpu.brand || cpu.model || '');
  const gpuModel = String(gpu.model || '');
  const architecture = String(cpu.architecture || '');

  const modelSizeB = normalizeModelSizeB(options.modelSizeB);
  const cores = Math.max(1, toNumber(cpu.physicalCores || cpu.cores, 1));
  const baseSpeedGHz = Math.max(1.5, toNumber(cpu.speed || cpu.speedMax, 2.4));

  // Memory fields vary by detector; try the known key spellings in order.
  const memoryTotalGB = Math.max(
    2,
    toNumber(hardware.memory?.total || hardware.memory?.totalGB || hardware.memory_gb, 8)
  );
  const vramGB = Math.max(
    0,
    toNumber(gpu.vram || gpu.vramGB || gpu.totalVRAM || gpu.memory?.total, 0)
  );

  const appleSilicon = detectAppleSilicon(architecture, cpuModel, gpuModel);
  const integrated = isIntegratedGPU(gpuModel);
  // Only count VRAM as "dedicated" when it is not unified (Apple) or shared (iGPU).
  const dedicatedGPU = vramGB > 0 && !integrated && !appleSilicon;

  let profile;
  if (appleSilicon) {
    profile = {
      backend: 'metal',
      baselineTPS7B: getAppleSiliconBaseline(cpuModel, gpuModel),
      // Unified memory: most of system RAM is usable for inference.
      availableInferenceMemoryGB: memoryTotalGB * 0.82
    };
  } else if (dedicatedGPU) {
    profile = {
      backend: 'gpu',
      baselineTPS7B: getDedicatedGPUBaseline(gpuModel, vramGB),
      // VRAM plus a bounded slice of system RAM for spill-over layers.
      availableInferenceMemoryGB: vramGB + Math.min(memoryTotalGB * 0.15, 8)
    };
  } else {
    profile = {
      backend: integrated ? 'integrated' : 'cpu',
      baselineTPS7B: getCPUBaseline(cpuModel, cores, baseSpeedGHz, integrated),
      availableInferenceMemoryGB: memoryTotalGB * 0.65
    };
  }

  const { backend, baselineTPS7B, availableInferenceMemoryGB } = profile;
  const sizeScale = calculateSizeScale(modelSizeB);
  const memoryFactor = calculateMemoryFactor(modelSizeB, availableInferenceMemoryGB);

  // Sanity caps per backend class; CPU-only may drop to 1 t/s.
  const ceilingByBackend = { metal: 140, gpu: 220 };
  const maxTPS = ceilingByBackend[backend] ?? 35;
  const minTPS = backend === 'cpu' ? 1 : 2;

  const rawEstimate = baselineTPS7B * sizeScale * memoryFactor;
  const tokensPerSecond = Math.min(maxTPS, Math.max(minTPS, Math.round(rawEstimate)));

  return {
    tokensPerSecond,
    backend,
    baselineTPS7B: Math.round(baselineTPS7B * 10) / 10,
    sizeScale: Math.round(sizeScale * 1000) / 1000,
    memoryFactor: Math.round(memoryFactor * 1000) / 1000,
    modelSizeB
  };
}
|
|
204
|
+
|
|
205
|
+
// Public API: hardware-aware tokens/sec estimation for a given model size.
module.exports = {
  estimateTokenSpeedFromHardware
};
|
|
@@ -1,496 +0,0 @@
|
|
|
1
|
-
/**
 * Plans how Ollama models should be placed across available GPUs (or CPU).
 *
 * Given detected hardware and a list of models, `plan()` simulates three
 * placement strategies per model — "pin" (one GPU), "replica" (independent
 * copies per GPU), and "spread" (one model sharded across GPUs) — estimates
 * memory use, tokens/sec and risk for each, and recommends one according to
 * an objective ("latency" | "balanced" | "throughput").
 *
 * All memory/speed figures are heuristic estimates, not measurements.
 */
class OllamaGPUPlacementPlanner {
  /**
   * @param {object} [options={}] - Tuning knobs; all optional.
   *   minContext: smallest context (tokens) assumed when none is given.
   *   defaultReserveGB: memory held back from GPUs for the system.
   *   kvFactorPer4k: KV-cache GB per 1B params at 4K context.
   *   modelOverheadGB: fixed per-model runtime overhead, GB.
   *   spreadOverheadGB: extra per-GPU overhead when sharding, GB.
   */
  constructor(options = {}) {
    this.minContext = options.minContext || 2048;
    this.defaultReserveGB = options.defaultReserveGB || 1;
    this.kvFactorPer4k = options.kvFactorPer4k || 0.08; // GB per 1B params at 4K context
    this.modelOverheadGB = options.modelOverheadGB || 0.7;
    this.spreadOverheadGB = options.spreadOverheadGB || 0.35;
  }

  // Coerce to a finite number, or `fallback` when Number() yields NaN/Infinity.
  toFiniteNumber(value, fallback = 0) {
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : fallback;
  }

  // Clamp `value` into [min, max].
  clamp(value, min, max) {
    return Math.min(max, Math.max(min, value));
  }

  // Normalize an objective string; anything unrecognized becomes 'balanced'.
  normalizeObjective(objective) {
    const normalized = String(objective || 'balanced').toLowerCase();
    if (normalized === 'latency' || normalized === 'balanced' || normalized === 'throughput') {
      return normalized;
    }
    return 'balanced';
  }

  /**
   * Parameter count (billions) for a model, tried in order:
   * "NNb" in `model.size`, then in `model.name`, then derived from the file
   * size (assuming ~0.65 GB per 1B params), else a 7B default.
   */
  parseParamsB(model = {}) {
    const bySize = String(model.size || '').match(/(\d+(?:\.\d+)?)\s*b/i);
    if (bySize) {
      return this.toFiniteNumber(bySize[1], 0);
    }

    const byName = String(model.name || '').match(/(\d+(?:\.\d+)?)\s*b\b/i);
    if (byName) {
      return this.toFiniteNumber(byName[1], 0);
    }

    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB / 0.65;
    }

    return 7;
  }

  // Memory (GB) to load the model weights plus fixed runtime overhead.
  estimateBaseMemoryGB(model = {}) {
    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB + this.modelOverheadGB;
    }

    const paramsB = this.parseParamsB(model);
    return paramsB * 0.65 + this.modelOverheadGB;
  }

  // KV-cache memory (GB) for one request, scaled linearly with context size.
  estimateKVCacheGB(paramsB, contextTokens) {
    const ctx = this.toFiniteNumber(contextTokens, this.minContext);
    return paramsB * this.kvFactorPer4k * (ctx / 4096);
  }

  /**
   * Map a free-form quantization label onto a canonical tag.
   * Order matters: IQ4/IQ3 are checked before the plain Q4/Q3 substrings
   * they contain. Unknown labels default to Q4_K_M.
   */
  normalizeQuantization(rawQuant) {
    const quant = String(rawQuant || 'Q4_K_M').toUpperCase();
    if (quant.includes('FP16') || quant.includes('F16')) return 'FP16';
    if (quant.includes('Q8')) return 'Q8_0';
    if (quant.includes('Q6')) return 'Q6_K';
    if (quant.includes('Q5')) return 'Q5_K_M';
    if (quant.includes('IQ4')) return 'IQ4_XS';
    if (quant.includes('Q4')) return 'Q4_K_M';
    if (quant.includes('IQ3')) return 'IQ3_XXS';
    if (quant.includes('Q3')) return 'Q3_K_M';
    if (quant.includes('Q2')) return 'Q2_K';
    return 'Q4_K_M';
  }

  // Relative speed multiplier per quantization (FP16 = 1.0; smaller = faster).
  quantizationMultiplier(quantization) {
    const table = {
      FP16: 1.0,
      Q8_0: 1.5,
      Q6_K: 1.8,
      Q5_K_M: 2.0,
      Q4_K_M: 2.4,
      IQ4_XS: 2.5,
      Q3_K_M: 2.9,
      IQ3_XXS: 3.1,
      Q2_K: 3.4
    };
    return table[this.normalizeQuantization(quantization)] || 2.0;
  }

  /**
   * Estimated tokens/sec for one model on one GPU: the GPU's speed
   * coefficient divided by model size, boosted by quantization and damped
   * by large contexts. Always at least 1.
   */
  estimateTokensPerSecond(gpu, model, contextTokens) {
    const paramsB = Math.max(0.5, this.toFiniteNumber(model.paramsB, 7));
    const speedCoefficient = Math.max(1, this.toFiniteNumber(gpu.speedCoefficient, 60));
    const quantMult = this.quantizationMultiplier(model.quantization);

    // Larger contexts reduce generation speed in practice.
    const contextScale = Math.max(0.55, Math.pow(4096 / Math.max(4096, contextTokens), 0.12));
    return Math.max(1, Math.round((speedCoefficient / paramsB) * quantMult * contextScale));
  }

  /**
   * Normalize raw model descriptors into {name, size, fileSizeGB, paramsB,
   * baseMemoryGB, quantization} records, sorted largest-first so the most
   * memory-hungry models are planned first.
   */
  normalizeModels(models = []) {
    const normalized = models
      .filter((model) => model && model.name)
      .map((model) => {
        const paramsB = this.parseParamsB(model);
        const baseMemoryGB = this.estimateBaseMemoryGB(model);
        const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, Math.max(0, baseMemoryGB - this.modelOverheadGB));
        const quantization = model.quantization || model.details?.quantization_level || 'Q4_K_M';
        return {
          name: model.name,
          size: model.size || `${Math.round(paramsB)}B`,
          fileSizeGB: Math.round(fileSizeGB * 10) / 10,
          paramsB: Math.round(paramsB * 10) / 10,
          baseMemoryGB: Math.round(baseMemoryGB * 100) / 100,
          quantization
        };
      });

    normalized.sort((a, b) => b.baseMemoryGB - a.baseMemoryGB);
    return normalized;
  }

  // Standard GPU-visibility env var for the backend, or null when none exists.
  resolveDeviceEnvVar(backend) {
    if (backend === 'cuda') return 'CUDA_VISIBLE_DEVICES';
    if (backend === 'rocm') return 'HIP_VISIBLE_DEVICES';
    return null;
  }

  /**
   * Build the hardware plan: a sorted list of usable GPUs (fastest first)
   * with per-GPU usable memory after spreading `reserveGB` across them.
   * Falls back to a single synthetic GPU from summary VRAM, then to a
   * CPU pseudo-device, so callers always get at least one entry.
   */
  resolveHardware(hardware = {}, reserveGB = null) {
    const summary = hardware.summary || {};
    const primary = hardware.primary || {};
    const backend = summary.bestBackend || primary.type || 'cpu';
    const backendName = summary.backendName || primary.name || 'CPU';

    const backendInfo = primary.info || hardware.backends?.[backend]?.info || {};
    const rawGpus = Array.isArray(backendInfo.gpus) ? backendInfo.gpus : [];

    let gpus = rawGpus.map((gpu, index) => ({
      index: this.toFiniteNumber(gpu.index, index),
      name: String(gpu.name || `GPU ${index}`),
      memoryGB: Math.max(1, this.toFiniteNumber(gpu.memory?.total, 0)),
      speedCoefficient: Math.max(1, this.toFiniteNumber(gpu.speedCoefficient, summary.speedCoefficient || 60))
    }));

    // Fallback 1: summary-level VRAM as a single synthetic GPU.
    if (!gpus.length && this.toFiniteNumber(summary.totalVRAM, 0) > 0) {
      gpus = [{
        index: 0,
        name: summary.gpuModel || 'GPU 0',
        memoryGB: Math.max(1, this.toFiniteNumber(summary.totalVRAM, 0)),
        speedCoefficient: Math.max(1, this.toFiniteNumber(summary.speedCoefficient, 80))
      }];
    }

    // Fallback 2: CPU pseudo-device backed by a share of system RAM.
    if (!gpus.length) {
      gpus = [{
        index: 0,
        name: summary.cpuModel || 'CPU',
        memoryGB: Math.max(4, this.toFiniteNumber(summary.effectiveMemory, this.toFiniteNumber(hardware.memory?.total, 16) * 0.7)),
        speedCoefficient: Math.max(1, this.toFiniteNumber(summary.speedCoefficient, 25))
      }];
    }

    // Fastest first; tie-break on memory so slice(0, n) picks the best GPUs.
    gpus.sort((a, b) => {
      if (b.speedCoefficient !== a.speedCoefficient) return b.speedCoefficient - a.speedCoefficient;
      return b.memoryGB - a.memoryGB;
    });

    const reserve = this.toFiniteNumber(reserveGB, this.defaultReserveGB);
    const reservePerGPU = reserve / Math.max(1, gpus.length);
    const usableGPUs = gpus.map((gpu) => ({
      ...gpu,
      usableMemoryGB: Math.max(1, Math.round((gpu.memoryGB - reservePerGPU) * 100) / 100)
    }));

    const totalUsableGB = usableGPUs.reduce((sum, gpu) => sum + gpu.usableMemoryGB, 0);

    return {
      backend,
      backendName,
      reserveGB: Math.round(reserve * 100) / 100,
      isMultiGPU: usableGPUs.length > 1,
      deviceEnvVar: this.resolveDeviceEnvVar(backend),
      gpus: usableGPUs,
      totalUsableGB: Math.round(totalUsableGB * 100) / 100
    };
  }

  /**
   * Risk rating {level, score} for a placement: non-fitting plans are
   * critical; otherwise score grows with memory utilization plus a penalty
   * for more complex strategies (spread > replica > pin).
   */
  makeRisk(utilization, fits, strategy) {
    if (!fits) {
      return { level: 'critical', score: 95 };
    }

    const complexityPenalty = strategy === 'spread' ? 12 : strategy === 'replica' ? 6 : 0;
    const score = Math.min(100, Math.round((utilization * 72) + complexityPenalty));

    let level = 'low';
    if (score >= 75) level = 'critical';
    else if (score >= 55) level = 'high';
    else if (score >= 35) level = 'medium';

    return { level, score };
  }

  /**
   * Objective score for one strategy plan: estimated throughput minus
   * risk (weighted by objective) minus a complexity penalty, with a large
   * penalty for infeasible plans so they only win when nothing fits.
   */
  strategyScore(strategyPlan, objective) {
    const complexityPenalty = strategyPlan.strategy === 'spread'
      ? (objective === 'latency' ? 12 : 8)
      : strategyPlan.strategy === 'replica'
        ? (objective === 'latency' ? 5 : 3)
        : 0;

    const riskWeight = objective === 'throughput' ? 0.15 : objective === 'latency' ? 0.28 : 0.22;
    const infeasiblePenalty = strategyPlan.feasible ? 0 : 220;

    return strategyPlan.estimated_tps - (strategyPlan.risk.score * riskWeight) - complexityPenalty - infeasiblePenalty;
  }

  /**
   * Simulate the "pin" strategy: the whole model plus all concurrent KV
   * caches on the single fastest GPU. When it does not fit, throughput is
   * scaled down in proportion to the memory overshoot.
   */
  simulatePin(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpu = hardwarePlan.gpus[0];
    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);
    const totalMemoryGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);
    const fits = totalMemoryGB <= gpu.usableMemoryGB;
    const utilization = totalMemoryGB / Math.max(0.1, gpu.usableMemoryGB);

    const baseTPS = this.estimateTokensPerSecond(gpu, model, contextTokens);
    const throughputPenalty = fits ? 1 : Math.max(0.25, gpu.usableMemoryGB / Math.max(0.1, totalMemoryGB));

    return {
      strategy: 'pin',
      feasible: fits,
      estimated_tps: Math.max(1, Math.round(baseTPS * throughputPenalty)),
      memory_per_gpu_gb: Math.round(totalMemoryGB * 100) / 100,
      total_memory_gb: Math.round(totalMemoryGB * 100) / 100,
      utilization_percent: Math.round(utilization * 100),
      gpu_count: 1,
      placement: [{
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        concurrency: targetConcurrency
      }],
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? String(gpu.index) : null,
      risk: this.makeRisk(utilization, fits, 'pin'),
      notes: fits
        ? ['Single-GPU placement keeps routing simple and minimizes scheduling overhead.']
        : ['Model+context+concurrency exceeds single-GPU memory.']
    };
  }

  /**
   * Simulate the "replica" strategy: independent model copies on up to
   * min(gpuCount, concurrency) GPUs. Searches from the most replicas down
   * for the first count whose per-replica memory fits every chosen GPU;
   * concurrency is then balanced across replicas.
   */
  simulateReplica(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpus = hardwarePlan.gpus;
    const maxReplicas = Math.min(gpus.length, targetConcurrency);
    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);

    let selectedReplicas = 1;
    let memoryPerReplicaGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);
    let feasible = false;

    // Prefer more replicas (higher throughput); stop at the first fit.
    for (let replicas = maxReplicas; replicas >= 1; replicas -= 1) {
      const perReplicaConcurrency = Math.ceil(targetConcurrency / replicas);
      const candidateMemory = model.baseMemoryGB + (kvPerRequestGB * perReplicaConcurrency);
      const candidateGPUs = gpus.slice(0, replicas);
      const fitsAll = candidateGPUs.every((gpu) => candidateMemory <= gpu.usableMemoryGB);
      if (fitsAll) {
        selectedReplicas = replicas;
        memoryPerReplicaGB = candidateMemory;
        feasible = true;
        break;
      }
    }

    const chosenGPUs = gpus.slice(0, selectedReplicas);
    const baseTPS = chosenGPUs.reduce(
      (sum, gpu) => sum + this.estimateTokensPerSecond(gpu, model, contextTokens),
      0
    );
    // Small coordination loss per extra replica, floored at 80% efficiency.
    const replicaEfficiency = Math.max(0.8, 0.95 - ((selectedReplicas - 1) * 0.02));
    const estimatedTPS = Math.max(1, Math.round(baseTPS * replicaEfficiency));

    const maxUtilization = chosenGPUs.reduce((max, gpu) => {
      const util = memoryPerReplicaGB / Math.max(0.1, gpu.usableMemoryGB);
      return Math.max(max, util);
    }, 0);

    // Spread remaining concurrency as evenly as possible across replicas.
    const placement = [];
    let remaining = targetConcurrency;
    for (let i = 0; i < chosenGPUs.length; i += 1) {
      const gpu = chosenGPUs[i];
      const slotsLeft = chosenGPUs.length - i;
      const assigned = Math.ceil(remaining / slotsLeft);
      placement.push({
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        concurrency: assigned
      });
      remaining -= assigned;
    }

    const visibleDevices = chosenGPUs.map((gpu) => gpu.index).join(',');

    return {
      strategy: 'replica',
      feasible,
      estimated_tps: estimatedTPS,
      memory_per_gpu_gb: Math.round(memoryPerReplicaGB * 100) / 100,
      total_memory_gb: Math.round(memoryPerReplicaGB * selectedReplicas * 100) / 100,
      utilization_percent: Math.round(maxUtilization * 100),
      gpu_count: selectedReplicas,
      placement,
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? visibleDevices : null,
      risk: this.makeRisk(maxUtilization, feasible, 'replica'),
      notes: feasible
        ? ['Replica strategy scales throughput by running independent model copies per GPU.']
        : ['No replica count can satisfy per-GPU memory constraints at requested settings.']
    };
  }

  /**
   * Simulate the "spread" strategy: one model sharded across 2+ GPUs.
   * Searches from 2 shards up for the first count where the per-GPU share
   * (plus sharding overhead) fits; if none fits, reports an infeasible plan
   * using every GPU. Throughput scales sub-linearly with shard count and is
   * damped by an interconnect penalty.
   */
  simulateSpread(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpus = hardwarePlan.gpus;
    if (gpus.length < 2) {
      return {
        strategy: 'spread',
        feasible: false,
        estimated_tps: 0,
        memory_per_gpu_gb: 0,
        total_memory_gb: 0,
        utilization_percent: 0,
        gpu_count: 1,
        placement: [],
        device_env_var: hardwarePlan.deviceEnvVar,
        visible_devices: null,
        risk: { level: 'critical', score: 100 },
        notes: ['Tensor/spread placement requires at least two GPUs.']
      };
    }

    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);
    const totalMemoryGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);

    let selectedGPUCount = 2;
    let memoryPerGPU = totalMemoryGB / selectedGPUCount + this.spreadOverheadGB;
    let feasible = false;

    // Prefer fewer shards (less interconnect traffic); stop at the first fit.
    for (let shardCount = 2; shardCount <= gpus.length; shardCount += 1) {
      const candidatePerGPU = totalMemoryGB / shardCount + this.spreadOverheadGB;
      const shardGPUs = gpus.slice(0, shardCount);
      const fits = shardGPUs.every((gpu) => candidatePerGPU <= gpu.usableMemoryGB);
      if (fits) {
        selectedGPUCount = shardCount;
        memoryPerGPU = candidatePerGPU;
        feasible = true;
        break;
      }
    }

    if (!feasible) {
      selectedGPUCount = gpus.length;
      memoryPerGPU = totalMemoryGB / selectedGPUCount + this.spreadOverheadGB;
    }

    const chosenGPUs = gpus.slice(0, selectedGPUCount);
    const primaryTPS = this.estimateTokensPerSecond(chosenGPUs[0], model, contextTokens);
    // +55% throughput per extra shard, minus 7% interconnect loss per shard.
    const scaleFactor = 1 + (0.55 * (selectedGPUCount - 1));
    const interconnectPenalty = Math.max(0.65, 1 - (0.07 * (selectedGPUCount - 1)));
    const estimatedTPS = Math.max(1, Math.round(primaryTPS * scaleFactor * interconnectPenalty));

    // Utilization is judged against the smallest GPU (the binding constraint).
    const minUsableMemory = chosenGPUs.reduce((min, gpu) => Math.min(min, gpu.usableMemoryGB), Infinity);
    const utilization = memoryPerGPU / Math.max(0.1, minUsableMemory);
    const visibleDevices = chosenGPUs.map((gpu) => gpu.index).join(',');

    return {
      strategy: 'spread',
      feasible,
      estimated_tps: estimatedTPS,
      memory_per_gpu_gb: Math.round(memoryPerGPU * 100) / 100,
      total_memory_gb: Math.round(memoryPerGPU * selectedGPUCount * 100) / 100,
      utilization_percent: Math.round(utilization * 100),
      gpu_count: selectedGPUCount,
      placement: chosenGPUs.map((gpu) => ({
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        role: 'shard'
      })),
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? visibleDevices : null,
      risk: this.makeRisk(utilization, feasible, 'spread'),
      notes: feasible
        ? ['Spread strategy shards one model across multiple GPUs and favors capacity over simplicity.']
        : ['Even full spread cannot fit requested settings within per-GPU memory limits.']
    };
  }

  // Score every strategy against the objective and return the best one
  // (with its `objective_score` attached).
  pickRecommendedStrategy(strategies, objective) {
    const scored = strategies.map((plan) => ({
      ...plan,
      objective_score: Math.round(this.strategyScore(plan, objective) * 100) / 100
    }));

    scored.sort((a, b) => b.objective_score - a.objective_score);
    return scored[0];
  }

  // Human-readable caveats for the overall plan (single GPU, infeasible
  // models, missing device-visibility env var).
  buildNotes(hardwarePlan, modelPlans) {
    const notes = [];
    if (!hardwarePlan.isMultiGPU) {
      notes.push('Detected single-GPU (or CPU-only) runtime; replica/spread strategies may not be feasible.');
    }

    const infeasibleCount = modelPlans.filter((model) => !model.recommended?.feasible).length;
    if (infeasibleCount > 0) {
      notes.push(`${infeasibleCount} model(s) exceed safe memory at requested ctx/concurrency. Lower --ctx or --concurrency.`);
    }

    if (!hardwarePlan.deviceEnvVar) {
      notes.push('Backend does not expose a standard GPU visibility env var; use strategy output as conceptual placement guidance.');
    }

    return notes;
  }

  /**
   * Produce a full placement plan.
   *
   * @param {object} args
   * @param {object} args.hardware - Detected hardware description.
   * @param {object[]} args.models - Models to plan (must be non-empty).
   * @param {number} [args.targetContext=8192] - Context tokens, clamped to [512, 131072].
   * @param {number} [args.targetConcurrency=2] - Concurrent requests, clamped to [1, 64].
   * @param {string} [args.objective='balanced'] - 'latency' | 'balanced' | 'throughput'.
   * @param {number|null} [args.reserveGB=null] - Memory reserved for the system.
   * @returns {object} Plan with objective, inputs, hardware summary, per-model
   *          strategy comparisons (with a recommendation), and advisory notes.
   * @throws {Error} When `models` normalizes to an empty list.
   */
  plan({
    hardware,
    models,
    targetContext = 8192,
    targetConcurrency = 2,
    objective = 'balanced',
    reserveGB = null
  }) {
    const normalizedModels = this.normalizeModels(models);
    if (!normalizedModels.length) {
      throw new Error('At least one model is required for GPU planning.');
    }

    const normalizedObjective = this.normalizeObjective(objective);
    const contextTokens = this.clamp(
      Math.round(this.toFiniteNumber(targetContext, 8192)),
      512,
      131072
    );
    const concurrency = this.clamp(
      Math.round(this.toFiniteNumber(targetConcurrency, 2)),
      1,
      64
    );

    const hardwarePlan = this.resolveHardware(hardware, reserveGB);

    const modelPlans = normalizedModels.map((model) => {
      const strategies = [
        this.simulatePin(model, hardwarePlan, contextTokens, concurrency),
        this.simulateReplica(model, hardwarePlan, contextTokens, concurrency),
        this.simulateSpread(model, hardwarePlan, contextTokens, concurrency)
      ];

      const recommended = this.pickRecommendedStrategy(strategies, normalizedObjective);

      return {
        name: model.name,
        size: model.size,
        file_size_gb: model.fileSizeGB,
        params_b: model.paramsB,
        quantization: this.normalizeQuantization(model.quantization),
        estimated_base_memory_gb: model.baseMemoryGB,
        recommended,
        strategies
      };
    });

    return {
      objective: normalizedObjective,
      inputs: {
        target_context: contextTokens,
        target_concurrency: concurrency
      },
      hardware: {
        backend: hardwarePlan.backend,
        backend_name: hardwarePlan.backendName,
        is_multi_gpu: hardwarePlan.isMultiGPU,
        gpu_count: hardwarePlan.gpus.length,
        reserve_gb: hardwarePlan.reserveGB,
        total_usable_memory_gb: hardwarePlan.totalUsableGB,
        device_env_var: hardwarePlan.deviceEnvVar,
        gpus: hardwarePlan.gpus.map((gpu) => ({
          index: gpu.index,
          name: gpu.name,
          memory_gb: gpu.memoryGB,
          usable_memory_gb: gpu.usableMemoryGB,
          speed_coefficient: gpu.speedCoefficient
        }))
      },
      models: modelPlans,
      notes: this.buildNotes(hardwarePlan, modelPlans)
    };
  }
}

module.exports = OllamaGPUPlacementPlanner;
|