llm-checker 3.4.1 → 3.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/bin/enhanced_cli.js +134 -0
- package/package.json +1 -1
- package/src/ollama/gpu-placement-planner.js +496 -0
package/README.md
CHANGED
|
@@ -332,6 +332,7 @@ Claude will automatically call the right tools and give you actionable results.
|
|
|
332
332
|
| `calibrate` | Generate calibration result + routing policy artifacts from a JSONL prompt suite |
|
|
333
333
|
| `installed` | Rank your installed Ollama models by compatibility |
|
|
334
334
|
| `ollama-plan` | Compute safe Ollama runtime env vars (`NUM_CTX`, `NUM_PARALLEL`, `MAX_LOADED_MODELS`) for selected local models |
|
|
335
|
+
| `gpu-plan` | Simulate `pin`/`replica`/`spread` multi-GPU placement with memory-fit and throughput estimates per model |
|
|
335
336
|
|
|
336
337
|
### Advanced Commands (require `sql.js`)
|
|
337
338
|
|
package/bin/enhanced_cli.js
CHANGED
|
@@ -3255,6 +3255,140 @@ program
|
|
|
3255
3255
|
}
|
|
3256
3256
|
});
|
|
3257
3257
|
|
|
3258
|
+
// `gpu-plan` command: simulate pin / replica / spread multi-GPU placement for
// selected local Ollama models and print per-model recommendations.
// Pure estimation from detected hardware — no model is actually loaded or run.
program
  .command('gpu-plan')
  .description('Recommend multi-GPU placement strategies for selected local models')
  .option('--models <models...>', 'Model tags/families to include (default: all local models)')
  .option('--ctx <tokens>', 'Target context window in tokens', '8192')
  .option('--concurrency <n>', 'Target parallel request count', '2')
  .option('--objective <mode>', 'Optimization objective (latency|balanced|throughput)', 'balanced')
  .option('--reserve-gb <gb>', 'Memory reserve to subtract from available GPU memory', '1')
  .option('--json', 'Output plan as JSON')
  .action(async (options) => {
    // Suppress the spinner in --json mode so stdout stays machine-parseable.
    const spinner = options.json ? null : ora('Building GPU placement plan...').start();

    try {
      // Validate --objective up front, before any (slow) detection work.
      const requestedObjective = String(options.objective || 'balanced').toLowerCase();
      const supportedObjectives = new Set(['latency', 'balanced', 'throughput']);
      if (!supportedObjectives.has(requestedObjective)) {
        throw new Error(`Invalid objective "${options.objective}". Use latency, balanced, or throughput.`);
      }

      // Commander delivers option values as strings; these helpers parse and
      // reject invalid numbers with the offending flag name in the message.
      const targetContext = parsePositiveIntegerOption(options.ctx, '--ctx');
      const targetConcurrency = parsePositiveIntegerOption(options.concurrency, '--concurrency');
      const reserveGB = parseNonNegativeNumberOption(options.reserveGb, '--reserve-gb');

      // Lazy requires keep CLI startup fast for unrelated commands.
      const OllamaClient = require('../src/ollama/client');
      const UnifiedDetector = require('../src/hardware/unified-detector');
      const OllamaGPUPlacementPlanner = require('../src/ollama/gpu-placement-planner');

      const ollamaClient = new OllamaClient();
      const availability = await ollamaClient.checkOllamaAvailability();
      if (!availability.available) {
        throw new Error(availability.error || 'Ollama is not available');
      }

      const localModels = await ollamaClient.getLocalModels();
      if (!localModels || localModels.length === 0) {
        throw new Error('No local Ollama models found. Install one with: ollama pull llama3.2:3b');
      }

      // Filter installed models by the --models patterns. `missing` collects
      // filters that matched nothing — reported later as a warning, not fatal.
      const { selected, missing } = selectModelsForPlan(localModels, options.models || []);
      if (selected.length === 0) {
        throw new Error(
          `No matching local models found for: ${(options.models || []).join(', ')}`
        );
      }

      const detector = new UnifiedDetector();
      const hardware = await detector.detect();
      const planner = new OllamaGPUPlacementPlanner();

      const plan = planner.plan({
        hardware,
        models: selected,
        targetContext,
        targetConcurrency,
        objective: requestedObjective,
        reserveGB
      });

      // --json: emit the raw plan plus the selection summary and return early
      // (the spinner was never started in this mode).
      if (options.json) {
        console.log(JSON.stringify({
          generated_at: new Date().toISOString(),
          selection: {
            requested: options.models || [],
            selected: selected.map((model) => model.name),
            missing
          },
          plan
        }, null, 2));
        return;
      }

      if (spinner) spinner.succeed('GPU placement plan generated');

      // Human-readable report: hardware summary first, then per-model detail.
      console.log('\n' + chalk.bgMagenta.white.bold(' GPU PLACEMENT PLAN '));
      console.log(chalk.magenta('Backend:'), `${plan.hardware.backend_name} (${plan.hardware.backend})`);
      console.log(
        chalk.magenta('GPU inventory:'),
        `${plan.hardware.gpu_count} device(s), ${plan.hardware.total_usable_memory_gb}GB usable (reserve ${plan.hardware.reserve_gb}GB)`
      );
      console.log(
        chalk.magenta('Target envelope:'),
        `ctx=${plan.inputs.target_context}, concurrency=${plan.inputs.target_concurrency}, objective=${plan.objective}`
      );

      if (missing.length > 0) {
        console.log(chalk.yellow('Missing model filters:'), missing.join(', '));
      }

      if (!plan.hardware.is_multi_gpu) {
        console.log(chalk.yellow('Only one GPU detected: replica/spread are included for simulation but may be infeasible.'));
      }

      // One section per model: the recommended strategy, an optional device
      // pinning hint, then the full strategy comparison table.
      for (const modelPlan of plan.models) {
        const recommended = modelPlan.recommended || {};
        const recFit = recommended.feasible ? chalk.green('fit') : chalk.red('no-fit');
        const recRisk = recommended.risk ? `${recommended.risk.level.toUpperCase()} (${recommended.risk.score})` : 'N/A';

        console.log(chalk.magenta.bold(`\nModel: ${modelPlan.name} (${modelPlan.size})`));
        console.log(
          ` Recommended: ${chalk.bold((recommended.strategy || 'unknown').toUpperCase())} | ${recFit} | ~${recommended.estimated_tps || 0} tok/s | risk ${recRisk}`
        );

        // Only meaningful when the backend exposes a visibility env var
        // (CUDA/ROCm); other backends get conceptual placement only.
        if (recommended.device_env_var && recommended.visible_devices) {
          console.log(` Device pinning hint: export ${recommended.device_env_var}=${recommended.visible_devices}`);
        }

        console.log(chalk.magenta(' Strategies:'));
        for (const strategy of modelPlan.strategies) {
          const fit = strategy.feasible ? chalk.green('fit') : chalk.red('no-fit');
          const risk = strategy.risk ? `${strategy.risk.level} (${strategy.risk.score})` : 'n/a';
          console.log(
            ` - ${strategy.strategy.padEnd(7)} ${fit} | ~${strategy.estimated_tps} tok/s | ${strategy.memory_per_gpu_gb}GB/GPU | risk ${risk}`
          );
        }
      }

      if (plan.notes && plan.notes.length > 0) {
        console.log(chalk.magenta.bold('\nNotes:'));
        for (const note of plan.notes) {
          console.log(` - ${note}`);
        }
      }

      console.log('');
    } catch (error) {
      // Any failure (bad flags, Ollama down, planner error) lands here; show a
      // one-line message, full stack only when DEBUG is set, and exit non-zero.
      if (spinner) spinner.fail('Failed to build GPU placement plan');
      console.error(chalk.red('Error:'), error.message);
      if (process.env.DEBUG) {
        console.error(error.stack);
      }
      process.exit(1);
    }
  });
|
|
3391
|
+
|
|
3258
3392
|
program
|
|
3259
3393
|
.command('recommend')
|
|
3260
3394
|
.description('Get intelligent model recommendations for your hardware')
|
package/src/ollama/gpu-placement-planner.js
ADDED
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
/**
 * Heuristic planner that simulates three ways of placing local Ollama models
 * onto the detected GPU inventory and recommends one per model:
 *
 *  - pin:     whole model + KV cache on the single fastest GPU
 *  - replica: independent model copies on several GPUs, splitting the
 *             requested concurrency between them
 *  - spread:  one model sharded across several GPUs (capacity over simplicity)
 *
 * All outputs are estimates (GB, tokens/sec, risk scores); nothing is run
 * against a live Ollama instance.
 *
 * Fixes vs. previous revision:
 *  - `defaultReserveGB` is now actually applied when no reserve is passed
 *    (`Number(null) === 0` used to defeat the fallback in `toFiniteNumber`).
 *  - `plan()` called with no argument now throws the intended friendly
 *    "At least one model is required" error instead of a destructuring
 *    TypeError.
 */
class OllamaGPUPlacementPlanner {
  /**
   * @param {object} [options]
   * @param {number} [options.minContext=2048] fallback context (tokens) for KV estimates
   * @param {number} [options.defaultReserveGB=1] reserve used when the caller passes none
   * @param {number} [options.kvFactorPer4k=0.08] KV-cache GB per 1B params at 4K context
   * @param {number} [options.modelOverheadGB=0.7] fixed runtime overhead per loaded model
   * @param {number} [options.spreadOverheadGB=0.35] extra per-GPU cost of sharding
   */
  constructor(options = {}) {
    this.minContext = options.minContext || 2048;
    this.defaultReserveGB = options.defaultReserveGB || 1;
    this.kvFactorPer4k = options.kvFactorPer4k || 0.08; // GB per 1B params at 4K context
    this.modelOverheadGB = options.modelOverheadGB || 0.7;
    this.spreadOverheadGB = options.spreadOverheadGB || 0.35;
  }

  /**
   * Coerce `value` to a finite number, using `fallback` for NaN/Infinity.
   * Note: `Number(null)` and `Number('')` are 0, which is finite — callers
   * that want null to mean "use a default" must check for null themselves
   * (see `resolveHardware`).
   */
  toFiniteNumber(value, fallback = 0) {
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : fallback;
  }

  /** Clamp `value` into the inclusive range [min, max]. */
  clamp(value, min, max) {
    return Math.min(max, Math.max(min, value));
  }

  /** Normalize the objective string; anything unrecognized becomes 'balanced'. */
  normalizeObjective(objective) {
    const normalized = String(objective || 'balanced').toLowerCase();
    if (normalized === 'latency' || normalized === 'balanced' || normalized === 'throughput') {
      return normalized;
    }
    return 'balanced';
  }

  /**
   * Best-effort parameter count (billions): parse an "8B"-style marker from
   * `size` or `name`, else infer from file size (~0.65 GB per 1B params at
   * typical quantization), else assume a 7B model.
   */
  parseParamsB(model = {}) {
    const bySize = String(model.size || '').match(/(\d+(?:\.\d+)?)\s*b/i);
    if (bySize) {
      return this.toFiniteNumber(bySize[1], 0);
    }

    const byName = String(model.name || '').match(/(\d+(?:\.\d+)?)\s*b\b/i);
    if (byName) {
      return this.toFiniteNumber(byName[1], 0);
    }

    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB / 0.65;
    }

    return 7;
  }

  /**
   * Resident memory (GB) for loaded weights plus fixed runtime overhead,
   * excluding the per-request KV cache (see `estimateKVCacheGB`).
   */
  estimateBaseMemoryGB(model = {}) {
    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB + this.modelOverheadGB;
    }

    const paramsB = this.parseParamsB(model);
    return paramsB * 0.65 + this.modelOverheadGB;
  }

  /** Per-request KV cache estimate (GB), scaled linearly from the 4K baseline. */
  estimateKVCacheGB(paramsB, contextTokens) {
    const ctx = this.toFiniteNumber(contextTokens, this.minContext);
    return paramsB * this.kvFactorPer4k * (ctx / 4096);
  }

  /**
   * Map a free-form quantization label onto one of the known buckets.
   * Checks IQ4/IQ3 before Q4/Q3 so the importance-quantized variants are not
   * swallowed by the broader substring match. Unknown labels become Q4_K_M.
   */
  normalizeQuantization(rawQuant) {
    const quant = String(rawQuant || 'Q4_K_M').toUpperCase();
    if (quant.includes('FP16') || quant.includes('F16')) return 'FP16';
    if (quant.includes('Q8')) return 'Q8_0';
    if (quant.includes('Q6')) return 'Q6_K';
    if (quant.includes('Q5')) return 'Q5_K_M';
    if (quant.includes('IQ4')) return 'IQ4_XS';
    if (quant.includes('Q4')) return 'Q4_K_M';
    if (quant.includes('IQ3')) return 'IQ3_XXS';
    if (quant.includes('Q3')) return 'Q3_K_M';
    if (quant.includes('Q2')) return 'Q2_K';
    return 'Q4_K_M';
  }

  /**
   * Relative decode-speed multiplier vs FP16 — heavier quantization moves
   * fewer bytes per token, so it decodes faster in this model.
   */
  quantizationMultiplier(quantization) {
    const table = {
      FP16: 1.0,
      Q8_0: 1.5,
      Q6_K: 1.8,
      Q5_K_M: 2.0,
      Q4_K_M: 2.4,
      IQ4_XS: 2.5,
      Q3_K_M: 2.9,
      IQ3_XXS: 3.1,
      Q2_K: 3.4
    };
    return table[this.normalizeQuantization(quantization)] || 2.0;
  }

  /**
   * Rough tokens/sec for one model on one GPU:
   * (gpu speed coefficient / params) * quantization multiplier * context scale.
   */
  estimateTokensPerSecond(gpu, model, contextTokens) {
    const paramsB = Math.max(0.5, this.toFiniteNumber(model.paramsB, 7));
    const speedCoefficient = Math.max(1, this.toFiniteNumber(gpu.speedCoefficient, 60));
    const quantMult = this.quantizationMultiplier(model.quantization);

    // Larger contexts reduce generation speed in practice.
    const contextScale = Math.max(0.55, Math.pow(4096 / Math.max(4096, contextTokens), 0.12));
    return Math.max(1, Math.round((speedCoefficient / paramsB) * quantMult * contextScale));
  }

  /**
   * Normalize raw model records into the shape the simulators consume
   * (name, size label, fileSizeGB, paramsB, baseMemoryGB, quantization),
   * sorted largest-first by base memory so the heaviest model leads the report.
   */
  normalizeModels(models = []) {
    const normalized = models
      .filter((model) => model && model.name)
      .map((model) => {
        const paramsB = this.parseParamsB(model);
        const baseMemoryGB = this.estimateBaseMemoryGB(model);
        const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, Math.max(0, baseMemoryGB - this.modelOverheadGB));
        const quantization = model.quantization || model.details?.quantization_level || 'Q4_K_M';
        return {
          name: model.name,
          size: model.size || `${Math.round(paramsB)}B`,
          fileSizeGB: Math.round(fileSizeGB * 10) / 10,
          paramsB: Math.round(paramsB * 10) / 10,
          baseMemoryGB: Math.round(baseMemoryGB * 100) / 100,
          quantization
        };
      });

    normalized.sort((a, b) => b.baseMemoryGB - a.baseMemoryGB);
    return normalized;
  }

  /** Standard device-visibility env var for the backend, or null if none exists. */
  resolveDeviceEnvVar(backend) {
    if (backend === 'cuda') return 'CUDA_VISIBLE_DEVICES';
    if (backend === 'rocm') return 'HIP_VISIBLE_DEVICES';
    return null;
  }

  /**
   * Normalize detector output into a GPU inventory with per-device usable
   * memory. Falls back to a synthetic single "GPU" built from summary VRAM,
   * and finally to a CPU pseudo-device, so downstream code always has at
   * least one device. GPUs are sorted fastest-first so slice(0, n) picks the
   * best devices.
   *
   * NOTE(review): assumes `gpu.memory.total` and `summary.totalVRAM` are in
   * GB — confirm against unified-detector.
   *
   * @param {object} hardware  output of UnifiedDetector#detect()
   * @param {?number} reserveGB memory to hold back across the inventory;
   *   null/undefined means "use this.defaultReserveGB"
   */
  resolveHardware(hardware = {}, reserveGB = null) {
    const summary = hardware.summary || {};
    const primary = hardware.primary || {};
    const backend = summary.bestBackend || primary.type || 'cpu';
    const backendName = summary.backendName || primary.name || 'CPU';

    const backendInfo = primary.info || hardware.backends?.[backend]?.info || {};
    const rawGpus = Array.isArray(backendInfo.gpus) ? backendInfo.gpus : [];

    let gpus = rawGpus.map((gpu, index) => ({
      index: this.toFiniteNumber(gpu.index, index),
      name: String(gpu.name || `GPU ${index}`),
      memoryGB: Math.max(1, this.toFiniteNumber(gpu.memory?.total, 0)),
      speedCoefficient: Math.max(1, this.toFiniteNumber(gpu.speedCoefficient, summary.speedCoefficient || 60))
    }));

    if (!gpus.length && this.toFiniteNumber(summary.totalVRAM, 0) > 0) {
      gpus = [{
        index: 0,
        name: summary.gpuModel || 'GPU 0',
        memoryGB: Math.max(1, this.toFiniteNumber(summary.totalVRAM, 0)),
        speedCoefficient: Math.max(1, this.toFiniteNumber(summary.speedCoefficient, 80))
      }];
    }

    if (!gpus.length) {
      gpus = [{
        index: 0,
        name: summary.cpuModel || 'CPU',
        memoryGB: Math.max(4, this.toFiniteNumber(summary.effectiveMemory, this.toFiniteNumber(hardware.memory?.total, 16) * 0.7)),
        speedCoefficient: Math.max(1, this.toFiniteNumber(summary.speedCoefficient, 25))
      }];
    }

    gpus.sort((a, b) => {
      if (b.speedCoefficient !== a.speedCoefficient) return b.speedCoefficient - a.speedCoefficient;
      return b.memoryGB - a.memoryGB;
    });

    // BUG FIX: `toFiniteNumber(null, default)` returned 0 because
    // Number(null) === 0 is finite, so defaultReserveGB was never applied.
    // Treat null/undefined explicitly so the constructor default takes effect.
    const reserve = reserveGB == null
      ? this.defaultReserveGB
      : this.toFiniteNumber(reserveGB, this.defaultReserveGB);
    // The reserve is split evenly across all devices.
    const reservePerGPU = reserve / Math.max(1, gpus.length);
    const usableGPUs = gpus.map((gpu) => ({
      ...gpu,
      usableMemoryGB: Math.max(1, Math.round((gpu.memoryGB - reservePerGPU) * 100) / 100)
    }));

    const totalUsableGB = usableGPUs.reduce((sum, gpu) => sum + gpu.usableMemoryGB, 0);

    return {
      backend,
      backendName,
      reserveGB: Math.round(reserve * 100) / 100,
      isMultiGPU: usableGPUs.length > 1,
      deviceEnvVar: this.resolveDeviceEnvVar(backend),
      gpus: usableGPUs,
      totalUsableGB: Math.round(totalUsableGB * 100) / 100
    };
  }

  /**
   * Risk score (0-100) and level for a strategy. Non-fitting placements are
   * always critical; otherwise the score grows with memory utilization plus a
   * small penalty for operational complexity (spread > replica > pin).
   */
  makeRisk(utilization, fits, strategy) {
    if (!fits) {
      return { level: 'critical', score: 95 };
    }

    const complexityPenalty = strategy === 'spread' ? 12 : strategy === 'replica' ? 6 : 0;
    const score = Math.min(100, Math.round((utilization * 72) + complexityPenalty));

    let level = 'low';
    if (score >= 75) level = 'critical';
    else if (score >= 55) level = 'high';
    else if (score >= 35) level = 'medium';

    return { level, score };
  }

  /**
   * Ranking score for a strategy under the chosen objective: estimated
   * throughput minus objective-weighted risk and complexity, with a large
   * penalty for infeasible placements so they only win when nothing fits.
   */
  strategyScore(strategyPlan, objective) {
    const complexityPenalty = strategyPlan.strategy === 'spread'
      ? (objective === 'latency' ? 12 : 8)
      : strategyPlan.strategy === 'replica'
        ? (objective === 'latency' ? 5 : 3)
        : 0;

    const riskWeight = objective === 'throughput' ? 0.15 : objective === 'latency' ? 0.28 : 0.22;
    const infeasiblePenalty = strategyPlan.feasible ? 0 : 220;

    return strategyPlan.estimated_tps - (strategyPlan.risk.score * riskWeight) - complexityPenalty - infeasiblePenalty;
  }

  /**
   * Strategy 1 — pin: the whole model and the full concurrent KV cache on the
   * single fastest GPU (gpus[0] after the fastest-first sort).
   */
  simulatePin(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpu = hardwarePlan.gpus[0];
    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);
    const totalMemoryGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);
    const fits = totalMemoryGB <= gpu.usableMemoryGB;
    const utilization = totalMemoryGB / Math.max(0.1, gpu.usableMemoryGB);

    const baseTPS = this.estimateTokensPerSecond(gpu, model, contextTokens);
    // When it doesn't fit, degrade throughput in proportion to the overflow.
    const throughputPenalty = fits ? 1 : Math.max(0.25, gpu.usableMemoryGB / Math.max(0.1, totalMemoryGB));

    return {
      strategy: 'pin',
      feasible: fits,
      estimated_tps: Math.max(1, Math.round(baseTPS * throughputPenalty)),
      memory_per_gpu_gb: Math.round(totalMemoryGB * 100) / 100,
      total_memory_gb: Math.round(totalMemoryGB * 100) / 100,
      utilization_percent: Math.round(utilization * 100),
      gpu_count: 1,
      placement: [{
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        concurrency: targetConcurrency
      }],
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? String(gpu.index) : null,
      risk: this.makeRisk(utilization, fits, 'pin'),
      notes: fits
        ? ['Single-GPU placement keeps routing simple and minimizes scheduling overhead.']
        : ['Model+context+concurrency exceeds single-GPU memory.']
    };
  }

  /**
   * Strategy 2 — replica: independent copies of the model on up to
   * min(gpu_count, concurrency) GPUs. Searches from the most replicas down
   * (more replicas → less per-replica concurrency → less memory each) and
   * takes the first count whose per-replica footprint fits every chosen GPU.
   */
  simulateReplica(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpus = hardwarePlan.gpus;
    const maxReplicas = Math.min(gpus.length, targetConcurrency);
    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);

    // Defaults describe the infeasible single-replica worst case.
    let selectedReplicas = 1;
    let memoryPerReplicaGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);
    let feasible = false;

    for (let replicas = maxReplicas; replicas >= 1; replicas -= 1) {
      const perReplicaConcurrency = Math.ceil(targetConcurrency / replicas);
      const candidateMemory = model.baseMemoryGB + (kvPerRequestGB * perReplicaConcurrency);
      const candidateGPUs = gpus.slice(0, replicas);
      const fitsAll = candidateGPUs.every((gpu) => candidateMemory <= gpu.usableMemoryGB);
      if (fitsAll) {
        selectedReplicas = replicas;
        memoryPerReplicaGB = candidateMemory;
        feasible = true;
        break;
      }
    }

    const chosenGPUs = gpus.slice(0, selectedReplicas);
    const baseTPS = chosenGPUs.reduce(
      (sum, gpu) => sum + this.estimateTokensPerSecond(gpu, model, contextTokens),
      0
    );
    // Small efficiency loss per extra replica for routing/scheduling overhead.
    const replicaEfficiency = Math.max(0.8, 0.95 - ((selectedReplicas - 1) * 0.02));
    const estimatedTPS = Math.max(1, Math.round(baseTPS * replicaEfficiency));

    const maxUtilization = chosenGPUs.reduce((max, gpu) => {
      const util = memoryPerReplicaGB / Math.max(0.1, gpu.usableMemoryGB);
      return Math.max(max, util);
    }, 0);

    // Distribute the requested concurrency across replicas, front-loaded.
    const placement = [];
    let remaining = targetConcurrency;
    for (let i = 0; i < chosenGPUs.length; i += 1) {
      const gpu = chosenGPUs[i];
      const slotsLeft = chosenGPUs.length - i;
      const assigned = Math.ceil(remaining / slotsLeft);
      placement.push({
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        concurrency: assigned
      });
      remaining -= assigned;
    }

    const visibleDevices = chosenGPUs.map((gpu) => gpu.index).join(',');

    return {
      strategy: 'replica',
      feasible,
      estimated_tps: estimatedTPS,
      memory_per_gpu_gb: Math.round(memoryPerReplicaGB * 100) / 100,
      total_memory_gb: Math.round(memoryPerReplicaGB * selectedReplicas * 100) / 100,
      utilization_percent: Math.round(maxUtilization * 100),
      gpu_count: selectedReplicas,
      placement,
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? visibleDevices : null,
      risk: this.makeRisk(maxUtilization, feasible, 'replica'),
      notes: feasible
        ? ['Replica strategy scales throughput by running independent model copies per GPU.']
        : ['No replica count can satisfy per-GPU memory constraints at requested settings.']
    };
  }

  /**
   * Strategy 3 — spread: shard one model across 2..N GPUs. Takes the smallest
   * shard count whose per-GPU slice (plus sharding overhead) fits everywhere;
   * if none fits, reports the full-spread footprint as infeasible. Requires
   * at least two GPUs.
   */
  simulateSpread(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpus = hardwarePlan.gpus;
    if (gpus.length < 2) {
      return {
        strategy: 'spread',
        feasible: false,
        estimated_tps: 0,
        memory_per_gpu_gb: 0,
        total_memory_gb: 0,
        utilization_percent: 0,
        gpu_count: 1,
        placement: [],
        device_env_var: hardwarePlan.deviceEnvVar,
        visible_devices: null,
        risk: { level: 'critical', score: 100 },
        notes: ['Tensor/spread placement requires at least two GPUs.']
      };
    }

    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);
    const totalMemoryGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);

    let selectedGPUCount = 2;
    let memoryPerGPU = totalMemoryGB / selectedGPUCount + this.spreadOverheadGB;
    let feasible = false;

    for (let shardCount = 2; shardCount <= gpus.length; shardCount += 1) {
      const candidatePerGPU = totalMemoryGB / shardCount + this.spreadOverheadGB;
      const shardGPUs = gpus.slice(0, shardCount);
      const fits = shardGPUs.every((gpu) => candidatePerGPU <= gpu.usableMemoryGB);
      if (fits) {
        selectedGPUCount = shardCount;
        memoryPerGPU = candidatePerGPU;
        feasible = true;
        break;
      }
    }

    if (!feasible) {
      selectedGPUCount = gpus.length;
      memoryPerGPU = totalMemoryGB / selectedGPUCount + this.spreadOverheadGB;
    }

    const chosenGPUs = gpus.slice(0, selectedGPUCount);
    const primaryTPS = this.estimateTokensPerSecond(chosenGPUs[0], model, contextTokens);
    // Each extra shard adds ~55% of one GPU's throughput, minus an
    // interconnect penalty that grows with shard count.
    const scaleFactor = 1 + (0.55 * (selectedGPUCount - 1));
    const interconnectPenalty = Math.max(0.65, 1 - (0.07 * (selectedGPUCount - 1)));
    const estimatedTPS = Math.max(1, Math.round(primaryTPS * scaleFactor * interconnectPenalty));

    // Utilization is judged against the smallest chosen GPU (the bottleneck).
    const minUsableMemory = chosenGPUs.reduce((min, gpu) => Math.min(min, gpu.usableMemoryGB), Infinity);
    const utilization = memoryPerGPU / Math.max(0.1, minUsableMemory);
    const visibleDevices = chosenGPUs.map((gpu) => gpu.index).join(',');

    return {
      strategy: 'spread',
      feasible,
      estimated_tps: estimatedTPS,
      memory_per_gpu_gb: Math.round(memoryPerGPU * 100) / 100,
      total_memory_gb: Math.round(memoryPerGPU * selectedGPUCount * 100) / 100,
      utilization_percent: Math.round(utilization * 100),
      gpu_count: selectedGPUCount,
      placement: chosenGPUs.map((gpu) => ({
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        role: 'shard'
      })),
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? visibleDevices : null,
      risk: this.makeRisk(utilization, feasible, 'spread'),
      notes: feasible
        ? ['Spread strategy shards one model across multiple GPUs and favors capacity over simplicity.']
        : ['Even full spread cannot fit requested settings within per-GPU memory limits.']
    };
  }

  /** Score every strategy under the objective and return the best (with its score attached). */
  pickRecommendedStrategy(strategies, objective) {
    const scored = strategies.map((plan) => ({
      ...plan,
      objective_score: Math.round(this.strategyScore(plan, objective) * 100) / 100
    }));

    scored.sort((a, b) => b.objective_score - a.objective_score);
    return scored[0];
  }

  /** Plan-level advisory notes shown at the bottom of the report. */
  buildNotes(hardwarePlan, modelPlans) {
    const notes = [];
    if (!hardwarePlan.isMultiGPU) {
      notes.push('Detected single-GPU (or CPU-only) runtime; replica/spread strategies may not be feasible.');
    }

    const infeasibleCount = modelPlans.filter((model) => !model.recommended?.feasible).length;
    if (infeasibleCount > 0) {
      notes.push(`${infeasibleCount} model(s) exceed safe memory at requested ctx/concurrency. Lower --ctx or --concurrency.`);
    }

    if (!hardwarePlan.deviceEnvVar) {
      notes.push('Backend does not expose a standard GPU visibility env var; use strategy output as conceptual placement guidance.');
    }

    return notes;
  }

  /**
   * Build the full placement plan.
   *
   * @param {object} [args]
   * @param {object} args.hardware  UnifiedDetector#detect() output
   * @param {Array<object>} args.models  local model records (name required)
   * @param {number} [args.targetContext=8192]  clamped to [512, 131072]
   * @param {number} [args.targetConcurrency=2]  clamped to [1, 64]
   * @param {string} [args.objective='balanced']  latency|balanced|throughput
   * @param {?number} [args.reserveGB=null]  null → defaultReserveGB
   * @returns {object} { objective, inputs, hardware, models, notes }
   * @throws {Error} when no usable model is supplied
   */
  plan({
    hardware,
    models,
    targetContext = 8192,
    targetConcurrency = 2,
    objective = 'balanced',
    reserveGB = null
  } = {}) {
    // BUG FIX: default the whole argument object so `plan()` raises the
    // intended error below instead of a TypeError from destructuring.
    const normalizedModels = this.normalizeModels(models);
    if (!normalizedModels.length) {
      throw new Error('At least one model is required for GPU planning.');
    }

    const normalizedObjective = this.normalizeObjective(objective);
    const contextTokens = this.clamp(
      Math.round(this.toFiniteNumber(targetContext, 8192)),
      512,
      131072
    );
    const concurrency = this.clamp(
      Math.round(this.toFiniteNumber(targetConcurrency, 2)),
      1,
      64
    );

    const hardwarePlan = this.resolveHardware(hardware, reserveGB);

    const modelPlans = normalizedModels.map((model) => {
      const strategies = [
        this.simulatePin(model, hardwarePlan, contextTokens, concurrency),
        this.simulateReplica(model, hardwarePlan, contextTokens, concurrency),
        this.simulateSpread(model, hardwarePlan, contextTokens, concurrency)
      ];

      const recommended = this.pickRecommendedStrategy(strategies, normalizedObjective);

      return {
        name: model.name,
        size: model.size,
        file_size_gb: model.fileSizeGB,
        params_b: model.paramsB,
        quantization: this.normalizeQuantization(model.quantization),
        estimated_base_memory_gb: model.baseMemoryGB,
        recommended,
        strategies
      };
    });

    return {
      objective: normalizedObjective,
      inputs: {
        target_context: contextTokens,
        target_concurrency: concurrency
      },
      hardware: {
        backend: hardwarePlan.backend,
        backend_name: hardwarePlan.backendName,
        is_multi_gpu: hardwarePlan.isMultiGPU,
        gpu_count: hardwarePlan.gpus.length,
        reserve_gb: hardwarePlan.reserveGB,
        total_usable_memory_gb: hardwarePlan.totalUsableGB,
        device_env_var: hardwarePlan.deviceEnvVar,
        gpus: hardwarePlan.gpus.map((gpu) => ({
          index: gpu.index,
          name: gpu.name,
          memory_gb: gpu.memoryGB,
          usable_memory_gb: gpu.usableMemoryGB,
          speed_coefficient: gpu.speedCoefficient
        }))
      },
      models: modelPlans,
      notes: this.buildNotes(hardwarePlan, modelPlans)
    };
  }
}
|
|
495
|
+
|
|
496
|
+
// CommonJS export; consumed by bin/enhanced_cli.js via
// require('../src/ollama/gpu-placement-planner').
module.exports = OllamaGPUPlacementPlanner;