llm-checker 3.3.0 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/bin/enhanced_cli.js +213 -0
- package/package.json +1 -1
- package/src/ollama/capacity-planner.js +399 -0
package/README.md
CHANGED
|
@@ -309,6 +309,7 @@ Claude will automatically call the right tools and give you actionable results.
|
|
|
309
309
|
| `recommend` | Intelligent recommendations by category (coding, reasoning, multimodal, etc.) |
|
|
310
310
|
| `calibrate` | Generate calibration result + routing policy artifacts from a JSONL prompt suite |
|
|
311
311
|
| `installed` | Rank your installed Ollama models by compatibility |
|
|
312
|
+
| `ollama-plan` | Compute safe Ollama runtime env vars (`NUM_CTX`, `NUM_PARALLEL`, `MAX_LOADED_MODELS`) for selected local models |
|
|
312
313
|
|
|
313
314
|
### Advanced Commands (require `sql.js`)
|
|
314
315
|
|
package/bin/enhanced_cli.js
CHANGED
|
@@ -591,6 +591,80 @@ async function checkOllamaAndExit() {
|
|
|
591
591
|
}
|
|
592
592
|
}
|
|
593
593
|
|
|
594
|
+
/**
 * Parse a CLI option value that must be a strictly positive integer.
 * Fractional inputs are accepted and rounded to the nearest integer.
 *
 * @param {string} rawValue - Raw option value as captured by commander.
 * @param {string} optionName - Flag name (e.g. '--ctx') used in the error message.
 * @returns {number} The rounded positive integer.
 * @throws {Error} If the value is not a finite number greater than zero.
 */
function parsePositiveIntegerOption(rawValue, optionName) {
  const numeric = Number(rawValue);
  const isUsable = Number.isFinite(numeric) && numeric > 0;
  if (!isUsable) {
    throw new Error(`Invalid ${optionName}: ${rawValue}`);
  }
  return Math.round(numeric);
}
|
|
601
|
+
|
|
602
|
+
/**
 * Parse a CLI option value that must be a non-negative number (zero allowed,
 * fractions allowed — e.g. a reserve of 1.5 GB).
 *
 * @param {string} rawValue - Raw option value as captured by commander.
 * @param {string} optionName - Flag name (e.g. '--reserve-gb') used in the error message.
 * @returns {number} The parsed number, unmodified.
 * @throws {Error} If the value is not a finite number >= 0.
 */
function parseNonNegativeNumberOption(rawValue, optionName) {
  const numeric = Number(rawValue);
  const isUsable = Number.isFinite(numeric) && numeric >= 0;
  if (!isUsable) {
    throw new Error(`Invalid ${optionName}: ${rawValue}`);
  }
  return numeric;
}
|
|
609
|
+
|
|
610
|
+
/**
 * Resolve user-supplied model filters against the installed Ollama models.
 *
 * With no filters, every installed model is selected. Otherwise each filter is
 * matched (case-insensitively) through a cascade of strategies, most specific
 * first: exact name, name prefix before the tag separator (`filter:`), exact
 * family, then substring of the name. Filters that match nothing are reported
 * back; duplicate matches are collapsed by model name.
 *
 * @param {Array<object>} installedModels - Models reported by the Ollama client.
 * @param {Array<string>} [requestedModels] - Raw filter strings from the CLI.
 * @returns {{selected: Array<object>, missing: Array<string>}}
 */
function selectModelsForPlan(installedModels, requestedModels = []) {
  const filters = Array.isArray(requestedModels)
    ? requestedModels.map((entry) => String(entry || '').trim()).filter(Boolean)
    : [];

  // No filters means "plan for everything that is installed".
  if (filters.length === 0) {
    return { selected: installedModels.slice(), missing: [] };
  }

  const lowerName = (model) => String(model.name || '').toLowerCase();
  const lowerFamily = (model) => String(model.family || '').toLowerCase();

  const selected = [];
  const missing = [];
  const pickedNames = new Set();

  for (const filter of filters) {
    const needle = filter.toLowerCase();

    // Ordered from most to least specific so "llama3" prefers the exact tag
    // over an incidental substring hit.
    const strategies = [
      (model) => lowerName(model) === needle,
      (model) => lowerName(model).startsWith(`${needle}:`),
      (model) => lowerFamily(model) === needle,
      (model) => lowerName(model).includes(needle)
    ];

    let found = null;
    for (const matches of strategies) {
      found = installedModels.find(matches);
      if (found) {
        break;
      }
    }

    if (!found) {
      missing.push(filter);
    } else if (!pickedNames.has(found.name)) {
      selected.push(found);
      pickedNames.add(found.name);
    }
  }

  return { selected, missing };
}
|
|
667
|
+
|
|
594
668
|
function getStatusIcon(model, ollamaModels) {
|
|
595
669
|
const ollamaModel = ollamaModels?.find(om => om.matchedModel?.name === model.name);
|
|
596
670
|
|
|
@@ -3042,6 +3116,145 @@ program
|
|
|
3042
3116
|
}
|
|
3043
3117
|
});
|
|
3044
3118
|
|
|
3119
|
+
// `ollama-plan`: computes a safe Ollama runtime envelope (NUM_CTX, NUM_PARALLEL,
// MAX_LOADED_MODELS, queue/keep-alive) for the locally installed models, based
// on detected hardware and a memory reserve. Prints a human-readable report or,
// with --json, a machine-readable plan. Exits 1 on any failure.
program
  .command('ollama-plan')
  .description('Plan safe Ollama runtime settings for selected local models')
  .option('--models <models...>', 'Model tags/families to include (default: all local models)')
  .option('--ctx <tokens>', 'Target context window in tokens', '8192')
  .option('--concurrency <n>', 'Target parallel request count', '2')
  .option('--objective <mode>', 'Optimization objective (latency|balanced|throughput)', 'balanced')
  .option('--reserve-gb <gb>', 'Memory reserve for OS and background workloads', '2')
  .option('--json', 'Output plan as JSON')
  .action(async (options) => {
    // Suppress the spinner in JSON mode so stdout stays machine-parseable.
    const spinner = options.json ? null : ora('Building Ollama capacity plan...').start();

    try {
      // Validate the objective up front; the planner would silently fall back
      // to 'balanced', but the CLI should reject typos loudly.
      const requestedObjective = String(options.objective || 'balanced').toLowerCase();
      const supportedObjectives = new Set(['latency', 'balanced', 'throughput']);
      if (!supportedObjectives.has(requestedObjective)) {
        throw new Error(`Invalid objective "${options.objective}". Use latency, balanced, or throughput.`);
      }

      // Numeric option parsing throws with a flag-specific message on bad input.
      const targetContext = parsePositiveIntegerOption(options.ctx, '--ctx');
      const targetConcurrency = parsePositiveIntegerOption(options.concurrency, '--concurrency');
      const reserveGB = parseNonNegativeNumberOption(options.reserveGb, '--reserve-gb');

      // Lazy-require the heavy modules so unrelated commands stay fast to start.
      const OllamaClient = require('../src/ollama/client');
      const UnifiedDetector = require('../src/hardware/unified-detector');
      const OllamaCapacityPlanner = require('../src/ollama/capacity-planner');

      const ollamaClient = new OllamaClient();
      const availability = await ollamaClient.checkOllamaAvailability();
      if (!availability.available) {
        throw new Error(availability.error || 'Ollama is not available');
      }

      const localModels = await ollamaClient.getLocalModels();
      if (!localModels || localModels.length === 0) {
        throw new Error('No local Ollama models found. Install one with: ollama pull llama3.2:3b');
      }

      // Resolve --models filters against what is actually installed; unmatched
      // filters are reported as warnings rather than aborting the plan.
      const { selected, missing } = selectModelsForPlan(localModels, options.models || []);
      if (selected.length === 0) {
        throw new Error(
          `No matching local models found for: ${(options.models || []).join(', ')}`
        );
      }

      const detector = new UnifiedDetector();
      const hardware = await detector.detect();
      const planner = new OllamaCapacityPlanner();

      const plan = planner.plan({
        hardware,
        models: selected,
        targetContext,
        targetConcurrency,
        objective: requestedObjective,
        reserveGB
      });

      // JSON mode: emit the full plan plus the selection audit trail and return
      // before any styled console output.
      if (options.json) {
        console.log(JSON.stringify({
          generated_at: new Date().toISOString(),
          selection: {
            requested: options.models || [],
            selected: selected.map((model) => model.name),
            missing
          },
          plan
        }, null, 2));
        return;
      }

      if (spinner) spinner.succeed('Capacity plan generated');

      // Human-readable report: hardware summary, selected models, recommended
      // envelope, notes, env vars, and a conservative single-model fallback.
      console.log('\n' + chalk.bgBlue.white.bold(' OLLAMA CAPACITY PLAN '));
      console.log(
        chalk.blue('Hardware:'),
        `${plan.hardware.backendName} (${plan.hardware.backend})`
      );
      console.log(
        chalk.blue('Memory budget:'),
        `${plan.memory.budgetGB}GB usable (reserve ${plan.hardware.reserveGB}GB)`
      );

      if (missing.length > 0) {
        console.log(
          chalk.yellow('Missing model filters:'),
          missing.join(', ')
        );
      }

      console.log(chalk.blue.bold('\nSelected models:'));
      for (const model of plan.models) {
        console.log(
          `  - ${model.name} (${model.size}, ~${model.estimatedBaseMemoryGB}GB base)`
        );
      }

      console.log(chalk.blue.bold('\nRecommended envelope:'));
      console.log(
        `  Context: ${plan.envelope.context.recommended} (requested ${plan.envelope.context.requested})`
      );
      console.log(
        `  Parallel: ${plan.envelope.parallel.recommended} (requested ${plan.envelope.parallel.requested})`
      );
      console.log(
        `  Loaded models: ${plan.envelope.loaded_models.recommended} (requested ${plan.envelope.loaded_models.requested})`
      );
      console.log(
        `  Estimated memory: ${plan.memory.recommendedEstimatedGB}GB / ${plan.memory.budgetGB}GB (${plan.memory.utilizationPercent}%)`
      );
      console.log(`  Risk: ${plan.risk.level.toUpperCase()} (${plan.risk.score}/100)`);

      if (plan.notes.length > 0) {
        console.log(chalk.blue.bold('\nNotes:'));
        for (const note of plan.notes) {
          console.log(`  - ${note}`);
        }
      }

      console.log(chalk.blue.bold('\nRecommended env vars:'));
      for (const [key, value] of Object.entries(plan.shell.env)) {
        console.log(`  export ${key}=${value}`);
      }

      console.log(chalk.blue.bold('\nFallback profile:'));
      console.log(
        `  OLLAMA_NUM_CTX=${plan.fallback.num_ctx} OLLAMA_NUM_PARALLEL=${plan.fallback.num_parallel} OLLAMA_MAX_LOADED_MODELS=${plan.fallback.max_loaded_models}`
      );
      console.log('');
    } catch (error) {
      if (spinner) spinner.fail('Failed to build capacity plan');
      console.error(chalk.red('Error:'), error.message);
      // Stack traces only under DEBUG to keep normal CLI errors terse.
      if (process.env.DEBUG) {
        console.error(error.stack);
      }
      process.exit(1);
    }
  });
|
|
3257
|
+
|
|
3045
3258
|
program
|
|
3046
3259
|
.command('recommend')
|
|
3047
3260
|
.description('Get intelligent model recommendations for your hardware')
|
package/src/ollama/capacity-planner.js
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
/**
 * Heuristic capacity planner for a local Ollama server.
 *
 * Given detected hardware and a set of installed models, `plan()` produces a
 * memory-safe runtime envelope: recommended context window, parallel request
 * count, and number of concurrently loaded models, plus the matching
 * OLLAMA_* environment variables and a conservative single-model fallback.
 *
 * All memory figures are rough estimates: base model memory comes from the
 * quantized file size (or a Q4 ~0.65 GB-per-1B-params approximation) plus a
 * fixed runtime overhead, and KV-cache cost scales linearly with context and
 * parameter count via `kvFactorPer4k`.
 */
class OllamaCapacityPlanner {
  /**
   * @param {object} [options]
   * @param {number} [options.minContext=2048] - Floor for any recommended context.
   * @param {number} [options.maxParallelCap=8] - Hard ceiling on parallelism.
   * @param {number} [options.defaultReserveGB=2] - Memory reserved for the OS.
   * @param {number} [options.kvFactorPer4k=0.08] - GB of KV cache per 1B params at 4k ctx.
   * @param {number} [options.modelOverheadGB=0.7] - Fixed per-model runtime overhead.
   */
  constructor(options = {}) {
    this.minContext = options.minContext || 2048;
    this.maxParallelCap = options.maxParallelCap || 8;
    this.defaultReserveGB = options.defaultReserveGB || 2;
    this.kvFactorPer4k = options.kvFactorPer4k || 0.08; // GB per 1B params at 4k ctx
    this.modelOverheadGB = options.modelOverheadGB || 0.7;
  }

  /**
   * Coerce a value to a finite number, or return `fallback`.
   * Note: Number(null) === 0 (finite), so callers that treat null as
   * "not provided" must coalesce before calling (see resolveHardwareBudget).
   */
  toFiniteNumber(value, fallback = 0) {
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : fallback;
  }

  /** Clamp `value` into [min, max]. */
  clamp(value, min, max) {
    return Math.min(max, Math.max(min, value));
  }

  /** Normalize an objective string; unknown values fall back to 'balanced'. */
  normalizeObjective(objective) {
    const normalized = String(objective || 'balanced').toLowerCase();
    if (normalized === 'latency' || normalized === 'throughput' || normalized === 'balanced') {
      return normalized;
    }
    return 'balanced';
  }

  /**
   * Caps and keep-alive per objective: latency keeps few models hot for long,
   * throughput trades keep-alive for more parallelism and loaded models.
   */
  objectiveProfile(objective) {
    if (objective === 'latency') {
      return {
        parallelCap: 2,
        loadedCap: 1,
        keepAlive: '30m'
      };
    }

    if (objective === 'throughput') {
      return {
        parallelCap: 6,
        loadedCap: 3,
        keepAlive: '10m'
      };
    }

    return {
      parallelCap: 3,
      loadedCap: 2,
      keepAlive: '15m'
    };
  }

  /**
   * Estimate parameter count (in billions) from `size` ("8B"), the model name
   * ("llama3-8b"), or the quantized file size; falls back to 7B.
   */
  estimateParamsB(model = {}) {
    const sizeMatch = String(model.size || '').match(/(\d+(?:\.\d+)?)\s*b/i);
    if (sizeMatch) {
      return this.toFiniteNumber(sizeMatch[1], 0);
    }

    const nameMatch = String(model.name || '').match(/(\d+(?:\.\d+)?)\s*b\b/i);
    if (nameMatch) {
      return this.toFiniteNumber(nameMatch[1], 0);
    }

    // Approximate from quantized model file size (Q4 ~0.65 GB per 1B params)
    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB / 0.65;
    }

    return 7; // conservative fallback
  }

  /** Estimate resident memory (GB) for one loaded model, excluding KV cache. */
  estimateBaseMemoryGB(model = {}) {
    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB + this.modelOverheadGB;
    }

    const paramsB = this.estimateParamsB(model);
    return paramsB * 0.65 + this.modelOverheadGB;
  }

  /** KV-cache cost (GB) for one request at `contextTokens` on a `paramsB`-sized model. */
  estimateKVCacheGB(paramsB, contextTokens) {
    const ctx = this.toFiniteNumber(contextTokens, this.minContext);
    return paramsB * this.kvFactorPer4k * (ctx / 4096);
  }

  /**
   * Normalize raw model records into {name, size, fileSizeGB, paramsB,
   * baseMemoryGB}, sorted heaviest-first so planning is conservative.
   */
  normalizeModels(models = []) {
    const normalized = models
      .filter((model) => model && model.name)
      .map((model) => {
        const paramsB = this.estimateParamsB(model);
        const baseMemoryGB = this.estimateBaseMemoryGB(model);
        const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, Math.max(0, baseMemoryGB - this.modelOverheadGB));
        return {
          name: model.name,
          size: model.size || `${Math.round(paramsB)}B`,
          fileSizeGB: Math.round(fileSizeGB * 10) / 10,
          paramsB: Math.round(paramsB * 10) / 10,
          baseMemoryGB: Math.round(baseMemoryGB * 100) / 100
        };
      });

    // Heaviest first to keep planning conservative
    normalized.sort((a, b) => b.baseMemoryGB - a.baseMemoryGB);
    return normalized;
  }

  /**
   * Derive the usable memory budget from detected hardware.
   * Capacity preference: effectiveMemory > VRAM > 70% of system RAM > 70% of
   * reported total. The reserve is subtracted, with a 2GB floor on the budget.
   *
   * @param {object} hardware - Detector output; only `summary` and
   *   `memory.total` are read here.
   * @param {?number} reserveGB - Explicit reserve, or null/undefined for default.
   */
  resolveHardwareBudget(hardware = {}, reserveGB = null) {
    const summary = hardware.summary || {};
    // Bug fix: Number(null) === 0 is finite, so the old
    // `toFiniteNumber(reserveGB, default)` silently turned a null reserve into
    // 0 GB instead of the documented default. Coalesce null/undefined first.
    const reserve = this.toFiniteNumber(reserveGB ?? this.defaultReserveGB, this.defaultReserveGB);

    const effectiveMemory = this.toFiniteNumber(summary.effectiveMemory, 0);
    const systemRAM = this.toFiniteNumber(summary.systemRAM, 0);
    const vram = this.toFiniteNumber(summary.totalVRAM, 0);
    const fallbackTotal = this.toFiniteNumber(hardware.memory?.total, 8);

    const rawCapacityGB = effectiveMemory || vram || (systemRAM > 0 ? systemRAM * 0.7 : 0) || fallbackTotal * 0.7;
    const memoryBudgetGB = Math.max(2, rawCapacityGB - reserve);

    return {
      backend: summary.bestBackend || 'cpu',
      backendName: summary.backendName || summary.bestBackend || 'CPU',
      rawCapacityGB: Math.round(rawCapacityGB * 10) / 10,
      reserveGB: Math.round(reserve * 10) / 10,
      memoryBudgetGB: Math.round(memoryBudgetGB * 10) / 10
    };
  }

  /**
   * Compute the memory state when the `loadedCount` heaviest models are
   * resident at `contextTokens`: base memory total, KV-cache cost (sized by
   * the largest loaded model), and how many parallel requests fit.
   */
  computeLoadState(models, contextTokens, loadedCount, budgetGB) {
    const activeModels = models.slice(0, loadedCount);
    const baseTotalGB = activeModels.reduce((sum, model) => sum + model.baseMemoryGB, 0);
    const maxParamsB = activeModels.reduce((max, model) => Math.max(max, model.paramsB), 0);
    const kvAtContextGB = this.estimateKVCacheGB(maxParamsB, contextTokens);
    const kvPerTokenGB = maxParamsB > 0 ? (maxParamsB * this.kvFactorPer4k) / 4096 : 0;
    const availableForKVGB = budgetGB - baseTotalGB;

    let maxParallelAtContext = 0;
    if (kvAtContextGB <= 0) {
      // Zero KV cost (e.g. 0-param estimate): parallelism is only capped by policy.
      maxParallelAtContext = this.maxParallelCap;
    } else if (availableForKVGB > 0) {
      maxParallelAtContext = Math.floor(availableForKVGB / kvAtContextGB);
    }

    return {
      activeModels,
      baseTotalGB,
      maxParamsB,
      kvAtContextGB,
      kvPerTokenGB,
      availableForKVGB,
      maxParallelAtContext
    };
  }

  /**
   * Largest number of resident models (<= hardCap) whose base memory plus
   * `parallel` KV caches at `contextTokens` still fits within `budgetGB`.
   * Always returns at least 1.
   */
  maxLoadedModelsFor(models, contextTokens, parallel, budgetGB, hardCap) {
    const cap = Math.max(1, Math.min(hardCap, models.length));
    let best = 1;
    for (let i = 1; i <= cap; i += 1) {
      const state = this.computeLoadState(models, contextTokens, i, budgetGB);
      const estimatedTotal = state.baseTotalGB + (state.kvAtContextGB * parallel);
      if (estimatedTotal <= budgetGB) {
        best = i;
      } else {
        break; // models are sorted heaviest-first, so no larger count can fit
      }
    }
    return best;
  }

  /**
   * Score (0-100) and bucket the risk of the requested vs recommended
   * settings: overage past the budget dominates, recommended utilization and
   * a flat "requested does not fit" penalty contribute the rest.
   */
  calculateRiskLevel({
    budgetGB,
    requestedTotalGB,
    recommendedTotalGB,
    requestedFits
  }) {
    const safeBudget = Math.max(0.1, budgetGB);
    const requestedUtil = requestedTotalGB / safeBudget;
    const recommendedUtil = recommendedTotalGB / safeBudget;
    const overage = Math.max(0, requestedTotalGB - safeBudget) / safeBudget;

    const score = Math.min(
      100,
      Math.round((overage * 100) + (recommendedUtil * 55) + (requestedFits ? 0 : 20))
    );

    let level = 'low';
    if (score >= 75) level = 'critical';
    else if (score >= 55) level = 'high';
    else if (score >= 35) level = 'medium';

    return { level, score };
  }

  /**
   * Build the full capacity plan.
   *
   * @param {object} args
   * @param {object} args.hardware - Detector output (see resolveHardwareBudget).
   * @param {Array<object>} args.models - Installed models; must be non-empty.
   * @param {number} [args.targetContext=8192] - Desired context window (tokens).
   * @param {number} [args.targetConcurrency=2] - Desired parallel requests.
   * @param {string} [args.objective='balanced'] - latency | balanced | throughput.
   * @param {?number} [args.reserveGB=null] - Memory reserve in GB; null uses the default.
   * @returns {object} Plan with envelope, recommendation, memory, risk,
   *   fallback, shell env vars, and human-readable notes.
   * @throws {Error} If no usable model is supplied.
   */
  plan({
    hardware,
    models,
    targetContext = 8192,
    targetConcurrency = 2,
    objective = 'balanced',
    reserveGB = null
  }) {
    const normalizedObjective = this.normalizeObjective(objective);
    const profile = this.objectiveProfile(normalizedObjective);
    const modelPool = this.normalizeModels(models);

    if (modelPool.length === 0) {
      throw new Error('At least one model is required for planning.');
    }

    // Sanitize requested knobs into supported ranges.
    const requestedCtx = this.clamp(
      Math.round(this.toFiniteNumber(targetContext, 8192)),
      512,
      131072
    );
    const requestedConcurrency = this.clamp(
      Math.round(this.toFiniteNumber(targetConcurrency, 2)),
      1,
      64
    );

    const hardwareBudget = this.resolveHardwareBudget(hardware, reserveGB);
    const budgetGB = hardwareBudget.memoryBudgetGB;

    const desiredLoaded = Math.max(1, Math.min(profile.loadedCap, modelPool.length));
    let loadedModels = desiredLoaded;

    // Ensure the base model memory is feasible: shed resident models until
    // there is headroom left for at least some KV cache.
    while (loadedModels > 1) {
      const state = this.computeLoadState(modelPool, requestedCtx, loadedModels, budgetGB);
      if (state.availableForKVGB > 0) {
        break;
      }
      loadedModels -= 1;
    }

    const requestedState = this.computeLoadState(modelPool, requestedCtx, loadedModels, budgetGB);
    let recommendedCtx = requestedCtx;

    // If even a single request cannot run at the requested context, shrink the
    // context to the largest size that fits at parallel=1.
    if (requestedState.maxParallelAtContext < 1) {
      const ctxFitAtParallel1 = requestedState.kvPerTokenGB > 0
        ? Math.floor(requestedState.availableForKVGB / requestedState.kvPerTokenGB)
        : requestedCtx;
      recommendedCtx = this.clamp(
        Math.max(this.minContext, Math.min(requestedCtx, ctxFitAtParallel1 || this.minContext)),
        this.minContext,
        requestedCtx
      );
    }

    let recommendedState = this.computeLoadState(modelPool, recommendedCtx, loadedModels, budgetGB);
    if (recommendedState.maxParallelAtContext < 1) {
      // Still infeasible: drop to the absolute minimum context and re-evaluate.
      recommendedCtx = this.minContext;
      recommendedState = this.computeLoadState(modelPool, recommendedCtx, loadedModels, budgetGB);
    }

    // Parallelism is bounded by the request, the objective, the hard cap, and
    // what physically fits in memory at the recommended context.
    const recommendedParallel = Math.max(
      1,
      Math.min(
        requestedConcurrency,
        profile.parallelCap,
        this.maxParallelCap,
        Math.max(1, recommendedState.maxParallelAtContext)
      )
    );

    const recommendedLoaded = this.maxLoadedModelsFor(
      modelPool,
      recommendedCtx,
      recommendedParallel,
      budgetGB,
      profile.loadedCap
    );

    // Recompute state after final loaded model selection.
    recommendedState = this.computeLoadState(modelPool, recommendedCtx, recommendedLoaded, budgetGB);

    const maxCtxParallel1 = recommendedState.kvPerTokenGB > 0
      ? Math.floor(recommendedState.availableForKVGB / recommendedState.kvPerTokenGB)
      : requestedCtx;
    const maxCtxAtRecommendedParallel = recommendedState.kvPerTokenGB > 0
      ? Math.floor(recommendedState.availableForKVGB / (recommendedState.kvPerTokenGB * recommendedParallel))
      : requestedCtx;

    // Never recommend a context larger than what fits at the chosen parallelism.
    if (maxCtxAtRecommendedParallel > 0) {
      recommendedCtx = this.clamp(
        Math.min(recommendedCtx, maxCtxAtRecommendedParallel),
        this.minContext,
        requestedCtx
      );
    }

    recommendedState = this.computeLoadState(modelPool, recommendedCtx, recommendedLoaded, budgetGB);
    const requestedTotalGB = requestedState.baseTotalGB + (requestedState.kvAtContextGB * requestedConcurrency);
    const recommendedTotalGB = recommendedState.baseTotalGB + (recommendedState.kvAtContextGB * recommendedParallel);
    const requestedFits = requestedTotalGB <= budgetGB;

    const risk = this.calculateRiskLevel({
      budgetGB,
      requestedTotalGB,
      recommendedTotalGB,
      requestedFits
    });

    // Flash attention is only disabled on pure-CPU backends.
    const flashAttention = hardwareBudget.backend === 'cpu' ? '0' : '1';
    const maxQueue = Math.max(4, recommendedParallel * 4);

    // Fallback profile: one model, one request, context capped at 4096.
    const fallbackCtx = this.clamp(Math.min(4096, recommendedCtx), this.minContext, recommendedCtx);
    const fallbackState = this.computeLoadState(modelPool, fallbackCtx, 1, budgetGB);
    const fallbackTotalGB = fallbackState.baseTotalGB + fallbackState.kvAtContextGB;

    const notes = [];
    if (!requestedFits) {
      notes.push('Requested settings exceed available memory budget; reduced settings are recommended.');
    }
    if (recommendedCtx < requestedCtx) {
      notes.push(`Context reduced from ${requestedCtx} to ${recommendedCtx} to avoid memory pressure.`);
    }
    if (recommendedParallel < requestedConcurrency) {
      notes.push(`Parallelism reduced from ${requestedConcurrency} to ${recommendedParallel} to keep memory stable.`);
    }
    if (recommendedLoaded < desiredLoaded) {
      notes.push(`Loaded models capped at ${recommendedLoaded} for this objective and memory budget.`);
    }

    return {
      objective: normalizedObjective,
      inputs: {
        targetContext: requestedCtx,
        targetConcurrency: requestedConcurrency
      },
      hardware: hardwareBudget,
      models: recommendedState.activeModels.map((model) => ({
        name: model.name,
        size: model.size,
        fileSizeGB: model.fileSizeGB,
        paramsB: model.paramsB,
        estimatedBaseMemoryGB: Math.round(model.baseMemoryGB * 100) / 100
      })),
      envelope: {
        context: {
          requested: requestedCtx,
          recommended: recommendedCtx,
          min_safe: this.minContext,
          max_for_parallel_1: Math.max(0, maxCtxParallel1 || 0),
          max_for_recommended_parallel: Math.max(0, maxCtxAtRecommendedParallel || 0)
        },
        parallel: {
          requested: requestedConcurrency,
          recommended: recommendedParallel,
          max_at_requested_ctx: Math.max(0, requestedState.maxParallelAtContext)
        },
        loaded_models: {
          requested: desiredLoaded,
          recommended: recommendedLoaded,
          max_at_recommended_settings: this.maxLoadedModelsFor(
            modelPool,
            recommendedCtx,
            recommendedParallel,
            budgetGB,
            modelPool.length
          )
        }
      },
      recommendation: {
        num_ctx: recommendedCtx,
        num_parallel: recommendedParallel,
        max_loaded_models: recommendedLoaded,
        max_queue: maxQueue,
        keep_alive: profile.keepAlive,
        flash_attention: flashAttention
      },
      memory: {
        budgetGB: Math.round(budgetGB * 100) / 100,
        requestedEstimatedGB: Math.round(requestedTotalGB * 100) / 100,
        recommendedEstimatedGB: Math.round(recommendedTotalGB * 100) / 100,
        utilizationPercent: Math.round((recommendedTotalGB / Math.max(0.1, budgetGB)) * 100)
      },
      risk,
      fallback: {
        num_ctx: fallbackCtx,
        num_parallel: 1,
        max_loaded_models: 1,
        estimated_memory_gb: Math.round(fallbackTotalGB * 100) / 100
      },
      shell: {
        env: {
          OLLAMA_NUM_CTX: String(recommendedCtx),
          OLLAMA_NUM_PARALLEL: String(recommendedParallel),
          OLLAMA_MAX_LOADED_MODELS: String(recommendedLoaded),
          OLLAMA_MAX_QUEUE: String(maxQueue),
          OLLAMA_KEEP_ALIVE: profile.keepAlive,
          OLLAMA_FLASH_ATTENTION: flashAttention
        }
      },
      notes
    };
  }
}
|
|
398
|
+
|
|
399
|
+
module.exports = OllamaCapacityPlanner;
|