llm-checker 3.2.8 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,399 @@
1
/**
 * Heuristic capacity planner that turns a hardware summary and a list of
 * models into recommended Ollama runtime settings (context size, parallelism,
 * number of loaded models) together with a memory estimate and a risk score.
 *
 * All memory figures are estimates in GB derived from simple linear rules:
 *   - base model memory  = file size (or paramsB * 0.65) + modelOverheadGB
 *   - KV cache per slot  = paramsB * kvFactorPer4k * (contextTokens / 4096)
 */
class OllamaCapacityPlanner {
    /**
     * @param {object} [options]
     * @param {number} [options.minContext=2048] Smallest context the planner will recommend.
     * @param {number} [options.maxParallelCap=8] Hard upper bound on recommended parallelism.
     * @param {number} [options.defaultReserveGB=2] Memory held back when no reserve is supplied (0 allowed).
     * @param {number} [options.kvFactorPer4k=0.08] KV cache GB per 1B params at 4k context.
     * @param {number} [options.modelOverheadGB=0.7] Fixed per-model overhead added to base memory (0 allowed).
     */
    constructor(options = {}) {
        const opts = options ?? {};

        // Resolve a numeric option. Missing or non-numeric values use the
        // fallback; zero is only honoured where it is a meaningful setting.
        const pick = (value, fallback, allowZero = false) => {
            const numeric = this.toFiniteNumber(value, NaN);
            if (!Number.isFinite(numeric)) {
                return fallback;
            }
            if (numeric === 0 && !allowZero) {
                return fallback;
            }
            return numeric;
        };

        this.minContext = pick(opts.minContext, 2048);
        this.maxParallelCap = pick(opts.maxParallelCap, 8);
        this.defaultReserveGB = pick(opts.defaultReserveGB, 2, true);
        this.kvFactorPer4k = pick(opts.kvFactorPer4k, 0.08); // GB per 1B params at 4k ctx
        this.modelOverheadGB = pick(opts.modelOverheadGB, 0.7, true);
    }

    /**
     * Coerce a value to a finite number.
     *
     * `null`, `undefined` and blank strings are treated as "not provided" and
     * yield the fallback; without this guard `Number(null)` and `Number('')`
     * would both silently become 0 and mask the caller's default.
     *
     * @param {*} value Value to coerce.
     * @param {number} [fallback=0] Returned when the value is missing or not finite.
     * @returns {number}
     */
    toFiniteNumber(value, fallback = 0) {
        if (value === null || value === undefined) {
            return fallback;
        }
        if (typeof value === 'string' && value.trim() === '') {
            return fallback;
        }
        const numeric = Number(value);
        return Number.isFinite(numeric) ? numeric : fallback;
    }

    /**
     * Restrict a value to the inclusive range [min, max].
     * When min > max the upper bound wins.
     *
     * @param {number} value
     * @param {number} min
     * @param {number} max
     * @returns {number}
     */
    clamp(value, min, max) {
        return Math.min(max, Math.max(min, value));
    }

    /**
     * Normalize an objective name to one of 'latency', 'throughput' or
     * 'balanced' (case-insensitive); anything else becomes 'balanced'.
     *
     * @param {*} objective
     * @returns {'latency'|'throughput'|'balanced'}
     */
    normalizeObjective(objective) {
        const normalized = String(objective || 'balanced').toLowerCase();
        if (normalized === 'latency' || normalized === 'throughput' || normalized === 'balanced') {
            return normalized;
        }
        return 'balanced';
    }

    /**
     * Tuning caps associated with a (normalized) objective.
     *
     * @param {string} objective
     * @returns {{parallelCap: number, loadedCap: number, keepAlive: string}}
     */
    objectiveProfile(objective) {
        if (objective === 'latency') {
            return {
                parallelCap: 2,
                loadedCap: 1,
                keepAlive: '30m'
            };
        }

        if (objective === 'throughput') {
            return {
                parallelCap: 6,
                loadedCap: 3,
                keepAlive: '10m'
            };
        }

        return {
            parallelCap: 3,
            loadedCap: 2,
            keepAlive: '15m'
        };
    }

    /**
     * Estimate a model's parameter count in billions.
     *
     * Tries, in order: a "<number>b" token in `model.size`, a "<number>b"
     * token in `model.name`, the file size divided by 0.65, and finally a
     * fixed fallback of 7.
     *
     * @param {{size?: string, name?: string, fileSizeGB?: number}} [model]
     * @returns {number}
     */
    estimateParamsB(model = {}) {
        const sizeMatch = String(model.size || '').match(/(\d+(?:\.\d+)?)\s*b/i);
        if (sizeMatch) {
            return this.toFiniteNumber(sizeMatch[1], 0);
        }

        const nameMatch = String(model.name || '').match(/(\d+(?:\.\d+)?)\s*b\b/i);
        if (nameMatch) {
            return this.toFiniteNumber(nameMatch[1], 0);
        }

        // Approximate from quantized model file size (Q4 ~0.65 GB per 1B params)
        const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
        if (fileSizeGB > 0) {
            return fileSizeGB / 0.65;
        }

        return 7; // conservative fallback
    }

    /**
     * Estimate resident memory for a loaded model, excluding KV cache:
     * the file size when known, otherwise paramsB * 0.65, plus the
     * configured per-model overhead.
     *
     * @param {{size?: string, name?: string, fileSizeGB?: number}} [model]
     * @returns {number} Estimated GB.
     */
    estimateBaseMemoryGB(model = {}) {
        const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
        if (fileSizeGB > 0) {
            return fileSizeGB + this.modelOverheadGB;
        }

        const paramsB = this.estimateParamsB(model);
        return paramsB * 0.65 + this.modelOverheadGB;
    }

    /**
     * Estimate KV cache size for one parallel slot, scaling linearly with
     * both parameter count and context length (relative to 4096 tokens).
     *
     * @param {number} paramsB Parameter count in billions.
     * @param {number} contextTokens Context length; falls back to minContext when missing.
     * @returns {number} Estimated GB.
     */
    estimateKVCacheGB(paramsB, contextTokens) {
        const ctx = this.toFiniteNumber(contextTokens, this.minContext);
        return paramsB * this.kvFactorPer4k * (ctx / 4096);
    }

    /**
     * Convert raw model descriptors into a uniform shape with estimated
     * sizes, dropping entries without a name. The result is sorted by
     * descending base memory.
     *
     * @param {Array<object>} [models] Non-array input is treated as an empty list.
     * @returns {Array<{name: string, size: string, fileSizeGB: number, paramsB: number, baseMemoryGB: number}>}
     */
    normalizeModels(models = []) {
        const pool = Array.isArray(models) ? models : [];
        const normalized = pool
            .filter((model) => model && model.name)
            .map((model) => {
                const paramsB = this.estimateParamsB(model);
                const baseMemoryGB = this.estimateBaseMemoryGB(model);
                // When the file size is unknown, back it out of the base estimate.
                const fileSizeGB = this.toFiniteNumber(
                    model.fileSizeGB,
                    Math.max(0, baseMemoryGB - this.modelOverheadGB)
                );
                return {
                    name: model.name,
                    size: model.size || `${Math.round(paramsB)}B`,
                    fileSizeGB: Math.round(fileSizeGB * 10) / 10,
                    paramsB: Math.round(paramsB * 10) / 10,
                    baseMemoryGB: Math.round(baseMemoryGB * 100) / 100
                };
            });

        // Heaviest first to keep planning conservative
        normalized.sort((a, b) => b.baseMemoryGB - a.baseMemoryGB);
        return normalized;
    }

    /**
     * Derive the usable memory budget from a hardware description.
     *
     * Capacity is taken from the first positive value among
     * `summary.effectiveMemory`, `summary.totalVRAM`, 70% of
     * `summary.systemRAM`, and 70% of `hardware.memory.total` (default 8).
     * The reserve is subtracted and the budget is floored at 2 GB.
     *
     * @param {object} [hardware]
     * @param {?number} [reserveGB=null] Memory to hold back; null/undefined uses defaultReserveGB.
     * @returns {{backend: string, backendName: string, rawCapacityGB: number, reserveGB: number, memoryBudgetGB: number}}
     */
    resolveHardwareBudget(hardware = {}, reserveGB = null) {
        const summary = hardware?.summary || {};
        const reserve = this.toFiniteNumber(reserveGB, this.defaultReserveGB);

        const effectiveMemory = this.toFiniteNumber(summary.effectiveMemory, 0);
        const systemRAM = this.toFiniteNumber(summary.systemRAM, 0);
        const vram = this.toFiniteNumber(summary.totalVRAM, 0);
        const fallbackTotal = this.toFiniteNumber(hardware?.memory?.total, 8);

        const rawCapacityGB = effectiveMemory || vram || (systemRAM > 0 ? systemRAM * 0.7 : 0) || fallbackTotal * 0.7;
        const memoryBudgetGB = Math.max(2, rawCapacityGB - reserve);

        return {
            backend: summary.bestBackend || 'cpu',
            backendName: summary.backendName || summary.bestBackend || 'CPU',
            rawCapacityGB: Math.round(rawCapacityGB * 10) / 10,
            reserveGB: Math.round(reserve * 10) / 10,
            memoryBudgetGB: Math.round(memoryBudgetGB * 10) / 10
        };
    }

    /**
     * Compute memory figures for loading the first `loadedCount` models of
     * the (already sorted) list at a given context size.
     *
     * KV cache is sized from the largest active model. `maxParallelAtContext`
     * is how many KV slots fit in what remains after base memory; it is 0
     * when nothing remains and `maxParallelCap` when KV cost is zero.
     *
     * @param {Array<object>} models Normalized models.
     * @param {number} contextTokens
     * @param {number} loadedCount
     * @param {number} budgetGB
     * @returns {{activeModels: Array<object>, baseTotalGB: number, maxParamsB: number, kvAtContextGB: number, kvPerTokenGB: number, availableForKVGB: number, maxParallelAtContext: number}}
     */
    computeLoadState(models, contextTokens, loadedCount, budgetGB) {
        const activeModels = models.slice(0, loadedCount);
        const baseTotalGB = activeModels.reduce((sum, model) => sum + model.baseMemoryGB, 0);
        const maxParamsB = activeModels.reduce((max, model) => Math.max(max, model.paramsB), 0);
        const kvAtContextGB = this.estimateKVCacheGB(maxParamsB, contextTokens);
        const kvPerTokenGB = maxParamsB > 0 ? (maxParamsB * this.kvFactorPer4k) / 4096 : 0;
        const availableForKVGB = budgetGB - baseTotalGB;

        let maxParallelAtContext = 0;
        if (kvAtContextGB <= 0) {
            maxParallelAtContext = this.maxParallelCap;
        } else if (availableForKVGB > 0) {
            maxParallelAtContext = Math.floor(availableForKVGB / kvAtContextGB);
        }

        return {
            activeModels,
            baseTotalGB,
            maxParamsB,
            kvAtContextGB,
            kvPerTokenGB,
            availableForKVGB,
            maxParallelAtContext
        };
    }

    /**
     * Largest number of models (taken from the front of the list) whose
     * estimated total memory fits the budget at the given context and
     * parallelism. Always returns at least 1, even if a single model does
     * not fit.
     *
     * @param {Array<object>} models Normalized models.
     * @param {number} contextTokens
     * @param {number} parallel
     * @param {number} budgetGB
     * @param {number} hardCap Upper bound on the answer.
     * @returns {number}
     */
    maxLoadedModelsFor(models, contextTokens, parallel, budgetGB, hardCap) {
        const cap = Math.max(1, Math.min(hardCap, models.length));
        let best = 1;
        for (let i = 1; i <= cap; i += 1) {
            const state = this.computeLoadState(models, contextTokens, i, budgetGB);
            const estimatedTotal = state.baseTotalGB + (state.kvAtContextGB * parallel);
            if (estimatedTotal <= budgetGB) {
                best = i;
            } else {
                // Adding models only increases the total, so stop at the first miss.
                break;
            }
        }
        return best;
    }

    /**
     * Score memory risk on a 0-100 scale and bucket it into a level.
     *
     * score = overage% of the requested settings
     *       + 55 * utilization of the recommended settings
     *       + 20 if the requested settings do not fit.
     * Levels: >=75 critical, >=55 high, >=35 medium, otherwise low.
     *
     * @param {{budgetGB: number, requestedTotalGB: number, recommendedTotalGB: number, requestedFits: boolean}} params
     * @returns {{level: 'low'|'medium'|'high'|'critical', score: number}}
     */
    calculateRiskLevel({
        budgetGB,
        requestedTotalGB,
        recommendedTotalGB,
        requestedFits
    }) {
        // Guard against division by zero for degenerate budgets.
        const safeBudget = Math.max(0.1, budgetGB);
        const recommendedUtil = recommendedTotalGB / safeBudget;
        const overage = Math.max(0, requestedTotalGB - safeBudget) / safeBudget;

        const score = Math.min(
            100,
            Math.round((overage * 100) + (recommendedUtil * 55) + (requestedFits ? 0 : 20))
        );

        let level = 'low';
        if (score >= 75) level = 'critical';
        else if (score >= 55) level = 'high';
        else if (score >= 35) level = 'medium';

        return { level, score };
    }

    /**
     * Build a capacity plan.
     *
     * @param {object} [params]
     * @param {object} [params.hardware] Hardware description passed to resolveHardwareBudget.
     * @param {Array<object>} [params.models] Candidate models; at least one named model is required.
     * @param {number} [params.targetContext=8192] Requested context, clamped to [512, 131072].
     * @param {number} [params.targetConcurrency=2] Requested parallelism, clamped to [1, 64].
     * @param {string} [params.objective='balanced'] 'latency', 'throughput' or 'balanced'.
     * @param {?number} [params.reserveGB=null] Memory to hold back; null uses defaultReserveGB.
     * @returns {object} Plan with `objective`, `inputs`, `hardware`, `models`, `envelope`,
     *   `recommendation`, `memory`, `risk`, `fallback`, `shell.env` and `notes`.
     * @throws {Error} When no usable model is supplied.
     */
    plan({
        hardware,
        models,
        targetContext = 8192,
        targetConcurrency = 2,
        objective = 'balanced',
        reserveGB = null
    } = {}) {
        const normalizedObjective = this.normalizeObjective(objective);
        const profile = this.objectiveProfile(normalizedObjective);
        const modelPool = this.normalizeModels(models);

        if (modelPool.length === 0) {
            throw new Error('At least one model is required for planning.');
        }

        const requestedCtx = this.clamp(
            Math.round(this.toFiniteNumber(targetContext, 8192)),
            512,
            131072
        );
        const requestedConcurrency = this.clamp(
            Math.round(this.toFiniteNumber(targetConcurrency, 2)),
            1,
            64
        );

        const hardwareBudget = this.resolveHardwareBudget(hardware, reserveGB);
        const budgetGB = hardwareBudget.memoryBudgetGB;

        const desiredLoaded = Math.max(1, Math.min(profile.loadedCap, modelPool.length));
        let loadedModels = desiredLoaded;

        // Ensure the base model memory is feasible.
        while (loadedModels > 1) {
            const state = this.computeLoadState(modelPool, requestedCtx, loadedModels, budgetGB);
            if (state.availableForKVGB > 0) {
                break;
            }
            loadedModels -= 1;
        }

        const requestedState = this.computeLoadState(modelPool, requestedCtx, loadedModels, budgetGB);
        let recommendedCtx = requestedCtx;

        // Not even one slot fits at the requested context: shrink the context
        // to what a single slot can afford, bounded by [minContext, requestedCtx].
        if (requestedState.maxParallelAtContext < 1) {
            const ctxFitAtParallel1 = requestedState.kvPerTokenGB > 0
                ? Math.floor(requestedState.availableForKVGB / requestedState.kvPerTokenGB)
                : requestedCtx;
            recommendedCtx = this.clamp(
                Math.max(this.minContext, Math.min(requestedCtx, ctxFitAtParallel1 || this.minContext)),
                this.minContext,
                requestedCtx
            );
        }

        let recommendedState = this.computeLoadState(modelPool, recommendedCtx, loadedModels, budgetGB);
        if (recommendedState.maxParallelAtContext < 1) {
            // Still does not fit: drop to the minimum context, but never
            // recommend more context than the caller asked for.
            recommendedCtx = Math.min(this.minContext, requestedCtx);
            recommendedState = this.computeLoadState(modelPool, recommendedCtx, loadedModels, budgetGB);
        }

        const recommendedParallel = Math.max(
            1,
            Math.min(
                requestedConcurrency,
                profile.parallelCap,
                this.maxParallelCap,
                Math.max(1, recommendedState.maxParallelAtContext)
            )
        );

        const recommendedLoaded = this.maxLoadedModelsFor(
            modelPool,
            recommendedCtx,
            recommendedParallel,
            budgetGB,
            profile.loadedCap
        );

        // Recompute state after final loaded model selection.
        recommendedState = this.computeLoadState(modelPool, recommendedCtx, recommendedLoaded, budgetGB);

        const maxCtxParallel1 = recommendedState.kvPerTokenGB > 0
            ? Math.floor(recommendedState.availableForKVGB / recommendedState.kvPerTokenGB)
            : requestedCtx;
        const maxCtxAtRecommendedParallel = recommendedState.kvPerTokenGB > 0
            ? Math.floor(recommendedState.availableForKVGB / (recommendedState.kvPerTokenGB * recommendedParallel))
            : requestedCtx;

        // Tighten the context to what the chosen parallelism can afford.
        if (maxCtxAtRecommendedParallel > 0) {
            recommendedCtx = this.clamp(
                Math.min(recommendedCtx, maxCtxAtRecommendedParallel),
                this.minContext,
                requestedCtx
            );
        }

        recommendedState = this.computeLoadState(modelPool, recommendedCtx, recommendedLoaded, budgetGB);
        const requestedTotalGB = requestedState.baseTotalGB + (requestedState.kvAtContextGB * requestedConcurrency);
        const recommendedTotalGB = recommendedState.baseTotalGB + (recommendedState.kvAtContextGB * recommendedParallel);
        const requestedFits = requestedTotalGB <= budgetGB;

        const risk = this.calculateRiskLevel({
            budgetGB,
            requestedTotalGB,
            recommendedTotalGB,
            requestedFits
        });

        const flashAttention = hardwareBudget.backend === 'cpu' ? '0' : '1';
        const maxQueue = Math.max(4, recommendedParallel * 4);

        // Fallback profile: a single model, a single slot, context capped at 4096.
        const fallbackCtx = this.clamp(Math.min(4096, recommendedCtx), this.minContext, recommendedCtx);
        const fallbackState = this.computeLoadState(modelPool, fallbackCtx, 1, budgetGB);
        const fallbackTotalGB = fallbackState.baseTotalGB + fallbackState.kvAtContextGB;

        const notes = [];
        if (!requestedFits) {
            notes.push('Requested settings exceed available memory budget; reduced settings are recommended.');
        }
        if (recommendedCtx < requestedCtx) {
            notes.push(`Context reduced from ${requestedCtx} to ${recommendedCtx} to avoid memory pressure.`);
        }
        if (recommendedParallel < requestedConcurrency) {
            notes.push(`Parallelism reduced from ${requestedConcurrency} to ${recommendedParallel} to keep memory stable.`);
        }
        if (recommendedLoaded < desiredLoaded) {
            notes.push(`Loaded models capped at ${recommendedLoaded} for this objective and memory budget.`);
        }

        return {
            objective: normalizedObjective,
            inputs: {
                targetContext: requestedCtx,
                targetConcurrency: requestedConcurrency
            },
            hardware: hardwareBudget,
            models: recommendedState.activeModels.map((model) => ({
                name: model.name,
                size: model.size,
                fileSizeGB: model.fileSizeGB,
                paramsB: model.paramsB,
                estimatedBaseMemoryGB: Math.round(model.baseMemoryGB * 100) / 100
            })),
            envelope: {
                context: {
                    requested: requestedCtx,
                    recommended: recommendedCtx,
                    min_safe: this.minContext,
                    max_for_parallel_1: Math.max(0, maxCtxParallel1 || 0),
                    max_for_recommended_parallel: Math.max(0, maxCtxAtRecommendedParallel || 0)
                },
                parallel: {
                    requested: requestedConcurrency,
                    recommended: recommendedParallel,
                    max_at_requested_ctx: Math.max(0, requestedState.maxParallelAtContext)
                },
                loaded_models: {
                    requested: desiredLoaded,
                    recommended: recommendedLoaded,
                    max_at_recommended_settings: this.maxLoadedModelsFor(
                        modelPool,
                        recommendedCtx,
                        recommendedParallel,
                        budgetGB,
                        modelPool.length
                    )
                }
            },
            recommendation: {
                num_ctx: recommendedCtx,
                num_parallel: recommendedParallel,
                max_loaded_models: recommendedLoaded,
                max_queue: maxQueue,
                keep_alive: profile.keepAlive,
                flash_attention: flashAttention
            },
            memory: {
                budgetGB: Math.round(budgetGB * 100) / 100,
                requestedEstimatedGB: Math.round(requestedTotalGB * 100) / 100,
                recommendedEstimatedGB: Math.round(recommendedTotalGB * 100) / 100,
                utilizationPercent: Math.round((recommendedTotalGB / Math.max(0.1, budgetGB)) * 100)
            },
            risk,
            fallback: {
                num_ctx: fallbackCtx,
                num_parallel: 1,
                max_loaded_models: 1,
                estimated_memory_gb: Math.round(fallbackTotalGB * 100) / 100
            },
            shell: {
                env: {
                    OLLAMA_NUM_CTX: String(recommendedCtx),
                    OLLAMA_NUM_PARALLEL: String(recommendedParallel),
                    OLLAMA_MAX_LOADED_MODELS: String(recommendedLoaded),
                    OLLAMA_MAX_QUEUE: String(maxQueue),
                    OLLAMA_KEEP_ALIVE: profile.keepAlive,
                    OLLAMA_FLASH_ATTENTION: flashAttention
                }
            },
            notes
        };
    }
}
398
+
399
+ module.exports = OllamaCapacityPlanner;