llm-checker 3.3.0 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -3
- package/bin/enhanced_cli.js +215 -2
- package/bin/mcp-server.mjs +423 -1
- package/package.json +1 -1
- package/src/hardware/backends/cuda-detector.js +26 -8
- package/src/ollama/capacity-planner.js +399 -0
package/README.md
CHANGED
|
@@ -133,9 +133,22 @@ llm-checker ai-run --calibrated --category coding --prompt "Refactor this functi
|
|
|
133
133
|
|
|
134
134
|
LLM Checker is published in all primary channels:
|
|
135
135
|
|
|
136
|
-
- npm (latest): [`llm-checker@latest`](https://www.npmjs.com/package/llm-checker)
|
|
136
|
+
- npm (latest, recommended): [`llm-checker@latest`](https://www.npmjs.com/package/llm-checker)
|
|
137
137
|
- GitHub Releases: [Release history](https://github.com/Pavelevich/llm-checker/releases)
|
|
138
|
-
- GitHub Packages: [`@pavelevich/llm-checker`](https://github.com/users/Pavelevich/packages/npm/package/llm-checker)
|
|
138
|
+
- GitHub Packages (legacy mirror, may lag): [`@pavelevich/llm-checker`](https://github.com/users/Pavelevich/packages/npm/package/llm-checker)
|
|
139
|
+
|
|
140
|
+
### Important: Use npm for Latest Builds
|
|
141
|
+
|
|
142
|
+
If you need the newest release, install from npm (`llm-checker`), not the scoped GitHub Packages mirror.
|
|
143
|
+
|
|
144
|
+
If you installed `@pavelevich/llm-checker` and version looks old:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
npm uninstall -g @pavelevich/llm-checker
|
|
148
|
+
npm install -g llm-checker@latest
|
|
149
|
+
hash -r
|
|
150
|
+
llm-checker --version
|
|
151
|
+
```
|
|
139
152
|
|
|
140
153
|
### v3.3.0 Highlights
|
|
141
154
|
|
|
@@ -148,7 +161,9 @@ LLM Checker is published in all primary channels:
|
|
|
148
161
|
- Hardened Jetson CUDA detection to avoid false CPU-only fallback.
|
|
149
162
|
- Documentation reorganized under `docs/` with clearer onboarding paths.
|
|
150
163
|
|
|
151
|
-
### Optional: Install from GitHub Packages
|
|
164
|
+
### Optional (Legacy): Install from GitHub Packages
|
|
165
|
+
|
|
166
|
+
Use this only if you explicitly need GitHub Packages. It may not match npm latest.
|
|
152
167
|
|
|
153
168
|
```bash
|
|
154
169
|
# 1) Configure registry + token (PAT with read:packages)
|
|
@@ -261,6 +276,11 @@ Once connected, Claude can use these tools:
|
|
|
261
276
|
| `installed` | Rank your already-downloaded Ollama models |
|
|
262
277
|
| `search` | Search the Ollama model catalog with filters |
|
|
263
278
|
| `smart_recommend` | Advanced recommendations using the full scoring engine |
|
|
279
|
+
| `ollama_plan` | Build a capacity plan for local models with recommended context/parallel/memory settings |
|
|
280
|
+
| `ollama_plan_env` | Return ready-to-paste `export ...` env vars from the recommended or fallback plan profile |
|
|
281
|
+
| `policy_validate` | Validate a policy file against the v1 schema and return structured validation output |
|
|
282
|
+
| `audit_export` | Run policy compliance export (`json`/`csv`/`sarif`/`all`) for `check` or `recommend` flows |
|
|
283
|
+
| `calibrate` | Generate calibration artifacts from a prompt suite with typed MCP inputs |
|
|
264
284
|
|
|
265
285
|
**Ollama Management:**
|
|
266
286
|
|
|
@@ -281,6 +301,8 @@ Once connected, Claude can use these tools:
|
|
|
281
301
|
| `cleanup_models` | Analyze installed models — find redundancies, cloud-only models, oversized models, and upgrade candidates |
|
|
282
302
|
| `project_recommend` | Scan a project directory (languages, frameworks, size) and recommend the best model for that codebase |
|
|
283
303
|
| `ollama_monitor` | Real-time system status: RAM usage, loaded models, memory headroom analysis |
|
|
304
|
+
| `cli_help` | List all allowlisted CLI commands exposed through MCP |
|
|
305
|
+
| `cli_exec` | Execute any allowlisted `llm-checker` CLI command with custom args (policy/audit/calibrate/sync/ai-run/etc.) |
|
|
284
306
|
|
|
285
307
|
### Example Prompts
|
|
286
308
|
|
|
@@ -309,6 +331,7 @@ Claude will automatically call the right tools and give you actionable results.
|
|
|
309
331
|
| `recommend` | Intelligent recommendations by category (coding, reasoning, multimodal, etc.) |
|
|
310
332
|
| `calibrate` | Generate calibration result + routing policy artifacts from a JSONL prompt suite |
|
|
311
333
|
| `installed` | Rank your installed Ollama models by compatibility |
|
|
334
|
+
| `ollama-plan` | Compute safe Ollama runtime env vars (`NUM_CTX`, `NUM_PARALLEL`, `MAX_LOADED_MODELS`) for selected local models |
|
|
312
335
|
|
|
313
336
|
### Advanced Commands (require `sql.js`)
|
|
314
337
|
|
package/bin/enhanced_cli.js
CHANGED
|
@@ -591,6 +591,80 @@ async function checkOllamaAndExit() {
|
|
|
591
591
|
}
|
|
592
592
|
}
|
|
593
593
|
|
|
594
|
+
function parsePositiveIntegerOption(rawValue, optionName) {
|
|
595
|
+
const parsed = Number(rawValue);
|
|
596
|
+
if (!Number.isFinite(parsed) || parsed <= 0) {
|
|
597
|
+
throw new Error(`Invalid ${optionName}: ${rawValue}`);
|
|
598
|
+
}
|
|
599
|
+
return Math.round(parsed);
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
function parseNonNegativeNumberOption(rawValue, optionName) {
|
|
603
|
+
const parsed = Number(rawValue);
|
|
604
|
+
if (!Number.isFinite(parsed) || parsed < 0) {
|
|
605
|
+
throw new Error(`Invalid ${optionName}: ${rawValue}`);
|
|
606
|
+
}
|
|
607
|
+
return parsed;
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
function selectModelsForPlan(installedModels, requestedModels = []) {
|
|
611
|
+
const requested = Array.isArray(requestedModels)
|
|
612
|
+
? requestedModels.map((model) => String(model || '').trim()).filter(Boolean)
|
|
613
|
+
: [];
|
|
614
|
+
|
|
615
|
+
if (!requested.length) {
|
|
616
|
+
return {
|
|
617
|
+
selected: installedModels.slice(),
|
|
618
|
+
missing: []
|
|
619
|
+
};
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
const selected = [];
|
|
623
|
+
const missing = [];
|
|
624
|
+
const seen = new Set();
|
|
625
|
+
|
|
626
|
+
for (const request of requested) {
|
|
627
|
+
const normalized = request.toLowerCase();
|
|
628
|
+
|
|
629
|
+
let match = installedModels.find(
|
|
630
|
+
(model) => String(model.name || '').toLowerCase() === normalized
|
|
631
|
+
);
|
|
632
|
+
|
|
633
|
+
if (!match) {
|
|
634
|
+
match = installedModels.find((model) =>
|
|
635
|
+
String(model.name || '').toLowerCase().startsWith(`${normalized}:`)
|
|
636
|
+
);
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
if (!match) {
|
|
640
|
+
match = installedModels.find(
|
|
641
|
+
(model) => String(model.family || '').toLowerCase() === normalized
|
|
642
|
+
);
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
if (!match) {
|
|
646
|
+
match = installedModels.find((model) =>
|
|
647
|
+
String(model.name || '').toLowerCase().includes(normalized)
|
|
648
|
+
);
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
if (!match) {
|
|
652
|
+
missing.push(request);
|
|
653
|
+
continue;
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
if (!seen.has(match.name)) {
|
|
657
|
+
selected.push(match);
|
|
658
|
+
seen.add(match.name);
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
return {
|
|
663
|
+
selected,
|
|
664
|
+
missing
|
|
665
|
+
};
|
|
666
|
+
}
|
|
667
|
+
|
|
594
668
|
function getStatusIcon(model, ollamaModels) {
|
|
595
669
|
const ollamaModel = ollamaModels?.find(om => om.matchedModel?.name === model.name);
|
|
596
670
|
|
|
@@ -3042,6 +3116,145 @@ program
|
|
|
3042
3116
|
}
|
|
3043
3117
|
});
|
|
3044
3118
|
|
|
3119
|
+
program
|
|
3120
|
+
.command('ollama-plan')
|
|
3121
|
+
.description('Plan safe Ollama runtime settings for selected local models')
|
|
3122
|
+
.option('--models <models...>', 'Model tags/families to include (default: all local models)')
|
|
3123
|
+
.option('--ctx <tokens>', 'Target context window in tokens', '8192')
|
|
3124
|
+
.option('--concurrency <n>', 'Target parallel request count', '2')
|
|
3125
|
+
.option('--objective <mode>', 'Optimization objective (latency|balanced|throughput)', 'balanced')
|
|
3126
|
+
.option('--reserve-gb <gb>', 'Memory reserve for OS and background workloads', '2')
|
|
3127
|
+
.option('--json', 'Output plan as JSON')
|
|
3128
|
+
.action(async (options) => {
|
|
3129
|
+
const spinner = options.json ? null : ora('Building Ollama capacity plan...').start();
|
|
3130
|
+
|
|
3131
|
+
try {
|
|
3132
|
+
const requestedObjective = String(options.objective || 'balanced').toLowerCase();
|
|
3133
|
+
const supportedObjectives = new Set(['latency', 'balanced', 'throughput']);
|
|
3134
|
+
if (!supportedObjectives.has(requestedObjective)) {
|
|
3135
|
+
throw new Error(`Invalid objective "${options.objective}". Use latency, balanced, or throughput.`);
|
|
3136
|
+
}
|
|
3137
|
+
|
|
3138
|
+
const targetContext = parsePositiveIntegerOption(options.ctx, '--ctx');
|
|
3139
|
+
const targetConcurrency = parsePositiveIntegerOption(options.concurrency, '--concurrency');
|
|
3140
|
+
const reserveGB = parseNonNegativeNumberOption(options.reserveGb, '--reserve-gb');
|
|
3141
|
+
|
|
3142
|
+
const OllamaClient = require('../src/ollama/client');
|
|
3143
|
+
const UnifiedDetector = require('../src/hardware/unified-detector');
|
|
3144
|
+
const OllamaCapacityPlanner = require('../src/ollama/capacity-planner');
|
|
3145
|
+
|
|
3146
|
+
const ollamaClient = new OllamaClient();
|
|
3147
|
+
const availability = await ollamaClient.checkOllamaAvailability();
|
|
3148
|
+
if (!availability.available) {
|
|
3149
|
+
throw new Error(availability.error || 'Ollama is not available');
|
|
3150
|
+
}
|
|
3151
|
+
|
|
3152
|
+
const localModels = await ollamaClient.getLocalModels();
|
|
3153
|
+
if (!localModels || localModels.length === 0) {
|
|
3154
|
+
throw new Error('No local Ollama models found. Install one with: ollama pull llama3.2:3b');
|
|
3155
|
+
}
|
|
3156
|
+
|
|
3157
|
+
const { selected, missing } = selectModelsForPlan(localModels, options.models || []);
|
|
3158
|
+
if (selected.length === 0) {
|
|
3159
|
+
throw new Error(
|
|
3160
|
+
`No matching local models found for: ${(options.models || []).join(', ')}`
|
|
3161
|
+
);
|
|
3162
|
+
}
|
|
3163
|
+
|
|
3164
|
+
const detector = new UnifiedDetector();
|
|
3165
|
+
const hardware = await detector.detect();
|
|
3166
|
+
const planner = new OllamaCapacityPlanner();
|
|
3167
|
+
|
|
3168
|
+
const plan = planner.plan({
|
|
3169
|
+
hardware,
|
|
3170
|
+
models: selected,
|
|
3171
|
+
targetContext,
|
|
3172
|
+
targetConcurrency,
|
|
3173
|
+
objective: requestedObjective,
|
|
3174
|
+
reserveGB
|
|
3175
|
+
});
|
|
3176
|
+
|
|
3177
|
+
if (options.json) {
|
|
3178
|
+
console.log(JSON.stringify({
|
|
3179
|
+
generated_at: new Date().toISOString(),
|
|
3180
|
+
selection: {
|
|
3181
|
+
requested: options.models || [],
|
|
3182
|
+
selected: selected.map((model) => model.name),
|
|
3183
|
+
missing
|
|
3184
|
+
},
|
|
3185
|
+
plan
|
|
3186
|
+
}, null, 2));
|
|
3187
|
+
return;
|
|
3188
|
+
}
|
|
3189
|
+
|
|
3190
|
+
if (spinner) spinner.succeed('Capacity plan generated');
|
|
3191
|
+
|
|
3192
|
+
console.log('\n' + chalk.bgBlue.white.bold(' OLLAMA CAPACITY PLAN '));
|
|
3193
|
+
console.log(
|
|
3194
|
+
chalk.blue('Hardware:'),
|
|
3195
|
+
`${plan.hardware.backendName} (${plan.hardware.backend})`
|
|
3196
|
+
);
|
|
3197
|
+
console.log(
|
|
3198
|
+
chalk.blue('Memory budget:'),
|
|
3199
|
+
`${plan.memory.budgetGB}GB usable (reserve ${plan.hardware.reserveGB}GB)`
|
|
3200
|
+
);
|
|
3201
|
+
|
|
3202
|
+
if (missing.length > 0) {
|
|
3203
|
+
console.log(
|
|
3204
|
+
chalk.yellow('Missing model filters:'),
|
|
3205
|
+
missing.join(', ')
|
|
3206
|
+
);
|
|
3207
|
+
}
|
|
3208
|
+
|
|
3209
|
+
console.log(chalk.blue.bold('\nSelected models:'));
|
|
3210
|
+
for (const model of plan.models) {
|
|
3211
|
+
console.log(
|
|
3212
|
+
` - ${model.name} (${model.size}, ~${model.estimatedBaseMemoryGB}GB base)`
|
|
3213
|
+
);
|
|
3214
|
+
}
|
|
3215
|
+
|
|
3216
|
+
console.log(chalk.blue.bold('\nRecommended envelope:'));
|
|
3217
|
+
console.log(
|
|
3218
|
+
` Context: ${plan.envelope.context.recommended} (requested ${plan.envelope.context.requested})`
|
|
3219
|
+
);
|
|
3220
|
+
console.log(
|
|
3221
|
+
` Parallel: ${plan.envelope.parallel.recommended} (requested ${plan.envelope.parallel.requested})`
|
|
3222
|
+
);
|
|
3223
|
+
console.log(
|
|
3224
|
+
` Loaded models: ${plan.envelope.loaded_models.recommended} (requested ${plan.envelope.loaded_models.requested})`
|
|
3225
|
+
);
|
|
3226
|
+
console.log(
|
|
3227
|
+
` Estimated memory: ${plan.memory.recommendedEstimatedGB}GB / ${plan.memory.budgetGB}GB (${plan.memory.utilizationPercent}%)`
|
|
3228
|
+
);
|
|
3229
|
+
console.log(` Risk: ${plan.risk.level.toUpperCase()} (${plan.risk.score}/100)`);
|
|
3230
|
+
|
|
3231
|
+
if (plan.notes.length > 0) {
|
|
3232
|
+
console.log(chalk.blue.bold('\nNotes:'));
|
|
3233
|
+
for (const note of plan.notes) {
|
|
3234
|
+
console.log(` - ${note}`);
|
|
3235
|
+
}
|
|
3236
|
+
}
|
|
3237
|
+
|
|
3238
|
+
console.log(chalk.blue.bold('\nRecommended env vars:'));
|
|
3239
|
+
for (const [key, value] of Object.entries(plan.shell.env)) {
|
|
3240
|
+
console.log(` export ${key}=${value}`);
|
|
3241
|
+
}
|
|
3242
|
+
|
|
3243
|
+
console.log(chalk.blue.bold('\nFallback profile:'));
|
|
3244
|
+
console.log(
|
|
3245
|
+
` OLLAMA_NUM_CTX=${plan.fallback.num_ctx} OLLAMA_NUM_PARALLEL=${plan.fallback.num_parallel} OLLAMA_MAX_LOADED_MODELS=${plan.fallback.max_loaded_models}`
|
|
3246
|
+
);
|
|
3247
|
+
console.log('');
|
|
3248
|
+
} catch (error) {
|
|
3249
|
+
if (spinner) spinner.fail('Failed to build capacity plan');
|
|
3250
|
+
console.error(chalk.red('Error:'), error.message);
|
|
3251
|
+
if (process.env.DEBUG) {
|
|
3252
|
+
console.error(error.stack);
|
|
3253
|
+
}
|
|
3254
|
+
process.exit(1);
|
|
3255
|
+
}
|
|
3256
|
+
});
|
|
3257
|
+
|
|
3045
3258
|
program
|
|
3046
3259
|
.command('recommend')
|
|
3047
3260
|
.description('Get intelligent model recommendations for your hardware')
|
|
@@ -3942,8 +4155,8 @@ program
|
|
|
3942
4155
|
}
|
|
3943
4156
|
|
|
3944
4157
|
if (backend === 'cuda' && info.info) {
|
|
3945
|
-
console.log(` Driver: ${info.info.driver}`);
|
|
3946
|
-
console.log(` CUDA: ${info.info.cuda}`);
|
|
4158
|
+
console.log(` Driver: ${info.info.driver || 'unknown'}`);
|
|
4159
|
+
console.log(` CUDA: ${info.info.cuda || 'unknown'}`);
|
|
3947
4160
|
console.log(` Total VRAM: ${info.info.totalVRAM}GB`);
|
|
3948
4161
|
for (const gpu of info.info.gpus) {
|
|
3949
4162
|
console.log(` ${gpu.name}: ${gpu.memory.total}GB`);
|
package/bin/mcp-server.mjs
CHANGED
|
@@ -101,13 +101,89 @@ function nsToSec(ns) {
|
|
|
101
101
|
return (ns / 1e9).toFixed(2);
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
+
function tryParseJSON(text) {
|
|
105
|
+
try {
|
|
106
|
+
return JSON.parse(text);
|
|
107
|
+
} catch {
|
|
108
|
+
return null;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
function formatExportBlock(envObject) {
|
|
113
|
+
if (!envObject || typeof envObject !== "object") return "";
|
|
114
|
+
const entries = Object.entries(envObject).filter(([, value]) => value !== undefined && value !== null);
|
|
115
|
+
if (entries.length === 0) return "";
|
|
116
|
+
return entries
|
|
117
|
+
.map(([key, value]) => `export ${key}="${String(value)}"`)
|
|
118
|
+
.join("\n");
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
function summarizeOllamaPlan(payload) {
|
|
122
|
+
if (!payload || typeof payload !== "object") return null;
|
|
123
|
+
const plan = payload.plan;
|
|
124
|
+
if (!plan || typeof plan !== "object") return null;
|
|
125
|
+
|
|
126
|
+
const selectedModels = Array.isArray(plan.models)
|
|
127
|
+
? plan.models.map((model) => model?.name).filter(Boolean)
|
|
128
|
+
: [];
|
|
129
|
+
const hardware = plan.hardware || {};
|
|
130
|
+
const memory = plan.memory || {};
|
|
131
|
+
const recommendation = plan.recommendation || {};
|
|
132
|
+
const risk = plan.risk || {};
|
|
133
|
+
|
|
134
|
+
const lines = [
|
|
135
|
+
"OLLAMA CAPACITY PLAN",
|
|
136
|
+
`Hardware: ${hardware.backendName || hardware.backend || "unknown"}`,
|
|
137
|
+
`Models: ${selectedModels.length > 0 ? selectedModels.join(", ") : "none selected"}`,
|
|
138
|
+
"",
|
|
139
|
+
"Recommended envelope:",
|
|
140
|
+
` Context: ${plan.envelope?.context?.recommended ?? "?"}`,
|
|
141
|
+
` Parallel: ${plan.envelope?.parallel?.recommended ?? "?"}`,
|
|
142
|
+
` Loaded models: ${plan.envelope?.loaded_models?.recommended ?? "?"}`,
|
|
143
|
+
` Estimated memory: ${memory.recommendedEstimatedGB ?? "?"}GB / ${memory.budgetGB ?? "?"}GB (${memory.utilizationPercent ?? "?"}%)`,
|
|
144
|
+
` Risk: ${(risk.level || "unknown").toUpperCase()} (${risk.score ?? "?"}/100)`,
|
|
145
|
+
];
|
|
146
|
+
|
|
147
|
+
if (recommendation && Object.keys(recommendation).length > 0) {
|
|
148
|
+
lines.push("");
|
|
149
|
+
lines.push("Recommended env vars:");
|
|
150
|
+
if (recommendation.num_ctx !== undefined) lines.push(` export OLLAMA_NUM_CTX="${recommendation.num_ctx}"`);
|
|
151
|
+
if (recommendation.num_parallel !== undefined) lines.push(` export OLLAMA_NUM_PARALLEL="${recommendation.num_parallel}"`);
|
|
152
|
+
if (recommendation.max_loaded_models !== undefined) lines.push(` export OLLAMA_MAX_LOADED_MODELS="${recommendation.max_loaded_models}"`);
|
|
153
|
+
if (recommendation.max_queue !== undefined) lines.push(` export OLLAMA_MAX_QUEUE="${recommendation.max_queue}"`);
|
|
154
|
+
if (recommendation.keep_alive !== undefined) lines.push(` export OLLAMA_KEEP_ALIVE="${recommendation.keep_alive}"`);
|
|
155
|
+
if (recommendation.flash_attention !== undefined) lines.push(` export OLLAMA_FLASH_ATTENTION="${recommendation.flash_attention}"`);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
return lines.join("\n");
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
const ALLOWED_CLI_COMMANDS = new Set([
|
|
162
|
+
"policy",
|
|
163
|
+
"audit",
|
|
164
|
+
"calibrate",
|
|
165
|
+
"check",
|
|
166
|
+
"ollama",
|
|
167
|
+
"installed",
|
|
168
|
+
"ollama-plan",
|
|
169
|
+
"recommend",
|
|
170
|
+
"list-models",
|
|
171
|
+
"ai-check",
|
|
172
|
+
"ai-run",
|
|
173
|
+
"demo",
|
|
174
|
+
"sync",
|
|
175
|
+
"search",
|
|
176
|
+
"smart-recommend",
|
|
177
|
+
"hw-detect",
|
|
178
|
+
]);
|
|
179
|
+
|
|
104
180
|
// ============================================================================
|
|
105
181
|
// MCP SERVER
|
|
106
182
|
// ============================================================================
|
|
107
183
|
|
|
108
184
|
const server = new McpServer({
|
|
109
185
|
name: "llm-checker",
|
|
110
|
-
version: "3.
|
|
186
|
+
version: "3.4.0",
|
|
111
187
|
});
|
|
112
188
|
|
|
113
189
|
// ============================================================================
|
|
@@ -198,6 +274,352 @@ server.tool(
|
|
|
198
274
|
}
|
|
199
275
|
);
|
|
200
276
|
|
|
277
|
+
server.tool(
|
|
278
|
+
"ollama_plan",
|
|
279
|
+
"Build an Ollama capacity plan for selected local models and return recommended context/parallel/memory settings",
|
|
280
|
+
{
|
|
281
|
+
models: z
|
|
282
|
+
.array(z.string())
|
|
283
|
+
.optional()
|
|
284
|
+
.describe("Optional list of model tags/families to include (default: all local models)"),
|
|
285
|
+
ctx: z.number().int().positive().optional().describe("Target context window in tokens"),
|
|
286
|
+
concurrency: z.number().int().positive().optional().describe("Target parallel request count"),
|
|
287
|
+
objective: z
|
|
288
|
+
.enum(["latency", "balanced", "throughput"])
|
|
289
|
+
.optional()
|
|
290
|
+
.describe("Optimization objective"),
|
|
291
|
+
reserve_gb: z.number().min(0).optional().describe("Memory reserve in GB for OS/background workloads"),
|
|
292
|
+
},
|
|
293
|
+
async ({ models, ctx, concurrency, objective, reserve_gb }) => {
|
|
294
|
+
const args = ["ollama-plan", "--json"];
|
|
295
|
+
if (Array.isArray(models) && models.length > 0) args.push("--models", ...models);
|
|
296
|
+
if (ctx !== undefined) args.push("--ctx", String(ctx));
|
|
297
|
+
if (concurrency !== undefined) args.push("--concurrency", String(concurrency));
|
|
298
|
+
if (objective) args.push("--objective", objective);
|
|
299
|
+
if (reserve_gb !== undefined) args.push("--reserve-gb", String(reserve_gb));
|
|
300
|
+
|
|
301
|
+
const result = await run(args, 180000);
|
|
302
|
+
const payload = tryParseJSON(result);
|
|
303
|
+
|
|
304
|
+
if (!payload) {
|
|
305
|
+
return {
|
|
306
|
+
content: [{ type: "text", text: result }],
|
|
307
|
+
};
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
const summary = summarizeOllamaPlan(payload);
|
|
311
|
+
const output = summary
|
|
312
|
+
? `${summary}\n\nRAW JSON:\n${JSON.stringify(payload, null, 2)}`
|
|
313
|
+
: JSON.stringify(payload, null, 2);
|
|
314
|
+
|
|
315
|
+
return {
|
|
316
|
+
content: [{ type: "text", text: output }],
|
|
317
|
+
};
|
|
318
|
+
}
|
|
319
|
+
);
|
|
320
|
+
|
|
321
|
+
server.tool(
|
|
322
|
+
"ollama_plan_env",
|
|
323
|
+
"Return shell export commands from an Ollama capacity plan (recommended or fallback profile)",
|
|
324
|
+
{
|
|
325
|
+
profile: z
|
|
326
|
+
.enum(["recommended", "fallback"])
|
|
327
|
+
.optional()
|
|
328
|
+
.describe("Which profile to return (default: recommended)"),
|
|
329
|
+
models: z
|
|
330
|
+
.array(z.string())
|
|
331
|
+
.optional()
|
|
332
|
+
.describe("Optional list of model tags/families to include (default: all local models)"),
|
|
333
|
+
ctx: z.number().int().positive().optional().describe("Target context window in tokens"),
|
|
334
|
+
concurrency: z.number().int().positive().optional().describe("Target parallel request count"),
|
|
335
|
+
objective: z
|
|
336
|
+
.enum(["latency", "balanced", "throughput"])
|
|
337
|
+
.optional()
|
|
338
|
+
.describe("Optimization objective"),
|
|
339
|
+
reserve_gb: z.number().min(0).optional().describe("Memory reserve in GB for OS/background workloads"),
|
|
340
|
+
},
|
|
341
|
+
async ({ profile, models, ctx, concurrency, objective, reserve_gb }) => {
|
|
342
|
+
const args = ["ollama-plan", "--json"];
|
|
343
|
+
if (Array.isArray(models) && models.length > 0) args.push("--models", ...models);
|
|
344
|
+
if (ctx !== undefined) args.push("--ctx", String(ctx));
|
|
345
|
+
if (concurrency !== undefined) args.push("--concurrency", String(concurrency));
|
|
346
|
+
if (objective) args.push("--objective", objective);
|
|
347
|
+
if (reserve_gb !== undefined) args.push("--reserve-gb", String(reserve_gb));
|
|
348
|
+
|
|
349
|
+
const result = await run(args, 180000);
|
|
350
|
+
const payload = tryParseJSON(result);
|
|
351
|
+
if (!payload?.plan) {
|
|
352
|
+
return {
|
|
353
|
+
content: [{ type: "text", text: `Failed to parse ollama-plan output:\n${result}` }],
|
|
354
|
+
isError: true,
|
|
355
|
+
};
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
const selectedProfile = profile || "recommended";
|
|
359
|
+
const plan = payload.plan;
|
|
360
|
+
let envValues = null;
|
|
361
|
+
|
|
362
|
+
if (selectedProfile === "fallback") {
|
|
363
|
+
const fallback = plan.fallback || {};
|
|
364
|
+
envValues = {
|
|
365
|
+
OLLAMA_NUM_CTX: fallback.num_ctx,
|
|
366
|
+
OLLAMA_NUM_PARALLEL: fallback.num_parallel,
|
|
367
|
+
OLLAMA_MAX_LOADED_MODELS: fallback.max_loaded_models,
|
|
368
|
+
};
|
|
369
|
+
} else {
|
|
370
|
+
envValues = plan.shell?.env || null;
|
|
371
|
+
if (!envValues) {
|
|
372
|
+
const recommendation = plan.recommendation || {};
|
|
373
|
+
envValues = {
|
|
374
|
+
OLLAMA_NUM_CTX: recommendation.num_ctx,
|
|
375
|
+
OLLAMA_NUM_PARALLEL: recommendation.num_parallel,
|
|
376
|
+
OLLAMA_MAX_LOADED_MODELS: recommendation.max_loaded_models,
|
|
377
|
+
OLLAMA_MAX_QUEUE: recommendation.max_queue,
|
|
378
|
+
OLLAMA_KEEP_ALIVE: recommendation.keep_alive,
|
|
379
|
+
OLLAMA_FLASH_ATTENTION: recommendation.flash_attention,
|
|
380
|
+
};
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
const exports = formatExportBlock(envValues);
|
|
385
|
+
if (!exports) {
|
|
386
|
+
return {
|
|
387
|
+
content: [{ type: "text", text: "No environment values available for this plan/profile." }],
|
|
388
|
+
isError: true,
|
|
389
|
+
};
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
return {
|
|
393
|
+
content: [
|
|
394
|
+
{
|
|
395
|
+
type: "text",
|
|
396
|
+
text: [`PROFILE: ${selectedProfile.toUpperCase()}`, "", exports].join("\n"),
|
|
397
|
+
},
|
|
398
|
+
],
|
|
399
|
+
};
|
|
400
|
+
}
|
|
401
|
+
);
|
|
402
|
+
|
|
403
|
+
server.tool(
|
|
404
|
+
"cli_help",
|
|
405
|
+
"List all llm-checker CLI commands exposed via cli_exec",
|
|
406
|
+
{},
|
|
407
|
+
async () => {
|
|
408
|
+
const commands = [...ALLOWED_CLI_COMMANDS].sort();
|
|
409
|
+
const lines = [
|
|
410
|
+
"Available commands for cli_exec:",
|
|
411
|
+
...commands.map((command) => ` - ${command}`),
|
|
412
|
+
"",
|
|
413
|
+
"Examples:",
|
|
414
|
+
' cli_exec command="ollama-plan" args=["--json"]',
|
|
415
|
+
' cli_exec command="policy" args=["validate","--file","policy.yaml","--json"]',
|
|
416
|
+
' cli_exec command="search" args=["qwen","--use-case","coding","--limit","5"]',
|
|
417
|
+
];
|
|
418
|
+
return { content: [{ type: "text", text: lines.join("\n") }] };
|
|
419
|
+
}
|
|
420
|
+
);
|
|
421
|
+
|
|
422
|
+
server.tool(
|
|
423
|
+
"cli_exec",
|
|
424
|
+
"Execute any supported llm-checker CLI command (allowlisted) with custom arguments",
|
|
425
|
+
{
|
|
426
|
+
command: z.string().describe("Top-level command (use cli_help to list allowed commands)"),
|
|
427
|
+
args: z
|
|
428
|
+
.array(z.string())
|
|
429
|
+
.optional()
|
|
430
|
+
.describe("Additional CLI args, exactly as used in terminal (without shell quoting)"),
|
|
431
|
+
timeout_ms: z.number().int().min(1000).max(600000).optional().describe("Execution timeout in milliseconds"),
|
|
432
|
+
},
|
|
433
|
+
async ({ command, args, timeout_ms }) => {
|
|
434
|
+
const trimmedCommand = String(command || "").trim();
|
|
435
|
+
if (!ALLOWED_CLI_COMMANDS.has(trimmedCommand)) {
|
|
436
|
+
return {
|
|
437
|
+
content: [
|
|
438
|
+
{
|
|
439
|
+
type: "text",
|
|
440
|
+
text: `Unsupported command "${trimmedCommand}". Use cli_help to list allowed commands.`,
|
|
441
|
+
},
|
|
442
|
+
],
|
|
443
|
+
isError: true,
|
|
444
|
+
};
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
const safeArgs = Array.isArray(args) ? args : [];
|
|
448
|
+
if (safeArgs.length > 100) {
|
|
449
|
+
return {
|
|
450
|
+
content: [{ type: "text", text: "Too many arguments. Limit is 100." }],
|
|
451
|
+
isError: true,
|
|
452
|
+
};
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
const result = await run([trimmedCommand, ...safeArgs], timeout_ms || 180000);
|
|
456
|
+
return { content: [{ type: "text", text: result }] };
|
|
457
|
+
}
|
|
458
|
+
);
|
|
459
|
+
|
|
460
|
+
server.tool(
|
|
461
|
+
"policy_validate",
|
|
462
|
+
"Validate a policy file against the v1 schema and return structured validation output",
|
|
463
|
+
{
|
|
464
|
+
file: z.string().optional().describe("Policy file path (default: policy.yaml)"),
|
|
465
|
+
},
|
|
466
|
+
async ({ file }) => {
|
|
467
|
+
const args = ["policy", "validate", "--json"];
|
|
468
|
+
if (file) args.push("--file", file);
|
|
469
|
+
|
|
470
|
+
const result = await run(args, 120000);
|
|
471
|
+
const payload = tryParseJSON(result);
|
|
472
|
+
if (!payload) {
|
|
473
|
+
return {
|
|
474
|
+
content: [{ type: "text", text: result }],
|
|
475
|
+
};
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
const status = payload.valid ? "VALID" : "INVALID";
|
|
479
|
+
const header = [
|
|
480
|
+
`POLICY VALIDATION: ${status}`,
|
|
481
|
+
`File: ${payload.file || file || "policy.yaml"}`,
|
|
482
|
+
`Errors: ${payload.errorCount ?? (Array.isArray(payload.errors) ? payload.errors.length : 0)}`,
|
|
483
|
+
].join("\n");
|
|
484
|
+
|
|
485
|
+
return {
|
|
486
|
+
content: [{ type: "text", text: `${header}\n\n${JSON.stringify(payload, null, 2)}` }],
|
|
487
|
+
isError: !payload.valid,
|
|
488
|
+
};
|
|
489
|
+
}
|
|
490
|
+
);
|
|
491
|
+
|
|
492
|
+
server.tool(
|
|
493
|
+
"audit_export",
|
|
494
|
+
"Run policy compliance audit export (json/csv/sarif/all) for check/recommend flows",
|
|
495
|
+
{
|
|
496
|
+
policy: z.string().describe("Policy file path"),
|
|
497
|
+
command: z
|
|
498
|
+
.enum(["check", "recommend"])
|
|
499
|
+
.optional()
|
|
500
|
+
.describe("Evaluation source (default: check)"),
|
|
501
|
+
format: z
|
|
502
|
+
.enum(["json", "csv", "sarif", "all"])
|
|
503
|
+
.optional()
|
|
504
|
+
.describe("Export format (default: json)"),
|
|
505
|
+
out: z.string().optional().describe("Output file path (single format only)"),
|
|
506
|
+
out_dir: z.string().optional().describe("Output directory when --out is omitted"),
|
|
507
|
+
use_case: z.string().optional().describe("Use case when command=check"),
|
|
508
|
+
category: z.string().optional().describe("Category hint when command=recommend"),
|
|
509
|
+
optimize: z
|
|
510
|
+
.enum(["balanced", "speed", "quality", "context", "coding"])
|
|
511
|
+
.optional()
|
|
512
|
+
.describe("Optimization profile when command=recommend"),
|
|
513
|
+
runtime: z
|
|
514
|
+
.enum(["ollama", "vllm", "mlx"])
|
|
515
|
+
.optional()
|
|
516
|
+
.describe("Runtime backend for check mode"),
|
|
517
|
+
include_cloud: z.boolean().optional().describe("Include cloud models in check-mode analysis"),
|
|
518
|
+
max_size: z.string().optional().describe('Maximum model size for check mode (example: "24B" or "12GB")'),
|
|
519
|
+
min_size: z.string().optional().describe('Minimum model size for check mode (example: "3B" or "2GB")'),
|
|
520
|
+
limit: z.number().int().positive().optional().describe("Model analysis limit for check mode"),
|
|
521
|
+
verbose: z.boolean().optional().describe("Enable verbose progress (default: true)"),
|
|
522
|
+
},
|
|
523
|
+
async ({
|
|
524
|
+
policy,
|
|
525
|
+
command,
|
|
526
|
+
format,
|
|
527
|
+
out,
|
|
528
|
+
out_dir,
|
|
529
|
+
use_case,
|
|
530
|
+
category,
|
|
531
|
+
optimize,
|
|
532
|
+
runtime,
|
|
533
|
+
include_cloud,
|
|
534
|
+
max_size,
|
|
535
|
+
min_size,
|
|
536
|
+
limit,
|
|
537
|
+
verbose,
|
|
538
|
+
}) => {
|
|
539
|
+
const args = ["audit", "export", "--policy", policy];
|
|
540
|
+
if (command) args.push("--command", command);
|
|
541
|
+
if (format) args.push("--format", format);
|
|
542
|
+
if (out) args.push("--out", out);
|
|
543
|
+
if (out_dir) args.push("--out-dir", out_dir);
|
|
544
|
+
if (use_case) args.push("--use-case", use_case);
|
|
545
|
+
if (category) args.push("--category", category);
|
|
546
|
+
if (optimize) args.push("--optimize", optimize);
|
|
547
|
+
if (runtime) args.push("--runtime", runtime);
|
|
548
|
+
if (include_cloud) args.push("--include-cloud");
|
|
549
|
+
if (max_size) args.push("--max-size", max_size);
|
|
550
|
+
if (min_size) args.push("--min-size", min_size);
|
|
551
|
+
if (limit !== undefined) args.push("--limit", String(limit));
|
|
552
|
+
if (verbose === false) args.push("--no-verbose");
|
|
553
|
+
|
|
554
|
+
const result = await run(args, 300000);
|
|
555
|
+
const hadFailure =
|
|
556
|
+
/audit export failed:/i.test(result) ||
|
|
557
|
+
/blocking violations detected/i.test(result) ||
|
|
558
|
+
/enforcement result:\s*blocking/i.test(result);
|
|
559
|
+
return {
|
|
560
|
+
content: [{ type: "text", text: result }],
|
|
561
|
+
isError: hadFailure,
|
|
562
|
+
};
|
|
563
|
+
}
|
|
564
|
+
);
|
|
565
|
+
|
|
566
|
+
// Register the `calibrate` MCP tool: wraps the `llm-checker calibrate` CLI
// command and maps its textual outcome onto the MCP success/error contract.
server.tool(
  "calibrate",
  "Generate calibration artifacts from a JSONL prompt suite (dry-run, contract-only, or full benchmark mode)",
  {
    suite: z.string().describe("Prompt suite path in JSONL format"),
    models: z.array(z.string()).describe("Model identifiers to include"),
    output: z.string().describe("Calibration result output path (.json/.yaml/.yml)"),
    runtime: z.enum(["ollama", "vllm", "mlx"]).optional().describe("Inference runtime backend"),
    mode: z.enum(["dry-run", "contract-only", "full"]).optional().describe("Execution mode"),
    objective: z.enum(["speed", "quality", "balanced"]).optional().describe("Calibration objective"),
    policy_out: z.string().optional().describe("Optional calibration policy output path"),
    warmup: z.number().int().positive().optional().describe("Warmup runs per prompt in full mode"),
    iterations: z.number().int().positive().optional().describe("Measured iterations per prompt in full mode"),
    timeout_ms: z.number().int().positive().optional().describe("Per-prompt timeout in full mode (ms)"),
    dry_run: z.boolean().optional().describe("Shortcut flag for dry-run mode"),
  },
  async ({ suite, models, output, runtime, mode, objective, policy_out, warmup, iterations, timeout_ms, dry_run }) => {
    // Mandatory arguments first, in the order the CLI expects them.
    const cliArgs = ["calibrate", "--suite", suite, "--models", ...models, "--output", output];

    // Optional string-valued flags are forwarded only when truthy, matching
    // how the CLI treats absent options.
    const stringFlags = [
      ["--runtime", runtime],
      ["--mode", mode],
      ["--objective", objective],
      ["--policy-out", policy_out],
    ];
    for (const [flag, value] of stringFlags) {
      if (value) cliArgs.push(flag, value);
    }

    // Optional numeric flags are forwarded whenever defined (the schema
    // already guarantees positive integers, so 0 never occurs).
    const numericFlags = [
      ["--warmup", warmup],
      ["--iterations", iterations],
      ["--timeout-ms", timeout_ms],
    ];
    for (const [flag, value] of numericFlags) {
      if (value !== undefined) cliArgs.push(flag, String(value));
    }

    if (dry_run) cliArgs.push("--dry-run");

    // Ten-minute ceiling: full benchmark mode can legitimately run long.
    const cliOutput = await run(cliArgs, 600000);
    return {
      content: [{ type: "text", text: cliOutput }],
      isError: /calibration failed:/i.test(cliOutput),
    };
  }
);
|
|
622
|
+
|
|
201
623
|
// ============================================================================
|
|
202
624
|
// OLLAMA MANAGEMENT TOOLS
|
|
203
625
|
// ============================================================================
|
package/src/hardware/backends/cuda-detector.js
CHANGED
|
@@ -322,7 +322,7 @@ class CUDADetector {
|
|
|
322
322
|
const modelRaw = this.readJetsonModel();
|
|
323
323
|
const model = this.normalizeJetsonModel(modelRaw);
|
|
324
324
|
const cudaVersion = this.detectJetsonCudaVersion();
|
|
325
|
-
const driverVersion = this.detectJetsonDriverVersion();
|
|
325
|
+
const driverVersion = this.detectJetsonDriverVersion() || 'unknown';
|
|
326
326
|
const totalSystemGB = Math.max(1, Math.round(os.totalmem() / (1024 ** 3)));
|
|
327
327
|
const sharedGpuMemoryGB = Math.max(1, Math.round(totalSystemGB * 0.85));
|
|
328
328
|
const capabilities = this.getJetsonCapabilities(modelRaw || model);
|
|
@@ -423,11 +423,26 @@ class CUDADetector {
|
|
|
423
423
|
}
|
|
424
424
|
|
|
425
425
|
detectJetsonDriverVersion() {
|
|
426
|
-
const
|
|
427
|
-
|
|
426
|
+
const driverSources = [
|
|
427
|
+
'/proc/driver/nvidia/version',
|
|
428
|
+
'/sys/module/nvidia/version'
|
|
429
|
+
];
|
|
430
|
+
|
|
431
|
+
for (const source of driverSources) {
|
|
432
|
+
const versionInfo = this.readFileIfExists(source);
|
|
433
|
+
if (!versionInfo) continue;
|
|
434
|
+
|
|
435
|
+
const kernelMatch = versionInfo.match(/Kernel Module(?:\s+for\s+\w+)?\s+([0-9]+(?:\.[0-9]+){1,3})/i);
|
|
436
|
+
if (kernelMatch) return kernelMatch[1];
|
|
437
|
+
|
|
438
|
+
const nvrmMatch = versionInfo.match(/NVRM version:\s*.*?([0-9]+(?:\.[0-9]+){1,3})/i);
|
|
439
|
+
if (nvrmMatch) return nvrmMatch[1];
|
|
428
440
|
|
|
429
|
-
|
|
430
|
-
|
|
441
|
+
const genericMatch = versionInfo.match(/\b([0-9]+(?:\.[0-9]+){1,3})\b/);
|
|
442
|
+
if (genericMatch) return genericMatch[1];
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
return null;
|
|
431
446
|
}
|
|
432
447
|
|
|
433
448
|
getJetsonCapabilities(model) {
|
|
@@ -734,10 +749,13 @@ class CUDADetector {
|
|
|
734
749
|
const primary = this.getPrimaryGPU();
|
|
735
750
|
const gpuName = primary.name.toLowerCase()
|
|
736
751
|
.replace(/nvidia|geforce|quadro|tesla/gi, '')
|
|
737
|
-
.replace(
|
|
738
|
-
.
|
|
752
|
+
.replace(/[^a-z0-9]+/gi, '-')
|
|
753
|
+
.replace(/-+/g, '-')
|
|
754
|
+
.replace(/^-|-$/g, '');
|
|
755
|
+
const normalizedGpuName = gpuName || 'gpu';
|
|
756
|
+
const normalizedVRAM = Number.isFinite(info.totalVRAM) ? Math.max(0, Math.round(info.totalVRAM)) : 0;
|
|
739
757
|
|
|
740
|
-
return `cuda-${
|
|
758
|
+
return `cuda-${normalizedGpuName}-${normalizedVRAM}gb${info.isMultiGPU ? '-x' + info.gpus.length : ''}`;
|
|
741
759
|
}
|
|
742
760
|
|
|
743
761
|
/**
|
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
/**
 * Plans safe Ollama runtime settings (context window, request parallelism,
 * number of resident models, queue depth, keep-alive, flash attention) from
 * a hardware memory budget and a pool of candidate models.
 *
 * All memory figures are GB. Estimates are heuristic and deliberately
 * conservative: planning always assumes the heaviest models load first.
 */
class OllamaCapacityPlanner {
  /**
   * @param {Object} [options]
   * @param {number} [options.minContext=2048] Smallest context the planner will recommend.
   * @param {number} [options.maxParallelCap=8] Hard ceiling on parallel requests.
   * @param {number} [options.defaultReserveGB=2] Memory held back for the OS / other apps.
   * @param {number} [options.kvFactorPer4k=0.08] KV-cache GB per 1B params at a 4k context.
   * @param {number} [options.modelOverheadGB=0.7] Fixed per-model runtime overhead in GB.
   */
  constructor(options = {}) {
    // `||` is intentional here: 0 is never a meaningful value for these knobs.
    this.minContext = options.minContext || 2048;
    this.maxParallelCap = options.maxParallelCap || 8;
    this.defaultReserveGB = options.defaultReserveGB || 2;
    this.kvFactorPer4k = options.kvFactorPer4k || 0.08; // GB per 1B params at 4k ctx
    this.modelOverheadGB = options.modelOverheadGB || 0.7;
  }

  /**
   * Coerce `value` to a finite number, returning `fallback` otherwise.
   *
   * Bug fix: nullish values now take the fallback. Previously `Number(null)`
   * coerced to 0 (which is finite), so callers passing `null` — notably
   * `reserveGB` in `plan()`, whose default IS `null` — silently got 0
   * instead of the intended default and overstated the memory budget.
   */
  toFiniteNumber(value, fallback = 0) {
    if (value === null || value === undefined) return fallback;
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : fallback;
  }

  /** Clamp `value` into the inclusive range [min, max]. */
  clamp(value, min, max) {
    return Math.min(max, Math.max(min, value));
  }

  /** Normalize an objective string; anything unrecognized maps to 'balanced'. */
  normalizeObjective(objective) {
    const normalized = String(objective || 'balanced').toLowerCase();
    if (normalized === 'latency' || normalized === 'throughput' || normalized === 'balanced') {
      return normalized;
    }
    return 'balanced';
  }

  /**
   * Per-objective caps: how many parallel requests and resident models to
   * allow, plus how long Ollama should keep models loaded.
   */
  objectiveProfile(objective) {
    if (objective === 'latency') {
      return {
        parallelCap: 2,
        loadedCap: 1,
        keepAlive: '30m'
      };
    }

    if (objective === 'throughput') {
      return {
        parallelCap: 6,
        loadedCap: 3,
        keepAlive: '10m'
      };
    }

    return {
      parallelCap: 3,
      loadedCap: 2,
      keepAlive: '15m'
    };
  }

  /**
   * Estimate parameter count (billions) from the model's declared `size`
   * (e.g. "8B"), its name (e.g. "llama3-8b"), or its on-disk file size;
   * falls back to a conservative 7B when nothing is available.
   */
  estimateParamsB(model = {}) {
    const sizeMatch = String(model.size || '').match(/(\d+(?:\.\d+)?)\s*b/i);
    if (sizeMatch) {
      return this.toFiniteNumber(sizeMatch[1], 0);
    }

    const nameMatch = String(model.name || '').match(/(\d+(?:\.\d+)?)\s*b\b/i);
    if (nameMatch) {
      return this.toFiniteNumber(nameMatch[1], 0);
    }

    // Approximate from quantized model file size (Q4 ~0.65 GB per 1B params)
    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB / 0.65;
    }

    return 7; // conservative fallback
  }

  /**
   * Estimate resident memory (GB) for one loaded model: file size (or a
   * params-based approximation) plus fixed runtime overhead.
   */
  estimateBaseMemoryGB(model = {}) {
    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB + this.modelOverheadGB;
    }

    const paramsB = this.estimateParamsB(model);
    return paramsB * 0.65 + this.modelOverheadGB;
  }

  /**
   * Estimate KV-cache memory (GB) for one request: scales linearly with
   * parameter count and context length relative to the 4k baseline factor.
   */
  estimateKVCacheGB(paramsB, contextTokens) {
    const ctx = this.toFiniteNumber(contextTokens, this.minContext);
    return paramsB * this.kvFactorPer4k * (ctx / 4096);
  }

  /**
   * Normalize raw model entries into `{name, size, fileSizeGB, paramsB,
   * baseMemoryGB}` records, dropping unnamed entries and sorting the pool
   * heaviest-first so capacity planning stays conservative.
   */
  normalizeModels(models = []) {
    const normalized = models
      .filter((model) => model && model.name)
      .map((model) => {
        const paramsB = this.estimateParamsB(model);
        const baseMemoryGB = this.estimateBaseMemoryGB(model);
        const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, Math.max(0, baseMemoryGB - this.modelOverheadGB));
        return {
          name: model.name,
          size: model.size || `${Math.round(paramsB)}B`,
          fileSizeGB: Math.round(fileSizeGB * 10) / 10,
          paramsB: Math.round(paramsB * 10) / 10,
          baseMemoryGB: Math.round(baseMemoryGB * 100) / 100
        };
      });

    // Heaviest first to keep planning conservative
    normalized.sort((a, b) => b.baseMemoryGB - a.baseMemoryGB);
    return normalized;
  }

  /**
   * Derive the usable memory budget (GB) from a hardware snapshot.
   * Prefers `summary.effectiveMemory`, then VRAM, then 70% of system RAM,
   * then 70% of `hardware.memory.total`; subtracts the reserve (never
   * dropping below a 2 GB floor).
   */
  resolveHardwareBudget(hardware = {}, reserveGB = null) {
    const summary = hardware.summary || {};
    // With the toFiniteNumber fix, a `null` override correctly yields the
    // configured default reserve instead of 0.
    const reserve = this.toFiniteNumber(reserveGB, this.defaultReserveGB);

    const effectiveMemory = this.toFiniteNumber(summary.effectiveMemory, 0);
    const systemRAM = this.toFiniteNumber(summary.systemRAM, 0);
    const vram = this.toFiniteNumber(summary.totalVRAM, 0);
    const fallbackTotal = this.toFiniteNumber(hardware.memory?.total, 8);

    const rawCapacityGB = effectiveMemory || vram || (systemRAM > 0 ? systemRAM * 0.7 : 0) || fallbackTotal * 0.7;
    const memoryBudgetGB = Math.max(2, rawCapacityGB - reserve);

    return {
      backend: summary.bestBackend || 'cpu',
      backendName: summary.backendName || summary.bestBackend || 'CPU',
      rawCapacityGB: Math.round(rawCapacityGB * 10) / 10,
      reserveGB: Math.round(reserve * 10) / 10,
      memoryBudgetGB: Math.round(memoryBudgetGB * 10) / 10
    };
  }

  /**
   * Compute the memory state when the first `loadedCount` models of the
   * (heaviest-first) pool are resident with `contextTokens` of context:
   * base totals, KV-cache cost per request, and how many parallel requests
   * fit inside `budgetGB`.
   */
  computeLoadState(models, contextTokens, loadedCount, budgetGB) {
    const activeModels = models.slice(0, loadedCount);
    const baseTotalGB = activeModels.reduce((sum, model) => sum + model.baseMemoryGB, 0);
    const maxParamsB = activeModels.reduce((max, model) => Math.max(max, model.paramsB), 0);
    const kvAtContextGB = this.estimateKVCacheGB(maxParamsB, contextTokens);
    const kvPerTokenGB = maxParamsB > 0 ? (maxParamsB * this.kvFactorPer4k) / 4096 : 0;
    const availableForKVGB = budgetGB - baseTotalGB;

    let maxParallelAtContext = 0;
    if (kvAtContextGB <= 0) {
      // Zero KV cost means parallelism is unconstrained by memory.
      maxParallelAtContext = this.maxParallelCap;
    } else if (availableForKVGB > 0) {
      maxParallelAtContext = Math.floor(availableForKVGB / kvAtContextGB);
    }

    return {
      activeModels,
      baseTotalGB,
      maxParamsB,
      kvAtContextGB,
      kvPerTokenGB,
      availableForKVGB,
      maxParallelAtContext
    };
  }

  /**
   * Largest count of simultaneously loaded models (up to `hardCap`) whose
   * weights plus `parallel` KV caches at `contextTokens` fit in `budgetGB`.
   * Always returns at least 1.
   */
  maxLoadedModelsFor(models, contextTokens, parallel, budgetGB, hardCap) {
    const cap = Math.max(1, Math.min(hardCap, models.length));
    let best = 1;
    for (let i = 1; i <= cap; i += 1) {
      const state = this.computeLoadState(models, contextTokens, i, budgetGB);
      const estimatedTotal = state.baseTotalGB + (state.kvAtContextGB * parallel);
      if (estimatedTotal <= budgetGB) {
        best = i;
      } else {
        break;
      }
    }
    return best;
  }

  /**
   * Score memory-pressure risk on a 0-100 scale from how far the requested
   * and recommended configurations push the budget, then bucket it into
   * low / medium / high / critical.
   */
  calculateRiskLevel({
    budgetGB,
    requestedTotalGB,
    recommendedTotalGB,
    requestedFits
  }) {
    const safeBudget = Math.max(0.1, budgetGB);
    const requestedUtil = requestedTotalGB / safeBudget;
    const recommendedUtil = recommendedTotalGB / safeBudget;
    const overage = Math.max(0, requestedTotalGB - safeBudget) / safeBudget;

    const score = Math.min(
      100,
      Math.round((overage * 100) + (recommendedUtil * 55) + (requestedFits ? 0 : 20))
    );

    let level = 'low';
    if (score >= 75) level = 'critical';
    else if (score >= 55) level = 'high';
    else if (score >= 35) level = 'medium';

    return { level, score };
  }

  /**
   * Produce a full capacity plan.
   *
   * @param {Object} params
   * @param {Object} params.hardware - Hardware snapshot (reads `summary` and `memory.total`).
   * @param {Array<Object>} params.models - Candidate models (`name`, optional `size`, `fileSizeGB`).
   * @param {number} [params.targetContext=8192] - Desired context window (tokens), clamped to [512, 131072].
   * @param {number} [params.targetConcurrency=2] - Desired parallel requests, clamped to [1, 64].
   * @param {string} [params.objective='balanced'] - 'latency' | 'throughput' | 'balanced'.
   * @param {number|null} [params.reserveGB=null] - Memory reserve override; null uses the default.
   * @returns {Object} Plan: recommendation, envelope, memory, risk, fallback, shell env and notes.
   * @throws {Error} If no usable (named) models are provided.
   */
  plan({
    hardware,
    models,
    targetContext = 8192,
    targetConcurrency = 2,
    objective = 'balanced',
    reserveGB = null
  }) {
    const normalizedObjective = this.normalizeObjective(objective);
    const profile = this.objectiveProfile(normalizedObjective);
    const modelPool = this.normalizeModels(models);

    if (modelPool.length === 0) {
      throw new Error('At least one model is required for planning.');
    }

    const requestedCtx = this.clamp(
      Math.round(this.toFiniteNumber(targetContext, 8192)),
      512,
      131072
    );
    const requestedConcurrency = this.clamp(
      Math.round(this.toFiniteNumber(targetConcurrency, 2)),
      1,
      64
    );

    const hardwareBudget = this.resolveHardwareBudget(hardware, reserveGB);
    const budgetGB = hardwareBudget.memoryBudgetGB;

    const desiredLoaded = Math.max(1, Math.min(profile.loadedCap, modelPool.length));
    let loadedModels = desiredLoaded;

    // Ensure the base model memory is feasible.
    while (loadedModels > 1) {
      const state = this.computeLoadState(modelPool, requestedCtx, loadedModels, budgetGB);
      if (state.availableForKVGB > 0) {
        break;
      }
      loadedModels -= 1;
    }

    let requestedState = this.computeLoadState(modelPool, requestedCtx, loadedModels, budgetGB);
    let recommendedCtx = requestedCtx;

    // If even one request at the requested context does not fit, shrink the
    // context to the largest token count a single request can afford.
    if (requestedState.maxParallelAtContext < 1) {
      const ctxFitAtParallel1 = requestedState.kvPerTokenGB > 0
        ? Math.floor(requestedState.availableForKVGB / requestedState.kvPerTokenGB)
        : requestedCtx;
      recommendedCtx = this.clamp(
        Math.max(this.minContext, Math.min(requestedCtx, ctxFitAtParallel1 || this.minContext)),
        this.minContext,
        requestedCtx
      );
    }

    let recommendedState = this.computeLoadState(modelPool, recommendedCtx, loadedModels, budgetGB);
    if (recommendedState.maxParallelAtContext < 1) {
      // Last resort: fall back to the minimum context.
      recommendedCtx = this.minContext;
      recommendedState = this.computeLoadState(modelPool, recommendedCtx, loadedModels, budgetGB);
    }

    let recommendedParallel = Math.max(
      1,
      Math.min(
        requestedConcurrency,
        profile.parallelCap,
        this.maxParallelCap,
        Math.max(1, recommendedState.maxParallelAtContext)
      )
    );

    let recommendedLoaded = this.maxLoadedModelsFor(
      modelPool,
      recommendedCtx,
      recommendedParallel,
      budgetGB,
      profile.loadedCap
    );

    // Recompute state after final loaded model selection.
    recommendedState = this.computeLoadState(modelPool, recommendedCtx, recommendedLoaded, budgetGB);

    const maxCtxParallel1 = recommendedState.kvPerTokenGB > 0
      ? Math.floor(recommendedState.availableForKVGB / recommendedState.kvPerTokenGB)
      : requestedCtx;
    const maxCtxAtRecommendedParallel = recommendedState.kvPerTokenGB > 0
      ? Math.floor(recommendedState.availableForKVGB / (recommendedState.kvPerTokenGB * recommendedParallel))
      : requestedCtx;

    if (maxCtxAtRecommendedParallel > 0) {
      recommendedCtx = this.clamp(
        Math.min(recommendedCtx, maxCtxAtRecommendedParallel),
        this.minContext,
        requestedCtx
      );
    }

    recommendedState = this.computeLoadState(modelPool, recommendedCtx, recommendedLoaded, budgetGB);
    const requestedTotalGB = requestedState.baseTotalGB + (requestedState.kvAtContextGB * requestedConcurrency);
    const recommendedTotalGB = recommendedState.baseTotalGB + (recommendedState.kvAtContextGB * recommendedParallel);
    const requestedFits = requestedTotalGB <= budgetGB;

    const risk = this.calculateRiskLevel({
      budgetGB,
      requestedTotalGB,
      recommendedTotalGB,
      requestedFits
    });

    // Flash attention is only enabled on accelerated backends.
    const flashAttention = hardwareBudget.backend === 'cpu' ? '0' : '1';
    const maxQueue = Math.max(4, recommendedParallel * 4);

    // Conservative single-request fallback at <=4k context.
    const fallbackCtx = this.clamp(Math.min(4096, recommendedCtx), this.minContext, recommendedCtx);
    const fallbackState = this.computeLoadState(modelPool, fallbackCtx, 1, budgetGB);
    const fallbackTotalGB = fallbackState.baseTotalGB + fallbackState.kvAtContextGB;

    const notes = [];
    if (!requestedFits) {
      notes.push('Requested settings exceed available memory budget; reduced settings are recommended.');
    }
    if (recommendedCtx < requestedCtx) {
      notes.push(`Context reduced from ${requestedCtx} to ${recommendedCtx} to avoid memory pressure.`);
    }
    if (recommendedParallel < requestedConcurrency) {
      notes.push(`Parallelism reduced from ${requestedConcurrency} to ${recommendedParallel} to keep memory stable.`);
    }
    if (recommendedLoaded < desiredLoaded) {
      notes.push(`Loaded models capped at ${recommendedLoaded} for this objective and memory budget.`);
    }

    return {
      objective: normalizedObjective,
      inputs: {
        targetContext: requestedCtx,
        targetConcurrency: requestedConcurrency
      },
      hardware: hardwareBudget,
      models: recommendedState.activeModels.map((model) => ({
        name: model.name,
        size: model.size,
        fileSizeGB: model.fileSizeGB,
        paramsB: model.paramsB,
        estimatedBaseMemoryGB: Math.round(model.baseMemoryGB * 100) / 100
      })),
      envelope: {
        context: {
          requested: requestedCtx,
          recommended: recommendedCtx,
          min_safe: this.minContext,
          max_for_parallel_1: Math.max(0, maxCtxParallel1 || 0),
          max_for_recommended_parallel: Math.max(0, maxCtxAtRecommendedParallel || 0)
        },
        parallel: {
          requested: requestedConcurrency,
          recommended: recommendedParallel,
          max_at_requested_ctx: Math.max(0, requestedState.maxParallelAtContext)
        },
        loaded_models: {
          requested: desiredLoaded,
          recommended: recommendedLoaded,
          max_at_recommended_settings: this.maxLoadedModelsFor(
            modelPool,
            recommendedCtx,
            recommendedParallel,
            budgetGB,
            modelPool.length
          )
        }
      },
      recommendation: {
        num_ctx: recommendedCtx,
        num_parallel: recommendedParallel,
        max_loaded_models: recommendedLoaded,
        max_queue: maxQueue,
        keep_alive: profile.keepAlive,
        flash_attention: flashAttention
      },
      memory: {
        budgetGB: Math.round(budgetGB * 100) / 100,
        requestedEstimatedGB: Math.round(requestedTotalGB * 100) / 100,
        recommendedEstimatedGB: Math.round(recommendedTotalGB * 100) / 100,
        utilizationPercent: Math.round((recommendedTotalGB / Math.max(0.1, budgetGB)) * 100)
      },
      risk,
      fallback: {
        num_ctx: fallbackCtx,
        num_parallel: 1,
        max_loaded_models: 1,
        estimated_memory_gb: Math.round(fallbackTotalGB * 100) / 100
      },
      shell: {
        env: {
          OLLAMA_NUM_CTX: String(recommendedCtx),
          OLLAMA_NUM_PARALLEL: String(recommendedParallel),
          OLLAMA_MAX_LOADED_MODELS: String(recommendedLoaded),
          OLLAMA_MAX_QUEUE: String(maxQueue),
          OLLAMA_KEEP_ALIVE: profile.keepAlive,
          OLLAMA_FLASH_ATTENTION: flashAttention
        }
      },
      notes
    };
  }
}
|
|
398
|
+
|
|
399
|
+
// CommonJS export so the CLI commands and MCP server can `require` the planner.
module.exports = OllamaCapacityPlanner;
|