llm-checker 3.4.0 → 3.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -133,9 +133,22 @@ llm-checker ai-run --calibrated --category coding --prompt "Refactor this functi
133
133
 
134
134
  LLM Checker is published in all primary channels:
135
135
 
136
- - npm (latest): [`llm-checker@latest`](https://www.npmjs.com/package/llm-checker)
136
+ - npm (latest, recommended): [`llm-checker@latest`](https://www.npmjs.com/package/llm-checker)
137
137
  - GitHub Releases: [Release history](https://github.com/Pavelevich/llm-checker/releases)
138
- - GitHub Packages: [`@pavelevich/llm-checker`](https://github.com/users/Pavelevich/packages/npm/package/llm-checker)
138
+ - GitHub Packages (legacy mirror, may lag): [`@pavelevich/llm-checker`](https://github.com/users/Pavelevich/packages/npm/package/llm-checker)
139
+
140
+ ### Important: Use npm for Latest Builds
141
+
142
+ If you need the newest release, install from npm (`llm-checker`), not the scoped GitHub Packages mirror.
143
+
144
 + If you installed `@pavelevich/llm-checker` and the version looks old:
145
+
146
+ ```bash
147
+ npm uninstall -g @pavelevich/llm-checker
148
+ npm install -g llm-checker@latest
149
+ hash -r
150
+ llm-checker --version
151
+ ```
139
152
 
140
153
  ### v3.3.0 Highlights
141
154
 
@@ -148,7 +161,9 @@ LLM Checker is published in all primary channels:
148
161
  - Hardened Jetson CUDA detection to avoid false CPU-only fallback.
149
162
  - Documentation reorganized under `docs/` with clearer onboarding paths.
150
163
 
151
- ### Optional: Install from GitHub Packages
164
+ ### Optional (Legacy): Install from GitHub Packages
165
+
166
 + Use this only if you explicitly need GitHub Packages. It may not match the latest npm release.
152
167
 
153
168
  ```bash
154
169
  # 1) Configure registry + token (PAT with read:packages)
@@ -261,6 +276,11 @@ Once connected, Claude can use these tools:
261
276
  | `installed` | Rank your already-downloaded Ollama models |
262
277
  | `search` | Search the Ollama model catalog with filters |
263
278
  | `smart_recommend` | Advanced recommendations using the full scoring engine |
279
+ | `ollama_plan` | Build a capacity plan for local models with recommended context/parallel/memory settings |
280
+ | `ollama_plan_env` | Return ready-to-paste `export ...` env vars from the recommended or fallback plan profile |
281
+ | `policy_validate` | Validate a policy file against the v1 schema and return structured validation output |
282
+ | `audit_export` | Run policy compliance export (`json`/`csv`/`sarif`/`all`) for `check` or `recommend` flows |
283
+ | `calibrate` | Generate calibration artifacts from a prompt suite with typed MCP inputs |
264
284
 
265
285
  **Ollama Management:**
266
286
 
@@ -281,6 +301,8 @@ Once connected, Claude can use these tools:
281
301
  | `cleanup_models` | Analyze installed models — find redundancies, cloud-only models, oversized models, and upgrade candidates |
282
302
  | `project_recommend` | Scan a project directory (languages, frameworks, size) and recommend the best model for that codebase |
283
303
  | `ollama_monitor` | Real-time system status: RAM usage, loaded models, memory headroom analysis |
304
+ | `cli_help` | List all allowlisted CLI commands exposed through MCP |
305
+ | `cli_exec` | Execute any allowlisted `llm-checker` CLI command with custom args (policy/audit/calibrate/sync/ai-run/etc.) |
284
306
 
285
307
  ### Example Prompts
286
308
 
@@ -310,6 +332,7 @@ Claude will automatically call the right tools and give you actionable results.
310
332
  | `calibrate` | Generate calibration result + routing policy artifacts from a JSONL prompt suite |
311
333
  | `installed` | Rank your installed Ollama models by compatibility |
312
334
  | `ollama-plan` | Compute safe Ollama runtime env vars (`NUM_CTX`, `NUM_PARALLEL`, `MAX_LOADED_MODELS`) for selected local models |
335
+ | `gpu-plan` | Simulate `pin`/`replica`/`spread` multi-GPU placement with memory-fit and throughput estimates per model |
313
336
 
314
337
  ### Advanced Commands (require `sql.js`)
315
338
 
@@ -3255,6 +3255,140 @@ program
3255
3255
  }
3256
3256
  });
3257
3257
 
3258
+ program
3259
+ .command('gpu-plan')
3260
+ .description('Recommend multi-GPU placement strategies for selected local models')
3261
+ .option('--models <models...>', 'Model tags/families to include (default: all local models)')
3262
+ .option('--ctx <tokens>', 'Target context window in tokens', '8192')
3263
+ .option('--concurrency <n>', 'Target parallel request count', '2')
3264
+ .option('--objective <mode>', 'Optimization objective (latency|balanced|throughput)', 'balanced')
3265
+ .option('--reserve-gb <gb>', 'Memory reserve to subtract from available GPU memory', '1')
3266
+ .option('--json', 'Output plan as JSON')
3267
+ .action(async (options) => {
3268
+ const spinner = options.json ? null : ora('Building GPU placement plan...').start();
3269
+
3270
+ try {
3271
+ const requestedObjective = String(options.objective || 'balanced').toLowerCase();
3272
+ const supportedObjectives = new Set(['latency', 'balanced', 'throughput']);
3273
+ if (!supportedObjectives.has(requestedObjective)) {
3274
+ throw new Error(`Invalid objective "${options.objective}". Use latency, balanced, or throughput.`);
3275
+ }
3276
+
3277
+ const targetContext = parsePositiveIntegerOption(options.ctx, '--ctx');
3278
+ const targetConcurrency = parsePositiveIntegerOption(options.concurrency, '--concurrency');
3279
+ const reserveGB = parseNonNegativeNumberOption(options.reserveGb, '--reserve-gb');
3280
+
3281
+ const OllamaClient = require('../src/ollama/client');
3282
+ const UnifiedDetector = require('../src/hardware/unified-detector');
3283
+ const OllamaGPUPlacementPlanner = require('../src/ollama/gpu-placement-planner');
3284
+
3285
+ const ollamaClient = new OllamaClient();
3286
+ const availability = await ollamaClient.checkOllamaAvailability();
3287
+ if (!availability.available) {
3288
+ throw new Error(availability.error || 'Ollama is not available');
3289
+ }
3290
+
3291
+ const localModels = await ollamaClient.getLocalModels();
3292
+ if (!localModels || localModels.length === 0) {
3293
+ throw new Error('No local Ollama models found. Install one with: ollama pull llama3.2:3b');
3294
+ }
3295
+
3296
+ const { selected, missing } = selectModelsForPlan(localModels, options.models || []);
3297
+ if (selected.length === 0) {
3298
+ throw new Error(
3299
+ `No matching local models found for: ${(options.models || []).join(', ')}`
3300
+ );
3301
+ }
3302
+
3303
+ const detector = new UnifiedDetector();
3304
+ const hardware = await detector.detect();
3305
+ const planner = new OllamaGPUPlacementPlanner();
3306
+
3307
+ const plan = planner.plan({
3308
+ hardware,
3309
+ models: selected,
3310
+ targetContext,
3311
+ targetConcurrency,
3312
+ objective: requestedObjective,
3313
+ reserveGB
3314
+ });
3315
+
3316
+ if (options.json) {
3317
+ console.log(JSON.stringify({
3318
+ generated_at: new Date().toISOString(),
3319
+ selection: {
3320
+ requested: options.models || [],
3321
+ selected: selected.map((model) => model.name),
3322
+ missing
3323
+ },
3324
+ plan
3325
+ }, null, 2));
3326
+ return;
3327
+ }
3328
+
3329
+ if (spinner) spinner.succeed('GPU placement plan generated');
3330
+
3331
+ console.log('\n' + chalk.bgMagenta.white.bold(' GPU PLACEMENT PLAN '));
3332
+ console.log(chalk.magenta('Backend:'), `${plan.hardware.backend_name} (${plan.hardware.backend})`);
3333
+ console.log(
3334
+ chalk.magenta('GPU inventory:'),
3335
+ `${plan.hardware.gpu_count} device(s), ${plan.hardware.total_usable_memory_gb}GB usable (reserve ${plan.hardware.reserve_gb}GB)`
3336
+ );
3337
+ console.log(
3338
+ chalk.magenta('Target envelope:'),
3339
+ `ctx=${plan.inputs.target_context}, concurrency=${plan.inputs.target_concurrency}, objective=${plan.objective}`
3340
+ );
3341
+
3342
+ if (missing.length > 0) {
3343
+ console.log(chalk.yellow('Missing model filters:'), missing.join(', '));
3344
+ }
3345
+
3346
+ if (!plan.hardware.is_multi_gpu) {
3347
+ console.log(chalk.yellow('Only one GPU detected: replica/spread are included for simulation but may be infeasible.'));
3348
+ }
3349
+
3350
+ for (const modelPlan of plan.models) {
3351
+ const recommended = modelPlan.recommended || {};
3352
+ const recFit = recommended.feasible ? chalk.green('fit') : chalk.red('no-fit');
3353
+ const recRisk = recommended.risk ? `${recommended.risk.level.toUpperCase()} (${recommended.risk.score})` : 'N/A';
3354
+
3355
+ console.log(chalk.magenta.bold(`\nModel: ${modelPlan.name} (${modelPlan.size})`));
3356
+ console.log(
3357
+ ` Recommended: ${chalk.bold((recommended.strategy || 'unknown').toUpperCase())} | ${recFit} | ~${recommended.estimated_tps || 0} tok/s | risk ${recRisk}`
3358
+ );
3359
+
3360
+ if (recommended.device_env_var && recommended.visible_devices) {
3361
+ console.log(` Device pinning hint: export ${recommended.device_env_var}=${recommended.visible_devices}`);
3362
+ }
3363
+
3364
+ console.log(chalk.magenta(' Strategies:'));
3365
+ for (const strategy of modelPlan.strategies) {
3366
+ const fit = strategy.feasible ? chalk.green('fit') : chalk.red('no-fit');
3367
+ const risk = strategy.risk ? `${strategy.risk.level} (${strategy.risk.score})` : 'n/a';
3368
+ console.log(
3369
+ ` - ${strategy.strategy.padEnd(7)} ${fit} | ~${strategy.estimated_tps} tok/s | ${strategy.memory_per_gpu_gb}GB/GPU | risk ${risk}`
3370
+ );
3371
+ }
3372
+ }
3373
+
3374
+ if (plan.notes && plan.notes.length > 0) {
3375
+ console.log(chalk.magenta.bold('\nNotes:'));
3376
+ for (const note of plan.notes) {
3377
+ console.log(` - ${note}`);
3378
+ }
3379
+ }
3380
+
3381
+ console.log('');
3382
+ } catch (error) {
3383
+ if (spinner) spinner.fail('Failed to build GPU placement plan');
3384
+ console.error(chalk.red('Error:'), error.message);
3385
+ if (process.env.DEBUG) {
3386
+ console.error(error.stack);
3387
+ }
3388
+ process.exit(1);
3389
+ }
3390
+ });
3391
+
3258
3392
  program
3259
3393
  .command('recommend')
3260
3394
  .description('Get intelligent model recommendations for your hardware')
@@ -4155,8 +4289,8 @@ program
4155
4289
  }
4156
4290
 
4157
4291
  if (backend === 'cuda' && info.info) {
4158
- console.log(` Driver: ${info.info.driver}`);
4159
- console.log(` CUDA: ${info.info.cuda}`);
4292
+ console.log(` Driver: ${info.info.driver || 'unknown'}`);
4293
+ console.log(` CUDA: ${info.info.cuda || 'unknown'}`);
4160
4294
  console.log(` Total VRAM: ${info.info.totalVRAM}GB`);
4161
4295
  for (const gpu of info.info.gpus) {
4162
4296
  console.log(` ${gpu.name}: ${gpu.memory.total}GB`);
@@ -101,13 +101,89 @@ function nsToSec(ns) {
101
101
  return (ns / 1e9).toFixed(2);
102
102
  }
103
103
 
104
+ function tryParseJSON(text) {
105
+ try {
106
+ return JSON.parse(text);
107
+ } catch {
108
+ return null;
109
+ }
110
+ }
111
+
112
+ function formatExportBlock(envObject) {
113
+ if (!envObject || typeof envObject !== "object") return "";
114
+ const entries = Object.entries(envObject).filter(([, value]) => value !== undefined && value !== null);
115
+ if (entries.length === 0) return "";
116
+ return entries
117
+ .map(([key, value]) => `export ${key}="${String(value)}"`)
118
+ .join("\n");
119
+ }
120
+
121
+ function summarizeOllamaPlan(payload) {
122
+ if (!payload || typeof payload !== "object") return null;
123
+ const plan = payload.plan;
124
+ if (!plan || typeof plan !== "object") return null;
125
+
126
+ const selectedModels = Array.isArray(plan.models)
127
+ ? plan.models.map((model) => model?.name).filter(Boolean)
128
+ : [];
129
+ const hardware = plan.hardware || {};
130
+ const memory = plan.memory || {};
131
+ const recommendation = plan.recommendation || {};
132
+ const risk = plan.risk || {};
133
+
134
+ const lines = [
135
+ "OLLAMA CAPACITY PLAN",
136
+ `Hardware: ${hardware.backendName || hardware.backend || "unknown"}`,
137
+ `Models: ${selectedModels.length > 0 ? selectedModels.join(", ") : "none selected"}`,
138
+ "",
139
+ "Recommended envelope:",
140
+ ` Context: ${plan.envelope?.context?.recommended ?? "?"}`,
141
+ ` Parallel: ${plan.envelope?.parallel?.recommended ?? "?"}`,
142
+ ` Loaded models: ${plan.envelope?.loaded_models?.recommended ?? "?"}`,
143
+ ` Estimated memory: ${memory.recommendedEstimatedGB ?? "?"}GB / ${memory.budgetGB ?? "?"}GB (${memory.utilizationPercent ?? "?"}%)`,
144
+ ` Risk: ${(risk.level || "unknown").toUpperCase()} (${risk.score ?? "?"}/100)`,
145
+ ];
146
+
147
+ if (recommendation && Object.keys(recommendation).length > 0) {
148
+ lines.push("");
149
+ lines.push("Recommended env vars:");
150
+ if (recommendation.num_ctx !== undefined) lines.push(` export OLLAMA_NUM_CTX="${recommendation.num_ctx}"`);
151
+ if (recommendation.num_parallel !== undefined) lines.push(` export OLLAMA_NUM_PARALLEL="${recommendation.num_parallel}"`);
152
+ if (recommendation.max_loaded_models !== undefined) lines.push(` export OLLAMA_MAX_LOADED_MODELS="${recommendation.max_loaded_models}"`);
153
+ if (recommendation.max_queue !== undefined) lines.push(` export OLLAMA_MAX_QUEUE="${recommendation.max_queue}"`);
154
+ if (recommendation.keep_alive !== undefined) lines.push(` export OLLAMA_KEEP_ALIVE="${recommendation.keep_alive}"`);
155
+ if (recommendation.flash_attention !== undefined) lines.push(` export OLLAMA_FLASH_ATTENTION="${recommendation.flash_attention}"`);
156
+ }
157
+
158
+ return lines.join("\n");
159
+ }
160
+
161
+ const ALLOWED_CLI_COMMANDS = new Set([
162
+ "policy",
163
+ "audit",
164
+ "calibrate",
165
+ "check",
166
+ "ollama",
167
+ "installed",
168
+ "ollama-plan",
169
+ "recommend",
170
+ "list-models",
171
+ "ai-check",
172
+ "ai-run",
173
+ "demo",
174
+ "sync",
175
+ "search",
176
+ "smart-recommend",
177
+ "hw-detect",
178
+ ]);
179
+
104
180
  // ============================================================================
105
181
  // MCP SERVER
106
182
  // ============================================================================
107
183
 
108
184
  const server = new McpServer({
109
185
  name: "llm-checker",
110
- version: "3.2.0",
186
+ version: "3.4.0",
111
187
  });
112
188
 
113
189
  // ============================================================================
@@ -198,6 +274,352 @@ server.tool(
198
274
  }
199
275
  );
200
276
 
277
+ server.tool(
278
+ "ollama_plan",
279
+ "Build an Ollama capacity plan for selected local models and return recommended context/parallel/memory settings",
280
+ {
281
+ models: z
282
+ .array(z.string())
283
+ .optional()
284
+ .describe("Optional list of model tags/families to include (default: all local models)"),
285
+ ctx: z.number().int().positive().optional().describe("Target context window in tokens"),
286
+ concurrency: z.number().int().positive().optional().describe("Target parallel request count"),
287
+ objective: z
288
+ .enum(["latency", "balanced", "throughput"])
289
+ .optional()
290
+ .describe("Optimization objective"),
291
+ reserve_gb: z.number().min(0).optional().describe("Memory reserve in GB for OS/background workloads"),
292
+ },
293
+ async ({ models, ctx, concurrency, objective, reserve_gb }) => {
294
+ const args = ["ollama-plan", "--json"];
295
+ if (Array.isArray(models) && models.length > 0) args.push("--models", ...models);
296
+ if (ctx !== undefined) args.push("--ctx", String(ctx));
297
+ if (concurrency !== undefined) args.push("--concurrency", String(concurrency));
298
+ if (objective) args.push("--objective", objective);
299
+ if (reserve_gb !== undefined) args.push("--reserve-gb", String(reserve_gb));
300
+
301
+ const result = await run(args, 180000);
302
+ const payload = tryParseJSON(result);
303
+
304
+ if (!payload) {
305
+ return {
306
+ content: [{ type: "text", text: result }],
307
+ };
308
+ }
309
+
310
+ const summary = summarizeOllamaPlan(payload);
311
+ const output = summary
312
+ ? `${summary}\n\nRAW JSON:\n${JSON.stringify(payload, null, 2)}`
313
+ : JSON.stringify(payload, null, 2);
314
+
315
+ return {
316
+ content: [{ type: "text", text: output }],
317
+ };
318
+ }
319
+ );
320
+
321
+ server.tool(
322
+ "ollama_plan_env",
323
+ "Return shell export commands from an Ollama capacity plan (recommended or fallback profile)",
324
+ {
325
+ profile: z
326
+ .enum(["recommended", "fallback"])
327
+ .optional()
328
+ .describe("Which profile to return (default: recommended)"),
329
+ models: z
330
+ .array(z.string())
331
+ .optional()
332
+ .describe("Optional list of model tags/families to include (default: all local models)"),
333
+ ctx: z.number().int().positive().optional().describe("Target context window in tokens"),
334
+ concurrency: z.number().int().positive().optional().describe("Target parallel request count"),
335
+ objective: z
336
+ .enum(["latency", "balanced", "throughput"])
337
+ .optional()
338
+ .describe("Optimization objective"),
339
+ reserve_gb: z.number().min(0).optional().describe("Memory reserve in GB for OS/background workloads"),
340
+ },
341
+ async ({ profile, models, ctx, concurrency, objective, reserve_gb }) => {
342
+ const args = ["ollama-plan", "--json"];
343
+ if (Array.isArray(models) && models.length > 0) args.push("--models", ...models);
344
+ if (ctx !== undefined) args.push("--ctx", String(ctx));
345
+ if (concurrency !== undefined) args.push("--concurrency", String(concurrency));
346
+ if (objective) args.push("--objective", objective);
347
+ if (reserve_gb !== undefined) args.push("--reserve-gb", String(reserve_gb));
348
+
349
+ const result = await run(args, 180000);
350
+ const payload = tryParseJSON(result);
351
+ if (!payload?.plan) {
352
+ return {
353
+ content: [{ type: "text", text: `Failed to parse ollama-plan output:\n${result}` }],
354
+ isError: true,
355
+ };
356
+ }
357
+
358
+ const selectedProfile = profile || "recommended";
359
+ const plan = payload.plan;
360
+ let envValues = null;
361
+
362
+ if (selectedProfile === "fallback") {
363
+ const fallback = plan.fallback || {};
364
+ envValues = {
365
+ OLLAMA_NUM_CTX: fallback.num_ctx,
366
+ OLLAMA_NUM_PARALLEL: fallback.num_parallel,
367
+ OLLAMA_MAX_LOADED_MODELS: fallback.max_loaded_models,
368
+ };
369
+ } else {
370
+ envValues = plan.shell?.env || null;
371
+ if (!envValues) {
372
+ const recommendation = plan.recommendation || {};
373
+ envValues = {
374
+ OLLAMA_NUM_CTX: recommendation.num_ctx,
375
+ OLLAMA_NUM_PARALLEL: recommendation.num_parallel,
376
+ OLLAMA_MAX_LOADED_MODELS: recommendation.max_loaded_models,
377
+ OLLAMA_MAX_QUEUE: recommendation.max_queue,
378
+ OLLAMA_KEEP_ALIVE: recommendation.keep_alive,
379
+ OLLAMA_FLASH_ATTENTION: recommendation.flash_attention,
380
+ };
381
+ }
382
+ }
383
+
384
+ const exports = formatExportBlock(envValues);
385
+ if (!exports) {
386
+ return {
387
+ content: [{ type: "text", text: "No environment values available for this plan/profile." }],
388
+ isError: true,
389
+ };
390
+ }
391
+
392
+ return {
393
+ content: [
394
+ {
395
+ type: "text",
396
+ text: [`PROFILE: ${selectedProfile.toUpperCase()}`, "", exports].join("\n"),
397
+ },
398
+ ],
399
+ };
400
+ }
401
+ );
402
+
403
+ server.tool(
404
+ "cli_help",
405
+ "List all llm-checker CLI commands exposed via cli_exec",
406
+ {},
407
+ async () => {
408
+ const commands = [...ALLOWED_CLI_COMMANDS].sort();
409
+ const lines = [
410
+ "Available commands for cli_exec:",
411
+ ...commands.map((command) => ` - ${command}`),
412
+ "",
413
+ "Examples:",
414
+ ' cli_exec command="ollama-plan" args=["--json"]',
415
+ ' cli_exec command="policy" args=["validate","--file","policy.yaml","--json"]',
416
+ ' cli_exec command="search" args=["qwen","--use-case","coding","--limit","5"]',
417
+ ];
418
+ return { content: [{ type: "text", text: lines.join("\n") }] };
419
+ }
420
+ );
421
+
422
+ server.tool(
423
+ "cli_exec",
424
+ "Execute any supported llm-checker CLI command (allowlisted) with custom arguments",
425
+ {
426
+ command: z.string().describe("Top-level command (use cli_help to list allowed commands)"),
427
+ args: z
428
+ .array(z.string())
429
+ .optional()
430
+ .describe("Additional CLI args, exactly as used in terminal (without shell quoting)"),
431
+ timeout_ms: z.number().int().min(1000).max(600000).optional().describe("Execution timeout in milliseconds"),
432
+ },
433
+ async ({ command, args, timeout_ms }) => {
434
+ const trimmedCommand = String(command || "").trim();
435
+ if (!ALLOWED_CLI_COMMANDS.has(trimmedCommand)) {
436
+ return {
437
+ content: [
438
+ {
439
+ type: "text",
440
+ text: `Unsupported command "${trimmedCommand}". Use cli_help to list allowed commands.`,
441
+ },
442
+ ],
443
+ isError: true,
444
+ };
445
+ }
446
+
447
+ const safeArgs = Array.isArray(args) ? args : [];
448
+ if (safeArgs.length > 100) {
449
+ return {
450
+ content: [{ type: "text", text: "Too many arguments. Limit is 100." }],
451
+ isError: true,
452
+ };
453
+ }
454
+
455
+ const result = await run([trimmedCommand, ...safeArgs], timeout_ms || 180000);
456
+ return { content: [{ type: "text", text: result }] };
457
+ }
458
+ );
459
+
460
+ server.tool(
461
+ "policy_validate",
462
+ "Validate a policy file against the v1 schema and return structured validation output",
463
+ {
464
+ file: z.string().optional().describe("Policy file path (default: policy.yaml)"),
465
+ },
466
+ async ({ file }) => {
467
+ const args = ["policy", "validate", "--json"];
468
+ if (file) args.push("--file", file);
469
+
470
+ const result = await run(args, 120000);
471
+ const payload = tryParseJSON(result);
472
+ if (!payload) {
473
+ return {
474
+ content: [{ type: "text", text: result }],
475
+ };
476
+ }
477
+
478
+ const status = payload.valid ? "VALID" : "INVALID";
479
+ const header = [
480
+ `POLICY VALIDATION: ${status}`,
481
+ `File: ${payload.file || file || "policy.yaml"}`,
482
+ `Errors: ${payload.errorCount ?? (Array.isArray(payload.errors) ? payload.errors.length : 0)}`,
483
+ ].join("\n");
484
+
485
+ return {
486
+ content: [{ type: "text", text: `${header}\n\n${JSON.stringify(payload, null, 2)}` }],
487
+ isError: !payload.valid,
488
+ };
489
+ }
490
+ );
491
+
492
+ server.tool(
493
+ "audit_export",
494
+ "Run policy compliance audit export (json/csv/sarif/all) for check/recommend flows",
495
+ {
496
+ policy: z.string().describe("Policy file path"),
497
+ command: z
498
+ .enum(["check", "recommend"])
499
+ .optional()
500
+ .describe("Evaluation source (default: check)"),
501
+ format: z
502
+ .enum(["json", "csv", "sarif", "all"])
503
+ .optional()
504
+ .describe("Export format (default: json)"),
505
+ out: z.string().optional().describe("Output file path (single format only)"),
506
+ out_dir: z.string().optional().describe("Output directory when --out is omitted"),
507
+ use_case: z.string().optional().describe("Use case when command=check"),
508
+ category: z.string().optional().describe("Category hint when command=recommend"),
509
+ optimize: z
510
+ .enum(["balanced", "speed", "quality", "context", "coding"])
511
+ .optional()
512
+ .describe("Optimization profile when command=recommend"),
513
+ runtime: z
514
+ .enum(["ollama", "vllm", "mlx"])
515
+ .optional()
516
+ .describe("Runtime backend for check mode"),
517
+ include_cloud: z.boolean().optional().describe("Include cloud models in check-mode analysis"),
518
+ max_size: z.string().optional().describe('Maximum model size for check mode (example: "24B" or "12GB")'),
519
+ min_size: z.string().optional().describe('Minimum model size for check mode (example: "3B" or "2GB")'),
520
+ limit: z.number().int().positive().optional().describe("Model analysis limit for check mode"),
521
+ verbose: z.boolean().optional().describe("Enable verbose progress (default: true)"),
522
+ },
523
+ async ({
524
+ policy,
525
+ command,
526
+ format,
527
+ out,
528
+ out_dir,
529
+ use_case,
530
+ category,
531
+ optimize,
532
+ runtime,
533
+ include_cloud,
534
+ max_size,
535
+ min_size,
536
+ limit,
537
+ verbose,
538
+ }) => {
539
+ const args = ["audit", "export", "--policy", policy];
540
+ if (command) args.push("--command", command);
541
+ if (format) args.push("--format", format);
542
+ if (out) args.push("--out", out);
543
+ if (out_dir) args.push("--out-dir", out_dir);
544
+ if (use_case) args.push("--use-case", use_case);
545
+ if (category) args.push("--category", category);
546
+ if (optimize) args.push("--optimize", optimize);
547
+ if (runtime) args.push("--runtime", runtime);
548
+ if (include_cloud) args.push("--include-cloud");
549
+ if (max_size) args.push("--max-size", max_size);
550
+ if (min_size) args.push("--min-size", min_size);
551
+ if (limit !== undefined) args.push("--limit", String(limit));
552
+ if (verbose === false) args.push("--no-verbose");
553
+
554
+ const result = await run(args, 300000);
555
+ const hadFailure =
556
+ /audit export failed:/i.test(result) ||
557
+ /blocking violations detected/i.test(result) ||
558
+ /enforcement result:\s*blocking/i.test(result);
559
+ return {
560
+ content: [{ type: "text", text: result }],
561
+ isError: hadFailure,
562
+ };
563
+ }
564
+ );
565
+
566
+ server.tool(
567
+ "calibrate",
568
+ "Generate calibration artifacts from a JSONL prompt suite (dry-run, contract-only, or full benchmark mode)",
569
+ {
570
+ suite: z.string().describe("Prompt suite path in JSONL format"),
571
+ models: z.array(z.string()).describe("Model identifiers to include"),
572
+ output: z.string().describe("Calibration result output path (.json/.yaml/.yml)"),
573
+ runtime: z
574
+ .enum(["ollama", "vllm", "mlx"])
575
+ .optional()
576
+ .describe("Inference runtime backend"),
577
+ mode: z
578
+ .enum(["dry-run", "contract-only", "full"])
579
+ .optional()
580
+ .describe("Execution mode"),
581
+ objective: z
582
+ .enum(["speed", "quality", "balanced"])
583
+ .optional()
584
+ .describe("Calibration objective"),
585
+ policy_out: z.string().optional().describe("Optional calibration policy output path"),
586
+ warmup: z.number().int().positive().optional().describe("Warmup runs per prompt in full mode"),
587
+ iterations: z.number().int().positive().optional().describe("Measured iterations per prompt in full mode"),
588
+ timeout_ms: z.number().int().positive().optional().describe("Per-prompt timeout in full mode (ms)"),
589
+ dry_run: z.boolean().optional().describe("Shortcut flag for dry-run mode"),
590
+ },
591
+ async ({
592
+ suite,
593
+ models,
594
+ output,
595
+ runtime,
596
+ mode,
597
+ objective,
598
+ policy_out,
599
+ warmup,
600
+ iterations,
601
+ timeout_ms,
602
+ dry_run,
603
+ }) => {
604
+ const args = ["calibrate", "--suite", suite, "--models", ...models, "--output", output];
605
+ if (runtime) args.push("--runtime", runtime);
606
+ if (mode) args.push("--mode", mode);
607
+ if (objective) args.push("--objective", objective);
608
+ if (policy_out) args.push("--policy-out", policy_out);
609
+ if (warmup !== undefined) args.push("--warmup", String(warmup));
610
+ if (iterations !== undefined) args.push("--iterations", String(iterations));
611
+ if (timeout_ms !== undefined) args.push("--timeout-ms", String(timeout_ms));
612
+ if (dry_run) args.push("--dry-run");
613
+
614
+ const result = await run(args, 600000);
615
+ const hadFailure = /calibration failed:/i.test(result);
616
+ return {
617
+ content: [{ type: "text", text: result }],
618
+ isError: hadFailure,
619
+ };
620
+ }
621
+ );
622
+
201
623
  // ============================================================================
202
624
  // OLLAMA MANAGEMENT TOOLS
203
625
  // ============================================================================
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "llm-checker",
3
- "version": "3.4.0",
3
+ "version": "3.4.2",
4
4
  "description": "Intelligent CLI tool with AI-powered model selection that analyzes your hardware and recommends optimal LLM models for your system",
5
5
  "bin": {
6
6
  "llm-checker": "bin/cli.js",
@@ -322,7 +322,7 @@ class CUDADetector {
322
322
  const modelRaw = this.readJetsonModel();
323
323
  const model = this.normalizeJetsonModel(modelRaw);
324
324
  const cudaVersion = this.detectJetsonCudaVersion();
325
- const driverVersion = this.detectJetsonDriverVersion();
325
+ const driverVersion = this.detectJetsonDriverVersion() || 'unknown';
326
326
  const totalSystemGB = Math.max(1, Math.round(os.totalmem() / (1024 ** 3)));
327
327
  const sharedGpuMemoryGB = Math.max(1, Math.round(totalSystemGB * 0.85));
328
328
  const capabilities = this.getJetsonCapabilities(modelRaw || model);
@@ -423,11 +423,26 @@ class CUDADetector {
423
423
  }
424
424
 
425
425
  detectJetsonDriverVersion() {
426
- const versionInfo = this.readFileIfExists('/proc/driver/nvidia/version');
427
- if (!versionInfo) return null;
426
+ const driverSources = [
427
+ '/proc/driver/nvidia/version',
428
+ '/sys/module/nvidia/version'
429
+ ];
430
+
431
+ for (const source of driverSources) {
432
+ const versionInfo = this.readFileIfExists(source);
433
+ if (!versionInfo) continue;
434
+
435
+ const kernelMatch = versionInfo.match(/Kernel Module(?:\s+for\s+\w+)?\s+([0-9]+(?:\.[0-9]+){1,3})/i);
436
+ if (kernelMatch) return kernelMatch[1];
437
+
438
+ const nvrmMatch = versionInfo.match(/NVRM version:\s*.*?([0-9]+(?:\.[0-9]+){1,3})/i);
439
+ if (nvrmMatch) return nvrmMatch[1];
428
440
 
429
- const match = versionInfo.match(/Kernel Module\s+([0-9.]+)/i);
430
- return match ? match[1] : null;
441
+ const genericMatch = versionInfo.match(/\b([0-9]+(?:\.[0-9]+){1,3})\b/);
442
+ if (genericMatch) return genericMatch[1];
443
+ }
444
+
445
+ return null;
431
446
  }
432
447
 
433
448
  getJetsonCapabilities(model) {
@@ -734,10 +749,13 @@ class CUDADetector {
734
749
  const primary = this.getPrimaryGPU();
735
750
  const gpuName = primary.name.toLowerCase()
736
751
  .replace(/nvidia|geforce|quadro|tesla/gi, '')
737
- .replace(/\s+/g, '-')
738
- .trim();
752
+ .replace(/[^a-z0-9]+/gi, '-')
753
+ .replace(/-+/g, '-')
754
+ .replace(/^-|-$/g, '');
755
+ const normalizedGpuName = gpuName || 'gpu';
756
+ const normalizedVRAM = Number.isFinite(info.totalVRAM) ? Math.max(0, Math.round(info.totalVRAM)) : 0;
739
757
 
740
- return `cuda-${gpuName}-${info.totalVRAM}gb${info.isMultiGPU ? '-x' + info.gpus.length : ''}`;
758
+ return `cuda-${normalizedGpuName}-${normalizedVRAM}gb${info.isMultiGPU ? '-x' + info.gpus.length : ''}`;
741
759
  }
742
760
 
743
761
  /**
@@ -0,0 +1,496 @@
1
/**
 * Heuristic planner that maps Ollama models onto the available GPU (or CPU)
 * topology. For each model it simulates three placement strategies — "pin"
 * (single GPU), "replica" (independent copies per GPU), and "spread" (one
 * model sharded across GPUs) — estimates memory use, throughput and risk,
 * and recommends the best strategy for a given objective.
 *
 * All numbers are coarse engineering estimates, not measurements.
 */
class OllamaGPUPlacementPlanner {
  /**
   * @param {object} [options]
   * @param {number} [options.minContext=2048] - Floor used when context tokens are missing/invalid.
   * @param {number} [options.defaultReserveGB=1] - VRAM reserved for the OS/driver; 0 disables reservation.
   * @param {number} [options.kvFactorPer4k=0.08] - KV-cache GB per 1B params at 4K context.
   * @param {number} [options.modelOverheadGB=0.7] - Fixed runtime overhead per loaded model.
   * @param {number} [options.spreadOverheadGB=0.35] - Extra per-GPU overhead for sharded placement.
   */
  constructor(options = {}) {
    // Use `??` (not `||`) so an explicit 0 override is honored — e.g.
    // `defaultReserveGB: 0` must disable reservation, not fall back to 1.
    this.minContext = options.minContext ?? 2048;
    this.defaultReserveGB = options.defaultReserveGB ?? 1;
    this.kvFactorPer4k = options.kvFactorPer4k ?? 0.08; // GB per 1B params at 4K context
    this.modelOverheadGB = options.modelOverheadGB ?? 0.7;
    this.spreadOverheadGB = options.spreadOverheadGB ?? 0.35;
  }

  /**
   * Coerce a value to a finite number, falling back when coercion fails.
   * NOTE: `Number(null) === 0`, so callers that treat null as "unset" must
   * check for nullish values before calling this.
   */
  toFiniteNumber(value, fallback = 0) {
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : fallback;
  }

  /** Clamp `value` into the inclusive range [min, max]. */
  clamp(value, min, max) {
    return Math.min(max, Math.max(min, value));
  }

  /**
   * Normalize an objective string to one of 'latency' | 'balanced' |
   * 'throughput'; anything unrecognized becomes 'balanced'.
   */
  normalizeObjective(objective) {
    const normalized = String(objective || 'balanced').toLowerCase();
    if (normalized === 'latency' || normalized === 'balanced' || normalized === 'throughput') {
      return normalized;
    }
    return 'balanced';
  }

  /**
   * Estimate model size in billions of parameters.
   * Order: "<n>B" in `model.size` → "<n>b" in `model.name` → derived from
   * file size (assuming ~0.65 GB per 1B params at Q4-ish quantization) →
   * a 7B default.
   */
  parseParamsB(model = {}) {
    const bySize = String(model.size || '').match(/(\d+(?:\.\d+)?)\s*b/i);
    if (bySize) {
      return this.toFiniteNumber(bySize[1], 0);
    }

    const byName = String(model.name || '').match(/(\d+(?:\.\d+)?)\s*b\b/i);
    if (byName) {
      return this.toFiniteNumber(byName[1], 0);
    }

    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB / 0.65;
    }

    return 7;
  }

  /**
   * Estimate resident memory (GB) for weights + fixed runtime overhead,
   * excluding the KV cache (which scales with context/concurrency).
   */
  estimateBaseMemoryGB(model = {}) {
    const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, 0);
    if (fileSizeGB > 0) {
      return fileSizeGB + this.modelOverheadGB;
    }

    const paramsB = this.parseParamsB(model);
    return paramsB * 0.65 + this.modelOverheadGB;
  }

  /** Estimate KV-cache memory (GB) for one request at `contextTokens`. */
  estimateKVCacheGB(paramsB, contextTokens) {
    const ctx = this.toFiniteNumber(contextTokens, this.minContext);
    return paramsB * this.kvFactorPer4k * (ctx / 4096);
  }

  /**
   * Map a free-form quantization label to a canonical level.
   * Unknown labels default to 'Q4_K_M' (the common Ollama default).
   */
  normalizeQuantization(rawQuant) {
    const quant = String(rawQuant || 'Q4_K_M').toUpperCase();
    if (quant.includes('FP16') || quant.includes('F16')) return 'FP16';
    if (quant.includes('Q8')) return 'Q8_0';
    if (quant.includes('Q6')) return 'Q6_K';
    if (quant.includes('Q5')) return 'Q5_K_M';
    if (quant.includes('IQ4')) return 'IQ4_XS';
    if (quant.includes('Q4')) return 'Q4_K_M';
    if (quant.includes('IQ3')) return 'IQ3_XXS';
    if (quant.includes('Q3')) return 'Q3_K_M';
    if (quant.includes('Q2')) return 'Q2_K';
    return 'Q4_K_M';
  }

  /**
   * Relative speed multiplier vs FP16 for a quantization level — smaller
   * weights move less memory per token, so lower-bit quants run faster.
   */
  quantizationMultiplier(quantization) {
    const table = {
      FP16: 1.0,
      Q8_0: 1.5,
      Q6_K: 1.8,
      Q5_K_M: 2.0,
      Q4_K_M: 2.4,
      IQ4_XS: 2.5,
      Q3_K_M: 2.9,
      IQ3_XXS: 3.1,
      Q2_K: 3.4
    };
    return table[this.normalizeQuantization(quantization)] || 2.0;
  }

  /**
   * Rough tokens/sec estimate for one GPU: speed coefficient scaled down by
   * parameter count, up by quantization, and slightly down for long contexts.
   */
  estimateTokensPerSecond(gpu, model, contextTokens) {
    const paramsB = Math.max(0.5, this.toFiniteNumber(model.paramsB, 7));
    const speedCoefficient = Math.max(1, this.toFiniteNumber(gpu.speedCoefficient, 60));
    const quantMult = this.quantizationMultiplier(model.quantization);

    // Larger contexts reduce generation speed in practice.
    const contextScale = Math.max(0.55, Math.pow(4096 / Math.max(4096, contextTokens), 0.12));
    return Math.max(1, Math.round((speedCoefficient / paramsB) * quantMult * contextScale));
  }

  /**
   * Normalize raw model descriptors into {name, size, fileSizeGB, paramsB,
   * baseMemoryGB, quantization}, sorted by descending memory footprint so
   * the hardest-to-place models are reported first.
   */
  normalizeModels(models = []) {
    const normalized = models
      .filter((model) => model && model.name)
      .map((model) => {
        const paramsB = this.parseParamsB(model);
        const baseMemoryGB = this.estimateBaseMemoryGB(model);
        const fileSizeGB = this.toFiniteNumber(model.fileSizeGB, Math.max(0, baseMemoryGB - this.modelOverheadGB));
        const quantization = model.quantization || model.details?.quantization_level || 'Q4_K_M';
        return {
          name: model.name,
          size: model.size || `${Math.round(paramsB)}B`,
          fileSizeGB: Math.round(fileSizeGB * 10) / 10,
          paramsB: Math.round(paramsB * 10) / 10,
          baseMemoryGB: Math.round(baseMemoryGB * 100) / 100,
          quantization
        };
      });

    normalized.sort((a, b) => b.baseMemoryGB - a.baseMemoryGB);
    return normalized;
  }

  /**
   * Env var controlling GPU visibility for a backend, or null when the
   * backend has no standard mechanism (e.g. Metal, CPU).
   */
  resolveDeviceEnvVar(backend) {
    if (backend === 'cuda') return 'CUDA_VISIBLE_DEVICES';
    if (backend === 'rocm') return 'HIP_VISIBLE_DEVICES';
    return null;
  }

  /**
   * Build a normalized hardware plan from a detector result: GPU list
   * (falling back to aggregate VRAM, then to a CPU pseudo-device), usable
   * memory after reservation, and backend metadata.
   *
   * @param {object} hardware - Detector output ({summary, primary, backends, memory}).
   * @param {number|null} reserveGB - Explicit VRAM reservation; null/undefined
   *   means "use this.defaultReserveGB" (0 is a valid explicit value).
   */
  resolveHardware(hardware = {}, reserveGB = null) {
    const summary = hardware.summary || {};
    const primary = hardware.primary || {};
    const backend = summary.bestBackend || primary.type || 'cpu';
    const backendName = summary.backendName || primary.name || 'CPU';

    const backendInfo = primary.info || hardware.backends?.[backend]?.info || {};
    const rawGpus = Array.isArray(backendInfo.gpus) ? backendInfo.gpus : [];

    let gpus = rawGpus.map((gpu, index) => ({
      index: this.toFiniteNumber(gpu.index, index),
      name: String(gpu.name || `GPU ${index}`),
      memoryGB: Math.max(1, this.toFiniteNumber(gpu.memory?.total, 0)),
      speedCoefficient: Math.max(1, this.toFiniteNumber(gpu.speedCoefficient, summary.speedCoefficient || 60))
    }));

    // Fallback 1: no per-GPU detail, but the summary reports total VRAM.
    if (!gpus.length && this.toFiniteNumber(summary.totalVRAM, 0) > 0) {
      gpus = [{
        index: 0,
        name: summary.gpuModel || 'GPU 0',
        memoryGB: Math.max(1, this.toFiniteNumber(summary.totalVRAM, 0)),
        speedCoefficient: Math.max(1, this.toFiniteNumber(summary.speedCoefficient, 80))
      }];
    }

    // Fallback 2: CPU-only — model system RAM as a single slow device.
    if (!gpus.length) {
      gpus = [{
        index: 0,
        name: summary.cpuModel || 'CPU',
        memoryGB: Math.max(4, this.toFiniteNumber(summary.effectiveMemory, this.toFiniteNumber(hardware.memory?.total, 16) * 0.7)),
        speedCoefficient: Math.max(1, this.toFiniteNumber(summary.speedCoefficient, 25))
      }];
    }

    // Fastest (then largest) devices first so strategies prefer them.
    gpus.sort((a, b) => {
      if (b.speedCoefficient !== a.speedCoefficient) return b.speedCoefficient - a.speedCoefficient;
      return b.memoryGB - a.memoryGB;
    });

    // FIX: `toFiniteNumber(null, …)` returned 0 because Number(null) === 0,
    // which silently discarded defaultReserveGB whenever the caller omitted
    // reserveGB (the default path from plan()). Treat nullish as "unset".
    const reserve = this.toFiniteNumber(reserveGB ?? this.defaultReserveGB, this.defaultReserveGB);
    const reservePerGPU = reserve / Math.max(1, gpus.length);
    const usableGPUs = gpus.map((gpu) => ({
      ...gpu,
      usableMemoryGB: Math.max(1, Math.round((gpu.memoryGB - reservePerGPU) * 100) / 100)
    }));

    const totalUsableGB = usableGPUs.reduce((sum, gpu) => sum + gpu.usableMemoryGB, 0);

    return {
      backend,
      backendName,
      reserveGB: Math.round(reserve * 100) / 100,
      isMultiGPU: usableGPUs.length > 1,
      deviceEnvVar: this.resolveDeviceEnvVar(backend),
      gpus: usableGPUs,
      totalUsableGB: Math.round(totalUsableGB * 100) / 100
    };
  }

  /**
   * Derive a risk descriptor {level, score} from memory utilization,
   * feasibility, and strategy complexity (spread > replica > pin).
   */
  makeRisk(utilization, fits, strategy) {
    if (!fits) {
      return { level: 'critical', score: 95 };
    }

    const complexityPenalty = strategy === 'spread' ? 12 : strategy === 'replica' ? 6 : 0;
    const score = Math.min(100, Math.round((utilization * 72) + complexityPenalty));

    let level = 'low';
    if (score >= 75) level = 'critical';
    else if (score >= 55) level = 'high';
    else if (score >= 35) level = 'medium';

    return { level, score };
  }

  /**
   * Objective score for a strategy plan: estimated throughput minus
   * objective-weighted risk, complexity penalty, and a large penalty for
   * infeasible plans (so they only win when nothing fits).
   */
  strategyScore(strategyPlan, objective) {
    const complexityPenalty = strategyPlan.strategy === 'spread'
      ? (objective === 'latency' ? 12 : 8)
      : strategyPlan.strategy === 'replica'
        ? (objective === 'latency' ? 5 : 3)
        : 0;

    const riskWeight = objective === 'throughput' ? 0.15 : objective === 'latency' ? 0.28 : 0.22;
    const infeasiblePenalty = strategyPlan.feasible ? 0 : 220;

    return strategyPlan.estimated_tps - (strategyPlan.risk.score * riskWeight) - complexityPenalty - infeasiblePenalty;
  }

  /**
   * Simulate "pin": the whole model plus all concurrent KV caches on the
   * single fastest GPU.
   */
  simulatePin(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpu = hardwarePlan.gpus[0];
    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);
    const totalMemoryGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);
    const fits = totalMemoryGB <= gpu.usableMemoryGB;
    const utilization = totalMemoryGB / Math.max(0.1, gpu.usableMemoryGB);

    const baseTPS = this.estimateTokensPerSecond(gpu, model, contextTokens);
    // Overcommit degrades throughput roughly in proportion to the overage.
    const throughputPenalty = fits ? 1 : Math.max(0.25, gpu.usableMemoryGB / Math.max(0.1, totalMemoryGB));

    return {
      strategy: 'pin',
      feasible: fits,
      estimated_tps: Math.max(1, Math.round(baseTPS * throughputPenalty)),
      memory_per_gpu_gb: Math.round(totalMemoryGB * 100) / 100,
      total_memory_gb: Math.round(totalMemoryGB * 100) / 100,
      utilization_percent: Math.round(utilization * 100),
      gpu_count: 1,
      placement: [{
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        concurrency: targetConcurrency
      }],
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? String(gpu.index) : null,
      risk: this.makeRisk(utilization, fits, 'pin'),
      notes: fits
        ? ['Single-GPU placement keeps routing simple and minimizes scheduling overhead.']
        : ['Model+context+concurrency exceeds single-GPU memory.']
    };
  }

  /**
   * Simulate "replica": independent model copies on up to N GPUs, with the
   * requested concurrency split across replicas. Prefers the highest replica
   * count whose per-GPU footprint fits.
   */
  simulateReplica(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpus = hardwarePlan.gpus;
    const maxReplicas = Math.min(gpus.length, targetConcurrency);
    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);

    let selectedReplicas = 1;
    let memoryPerReplicaGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);
    let feasible = false;

    for (let replicas = maxReplicas; replicas >= 1; replicas -= 1) {
      const perReplicaConcurrency = Math.ceil(targetConcurrency / replicas);
      const candidateMemory = model.baseMemoryGB + (kvPerRequestGB * perReplicaConcurrency);
      const candidateGPUs = gpus.slice(0, replicas);
      const fitsAll = candidateGPUs.every((gpu) => candidateMemory <= gpu.usableMemoryGB);
      if (fitsAll) {
        selectedReplicas = replicas;
        memoryPerReplicaGB = candidateMemory;
        feasible = true;
        break;
      }
    }

    const chosenGPUs = gpus.slice(0, selectedReplicas);
    const baseTPS = chosenGPUs.reduce(
      (sum, gpu) => sum + this.estimateTokensPerSecond(gpu, model, contextTokens),
      0
    );
    // Routing/scheduling overhead grows slightly with replica count.
    const replicaEfficiency = Math.max(0.8, 0.95 - ((selectedReplicas - 1) * 0.02));
    const estimatedTPS = Math.max(1, Math.round(baseTPS * replicaEfficiency));

    // Risk is driven by the most-utilized replica.
    const maxUtilization = chosenGPUs.reduce((max, gpu) => {
      const util = memoryPerReplicaGB / Math.max(0.1, gpu.usableMemoryGB);
      return Math.max(max, util);
    }, 0);

    // Distribute concurrency as evenly as possible (earlier GPUs take the
    // ceiling share of what remains).
    const placement = [];
    let remaining = targetConcurrency;
    for (let i = 0; i < chosenGPUs.length; i += 1) {
      const gpu = chosenGPUs[i];
      const slotsLeft = chosenGPUs.length - i;
      const assigned = Math.ceil(remaining / slotsLeft);
      placement.push({
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        concurrency: assigned
      });
      remaining -= assigned;
    }

    const visibleDevices = chosenGPUs.map((gpu) => gpu.index).join(',');

    return {
      strategy: 'replica',
      feasible,
      estimated_tps: estimatedTPS,
      memory_per_gpu_gb: Math.round(memoryPerReplicaGB * 100) / 100,
      total_memory_gb: Math.round(memoryPerReplicaGB * selectedReplicas * 100) / 100,
      utilization_percent: Math.round(maxUtilization * 100),
      gpu_count: selectedReplicas,
      placement,
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? visibleDevices : null,
      risk: this.makeRisk(maxUtilization, feasible, 'replica'),
      notes: feasible
        ? ['Replica strategy scales throughput by running independent model copies per GPU.']
        : ['No replica count can satisfy per-GPU memory constraints at requested settings.']
    };
  }

  /**
   * Simulate "spread": one model sharded across 2+ GPUs (tensor-parallel
   * style). Prefers the smallest shard count that fits; requires >= 2 GPUs.
   */
  simulateSpread(model, hardwarePlan, contextTokens, targetConcurrency) {
    const gpus = hardwarePlan.gpus;
    if (gpus.length < 2) {
      return {
        strategy: 'spread',
        feasible: false,
        estimated_tps: 0,
        memory_per_gpu_gb: 0,
        total_memory_gb: 0,
        utilization_percent: 0,
        gpu_count: 1,
        placement: [],
        device_env_var: hardwarePlan.deviceEnvVar,
        visible_devices: null,
        risk: { level: 'critical', score: 100 },
        notes: ['Tensor/spread placement requires at least two GPUs.']
      };
    }

    const kvPerRequestGB = this.estimateKVCacheGB(model.paramsB, contextTokens);
    const totalMemoryGB = model.baseMemoryGB + (kvPerRequestGB * targetConcurrency);

    let selectedGPUCount = 2;
    let memoryPerGPU = totalMemoryGB / selectedGPUCount + this.spreadOverheadGB;
    let feasible = false;

    for (let shardCount = 2; shardCount <= gpus.length; shardCount += 1) {
      const candidatePerGPU = totalMemoryGB / shardCount + this.spreadOverheadGB;
      const shardGPUs = gpus.slice(0, shardCount);
      const fits = shardGPUs.every((gpu) => candidatePerGPU <= gpu.usableMemoryGB);
      if (fits) {
        selectedGPUCount = shardCount;
        memoryPerGPU = candidatePerGPU;
        feasible = true;
        break;
      }
    }

    // Even infeasible plans report the widest spread so callers can see how
    // close a full shard comes to fitting.
    if (!feasible) {
      selectedGPUCount = gpus.length;
      memoryPerGPU = totalMemoryGB / selectedGPUCount + this.spreadOverheadGB;
    }

    const chosenGPUs = gpus.slice(0, selectedGPUCount);
    const primaryTPS = this.estimateTokensPerSecond(chosenGPUs[0], model, contextTokens);
    // Sub-linear scaling plus an interconnect penalty per extra shard.
    const scaleFactor = 1 + (0.55 * (selectedGPUCount - 1));
    const interconnectPenalty = Math.max(0.65, 1 - (0.07 * (selectedGPUCount - 1)));
    const estimatedTPS = Math.max(1, Math.round(primaryTPS * scaleFactor * interconnectPenalty));

    const minUsableMemory = chosenGPUs.reduce((min, gpu) => Math.min(min, gpu.usableMemoryGB), Infinity);
    const utilization = memoryPerGPU / Math.max(0.1, minUsableMemory);
    const visibleDevices = chosenGPUs.map((gpu) => gpu.index).join(',');

    return {
      strategy: 'spread',
      feasible,
      estimated_tps: estimatedTPS,
      memory_per_gpu_gb: Math.round(memoryPerGPU * 100) / 100,
      total_memory_gb: Math.round(memoryPerGPU * selectedGPUCount * 100) / 100,
      utilization_percent: Math.round(utilization * 100),
      gpu_count: selectedGPUCount,
      placement: chosenGPUs.map((gpu) => ({
        gpu_index: gpu.index,
        gpu_name: gpu.name,
        role: 'shard'
      })),
      device_env_var: hardwarePlan.deviceEnvVar,
      visible_devices: hardwarePlan.deviceEnvVar ? visibleDevices : null,
      risk: this.makeRisk(utilization, feasible, 'spread'),
      notes: feasible
        ? ['Spread strategy shards one model across multiple GPUs and favors capacity over simplicity.']
        : ['Even full spread cannot fit requested settings within per-GPU memory limits.']
    };
  }

  /**
   * Score every strategy against the objective and return the winner
   * (with its `objective_score` attached).
   */
  pickRecommendedStrategy(strategies, objective) {
    const scored = strategies.map((plan) => ({
      ...plan,
      objective_score: Math.round(this.strategyScore(plan, objective) * 100) / 100
    }));

    scored.sort((a, b) => b.objective_score - a.objective_score);
    return scored[0];
  }

  /** Assemble human-readable advisory notes for the overall plan. */
  buildNotes(hardwarePlan, modelPlans) {
    const notes = [];
    if (!hardwarePlan.isMultiGPU) {
      notes.push('Detected single-GPU (or CPU-only) runtime; replica/spread strategies may not be feasible.');
    }

    const infeasibleCount = modelPlans.filter((model) => !model.recommended?.feasible).length;
    if (infeasibleCount > 0) {
      notes.push(`${infeasibleCount} model(s) exceed safe memory at requested ctx/concurrency. Lower --ctx or --concurrency.`);
    }

    if (!hardwarePlan.deviceEnvVar) {
      notes.push('Backend does not expose a standard GPU visibility env var; use strategy output as conceptual placement guidance.');
    }

    return notes;
  }

  /**
   * Produce a full placement plan.
   *
   * @param {object} params
   * @param {object} params.hardware - Hardware-detector output.
   * @param {Array<object>} params.models - Model descriptors ({name, size, fileSizeGB, quantization, ...}).
   * @param {number} [params.targetContext=8192] - Desired context tokens (clamped to 512..131072).
   * @param {number} [params.targetConcurrency=2] - Desired concurrent requests (clamped to 1..64).
   * @param {string} [params.objective='balanced'] - 'latency' | 'balanced' | 'throughput'.
   * @param {number|null} [params.reserveGB=null] - VRAM reservation override; null uses defaultReserveGB.
   * @returns {object} Plan with normalized inputs, hardware summary, per-model
   *   strategy evaluations (recommended first), and advisory notes.
   * @throws {Error} When `models` contains no usable entries.
   */
  plan({
    hardware,
    models,
    targetContext = 8192,
    targetConcurrency = 2,
    objective = 'balanced',
    reserveGB = null
  }) {
    const normalizedModels = this.normalizeModels(models);
    if (!normalizedModels.length) {
      throw new Error('At least one model is required for GPU planning.');
    }

    const normalizedObjective = this.normalizeObjective(objective);
    const contextTokens = this.clamp(
      Math.round(this.toFiniteNumber(targetContext, 8192)),
      512,
      131072
    );
    const concurrency = this.clamp(
      Math.round(this.toFiniteNumber(targetConcurrency, 2)),
      1,
      64
    );

    const hardwarePlan = this.resolveHardware(hardware, reserveGB);

    const modelPlans = normalizedModels.map((model) => {
      const strategies = [
        this.simulatePin(model, hardwarePlan, contextTokens, concurrency),
        this.simulateReplica(model, hardwarePlan, contextTokens, concurrency),
        this.simulateSpread(model, hardwarePlan, contextTokens, concurrency)
      ];

      const recommended = this.pickRecommendedStrategy(strategies, normalizedObjective);

      return {
        name: model.name,
        size: model.size,
        file_size_gb: model.fileSizeGB,
        params_b: model.paramsB,
        quantization: this.normalizeQuantization(model.quantization),
        estimated_base_memory_gb: model.baseMemoryGB,
        recommended,
        strategies
      };
    });

    return {
      objective: normalizedObjective,
      inputs: {
        target_context: contextTokens,
        target_concurrency: concurrency
      },
      hardware: {
        backend: hardwarePlan.backend,
        backend_name: hardwarePlan.backendName,
        is_multi_gpu: hardwarePlan.isMultiGPU,
        gpu_count: hardwarePlan.gpus.length,
        reserve_gb: hardwarePlan.reserveGB,
        total_usable_memory_gb: hardwarePlan.totalUsableGB,
        device_env_var: hardwarePlan.deviceEnvVar,
        gpus: hardwarePlan.gpus.map((gpu) => ({
          index: gpu.index,
          name: gpu.name,
          memory_gb: gpu.memoryGB,
          usable_memory_gb: gpu.usableMemoryGB,
          speed_coefficient: gpu.speedCoefficient
        }))
      },
      models: modelPlans,
      notes: this.buildNotes(hardwarePlan, modelPlans)
    };
  }
}
495
+
496
+ module.exports = OllamaGPUPlacementPlanner;