llm-checker 3.7.0 → 3.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -1
- package/bin/enhanced_cli.js +46 -0
- package/bin/mcp-server.mjs +5 -0
- package/package.json +1 -1
- package/src/data/model-database.js +3 -1
- package/src/data/registry-ingestors.js +20 -6
- package/src/data/registry-recommender.js +122 -4
- package/src/data/seed/models.db +0 -0
- package/src/models/deterministic-selector.js +69 -36
package/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
**Intelligent Ollama Model Selector**
|
|
6
6
|
|
|
7
7
|
AI-powered CLI that analyzes your hardware and recommends optimal LLM models.
|
|
8
|
-
Deterministic scoring across **
|
|
8
|
+
Deterministic scoring across a packaged **multi-source registry** (Hugging Face + Ollama + GPT4All, **33k+ exact artifacts**) and the Ollama catalog, with live sync, runtime targeting, and hardware-calibrated memory estimation.
|
|
9
9
|
|
|
10
10
|
[](https://www.npmjs.com/package/llm-checker)
|
|
11
11
|
[](https://www.npmjs.com/package/llm-checker)
|
|
@@ -39,6 +39,7 @@ Choosing the right LLM for your hardware is complex. With thousands of model var
|
|
|
39
39
|
| | Feature | Description |
|
|
40
40
|
|:---:|---|---|
|
|
41
41
|
| **200+** | Packaged Model Catalog | Ships with a synced Ollama SQLite catalog and can refresh from Ollama on demand |
|
|
42
|
+
| **33k+** | Multi-Source Registry | Exact installable/downloadable artifacts from Hugging Face, Ollama, and GPT4All with per-source commands and runtime targeting |
|
|
42
43
|
| **4D** | Scoring Engine | Quality, Speed, Fit, Context — weighted by use case |
|
|
43
44
|
| **Multi-GPU** | Hardware Detection | Apple Silicon, NVIDIA CUDA, AMD ROCm, Intel Arc, CPU, integrated/dedicated inventory visibility |
|
|
44
45
|
| **Calibrated** | Memory Estimation | Bytes-per-parameter formula validated against real Ollama sizes |
|
|
@@ -151,6 +152,14 @@ hash -r
|
|
|
151
152
|
llm-checker --version
|
|
152
153
|
```
|
|
153
154
|
|
|
155
|
+
### v3.7.0 Highlights
|
|
156
|
+
|
|
157
|
+
- New **multi-source model registry**: a packaged snapshot of ~33,700 exact installable/downloadable artifacts from Hugging Face, Ollama, and GPT4All, with per-source commands (`hf download ...`, `ollama pull ...`).
|
|
158
|
+
- `recommend` and `check` now draw candidates from the registry through one canonical deterministic scoring core, with `--runtime auto/ollama/vllm/mlx/llama.cpp/transformers` targeting; they fall back to the Ollama catalog when the registry is unavailable.
|
|
159
|
+
- New `registry-sync`, `registry-search`, and `registry-recommend` commands.
|
|
160
|
+
- Mixture-of-Experts models are sized by their **total** parameter count (all experts stay resident under Ollama/Metal/vLLM), so a large MoE can no longer falsely "fit" small hardware.
|
|
161
|
+
- Carries the 3.6.1 batch: unified scoring across `check`/`recommend`/`smart-recommend` (#88), high-end/multi-GPU VRAM detection (#95), MCP server hardening (#97), and the Windows interactive-panel fixes (#86).
|
|
162
|
+
|
|
154
163
|
### v3.5.13 Highlights
|
|
155
164
|
|
|
156
165
|
- Ships npm packages with a ready-to-use SQLite model catalog:
|
|
@@ -389,6 +398,27 @@ llm-checker search "qwen coder" --json
|
|
|
389
398
|
| `search <query>` | Search the synced catalog with filters and intelligent scoring |
|
|
390
399
|
| `smart-recommend` | Advanced recommendations using the full scoring engine |
|
|
391
400
|
|
|
401
|
+
### Model Registry Commands (v3.7.0+)
|
|
402
|
+
|
|
403
|
+
Exact installable/downloadable artifacts from a packaged multi-source registry (Hugging Face + Ollama + GPT4All).
|
|
404
|
+
|
|
405
|
+
| Command | Description |
|
|
406
|
+
|---------|-------------|
|
|
407
|
+
| `registry-sync` | Sync the multi-source registry (Hugging Face, Ollama, GPT4All) |
|
|
408
|
+
| `registry-search [query]` | Search exact artifacts with `--source`, `--format`, `--runtime`, `--quant`, `--max-size`, `--min-params`/`--max-params` filters |
|
|
409
|
+
| `registry-recommend [query]` | Recommend the best exact artifacts for your hardware, with `--runtime auto/ollama/vllm/mlx/llama.cpp/transformers` targeting and `--category`/`--optimize` |
|
|
410
|
+
|
|
411
|
+
```bash
|
|
412
|
+
# Best coding artifacts across all sources, auto runtime
|
|
413
|
+
llm-checker registry-recommend --category coding
|
|
414
|
+
|
|
415
|
+
# Only Apple-native MLX artifacts
|
|
416
|
+
llm-checker registry-recommend --category coding --runtime mlx
|
|
417
|
+
|
|
418
|
+
# Search Hugging Face for vLLM-ready Qwen artifacts with at most 24B parameters
|
|
419
|
+
llm-checker registry-search qwen --source huggingface --runtime vllm --max-params 24
|
|
420
|
+
```
|
|
421
|
+
|
|
392
422
|
### Enterprise Policy Commands
|
|
393
423
|
|
|
394
424
|
| Command | Description |
|
package/bin/enhanced_cli.js
CHANGED
|
@@ -410,6 +410,30 @@ function parsePositiveNumberOption(value, fallback = null) {
|
|
|
410
410
|
return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
|
|
411
411
|
}
|
|
412
412
|
|
|
413
|
+
// Allowed enum values for the registry commands. Invalid values must be rejected
// with a clear error instead of silently returning "no results" or falling back
// to the built-in catalog.
//
// All entries are lowercase because option values are lowercased before being
// matched against these lists. The lists are frozen: they are shared lookup
// tables, and an accidental push/splice elsewhere in the CLI would silently
// change which option values are accepted.
const REGISTRY_SOURCES = Object.freeze(['ollama', 'huggingface', 'gpt4all']);
const REGISTRY_FORMATS = Object.freeze(['gguf', 'safetensors', 'mlx', 'ollama', 'pytorch', 'pytorch_bin', 'ggml']);
// 'all' and '*' are accepted wildcard aliases; the validation error message
// deliberately leaves them out of the advertised list.
const REGISTRY_RUNTIMES = Object.freeze(['auto', 'all', '*', 'ollama', 'llama.cpp', 'transformers', 'vllm', 'mlx']);
const REGISTRY_OPTIMIZE = Object.freeze(['balanced', 'speed', 'quality', 'context', 'coding']);
|
|
420
|
+
|
|
421
|
+
/**
 * Validate a single enum-style registry CLI option.
 *
 * @param {string} label - Option name without the leading dashes; used in the error text.
 * @param {*} value - Raw option value. `undefined`, `null` and `''` mean "not provided".
 * @param {string[]} allowed - Accepted values, compared against the lowercased input.
 * @returns {void}
 * @throws {Error} When a value was provided and is not in `allowed`.
 */
function assertRegistryEnum(label, value, allowed) {
    const isUnset = value === undefined || value === null || value === '';
    if (isUnset) {
        return;
    }

    // Matching is case-insensitive on the user's side only.
    const normalized = String(value).toLowerCase();
    if (allowed.includes(normalized)) {
        return;
    }

    // The wildcard aliases are accepted above but not advertised in the message.
    const hiddenAliases = ['all', '*'];
    const advertised = allowed.filter((entry) => !hiddenAliases.includes(entry));
    throw new Error(`Invalid --${label} "${value}". Allowed: ${advertised.join(', ')}`);
}
|
|
428
|
+
|
|
429
|
+
/**
 * Validate the enum-style filters shared by the registry commands.
 *
 * Options are checked in a fixed order (source, format, runtime, optimize) and
 * the first invalid one aborts validation.
 *
 * @param {object} [options={}] - Parsed CLI options.
 * @returns {void} Nothing on success.
 * @throws {Error} Raised by assertRegistryEnum for the first invalid option.
 */
function validateRegistryFilters(options = {}) {
    const checks = [
        ['source', REGISTRY_SOURCES],
        ['format', REGISTRY_FORMATS],
        ['runtime', REGISTRY_RUNTIMES],
        ['optimize', REGISTRY_OPTIMIZE]
    ];

    for (const [label, allowed] of checks) {
        assertRegistryEnum(label, options[label], allowed);
    }
}
|
|
436
|
+
|
|
413
437
|
function truncateMiddle(value, maxLength = 48) {
|
|
414
438
|
const text = String(value || '');
|
|
415
439
|
if (text.length <= maxLength) return text;
|
|
@@ -4886,6 +4910,17 @@ program
|
|
|
4886
4910
|
.option('-l, --limit <n>', 'Maximum number of results', '20')
|
|
4887
4911
|
.option('-j, --json', 'Output as JSON')
|
|
4888
4912
|
.action(async (query = '', options) => {
|
|
4913
|
+
try {
|
|
4914
|
+
validateRegistryFilters(options);
|
|
4915
|
+
} catch (validationError) {
|
|
4916
|
+
if (options.json) {
|
|
4917
|
+
console.log(JSON.stringify({ error: validationError.message }, null, 2));
|
|
4918
|
+
} else {
|
|
4919
|
+
console.error(chalk.red(`✗ ${validationError.message}`));
|
|
4920
|
+
}
|
|
4921
|
+
process.exitCode = 1;
|
|
4922
|
+
return;
|
|
4923
|
+
}
|
|
4889
4924
|
if (!options.json) showAsciiArt('registry-search');
|
|
4890
4925
|
|
|
4891
4926
|
const ModelDatabase = require('../src/data/model-database');
|
|
@@ -4993,6 +5028,17 @@ program
|
|
|
4993
5028
|
.option('-l, --limit <n>', 'Maximum number of recommendations', '10')
|
|
4994
5029
|
.option('-j, --json', 'Output as JSON')
|
|
4995
5030
|
.action(async (query = '', options) => {
|
|
5031
|
+
try {
|
|
5032
|
+
validateRegistryFilters(options);
|
|
5033
|
+
} catch (validationError) {
|
|
5034
|
+
if (options.json) {
|
|
5035
|
+
console.log(JSON.stringify({ error: validationError.message }, null, 2));
|
|
5036
|
+
} else {
|
|
5037
|
+
console.error(chalk.red(`✗ ${validationError.message}`));
|
|
5038
|
+
}
|
|
5039
|
+
process.exitCode = 1;
|
|
5040
|
+
return;
|
|
5041
|
+
}
|
|
4996
5042
|
if (!options.json) showAsciiArt('registry-recommend');
|
|
4997
5043
|
|
|
4998
5044
|
const UnifiedDetector = require('../src/hardware/unified-detector');
|
package/bin/mcp-server.mjs
CHANGED
|
@@ -290,9 +290,14 @@ const ALLOWED_CLI_COMMANDS = new Set([
|
|
|
290
290
|
"sync",
|
|
291
291
|
"search",
|
|
292
292
|
"smart-recommend",
|
|
293
|
+
"registry-sync",
|
|
294
|
+
"registry-search",
|
|
295
|
+
"registry-recommend",
|
|
293
296
|
"hw-detect",
|
|
294
297
|
]);
|
|
295
298
|
|
|
299
|
+
export { ALLOWED_CLI_COMMANDS };
|
|
300
|
+
|
|
296
301
|
// ============================================================================
|
|
297
302
|
// MCP SERVER
|
|
298
303
|
// ============================================================================
|
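package/package.json
CHANGED
(package.json hunk not captured in this extract; +1 -1, presumably the version bump 3.7.0 → 3.7.4)
package/src/data/model-database.js
CHANGED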
|
@@ -227,9 +227,11 @@ class ModelDatabase {
|
|
|
227
227
|
CREATE INDEX IF NOT EXISTS idx_model_artifacts_source ON model_artifacts(source_id);
|
|
228
228
|
CREATE INDEX IF NOT EXISTS idx_model_artifacts_format ON model_artifacts(format);
|
|
229
229
|
CREATE INDEX IF NOT EXISTS idx_model_artifacts_quant ON model_artifacts(quantization);
|
|
230
|
-
CREATE INDEX IF NOT EXISTS idx_model_artifacts_runtime ON model_artifacts(runtime_support);
|
|
231
230
|
CREATE INDEX IF NOT EXISTS idx_model_artifacts_size ON model_artifacts(size_gb);
|
|
232
231
|
CREATE INDEX IF NOT EXISTS idx_model_artifacts_downloads ON model_artifacts(downloads DESC);
|
|
232
|
+
-- Drop a dead index from older DBs: runtime_support is a JSON blob only
|
|
233
|
+
-- queried with LIKE, so a B-tree index on it is never used.
|
|
234
|
+
DROP INDEX IF EXISTS idx_model_artifacts_runtime;
|
|
233
235
|
`;
|
|
234
236
|
|
|
235
237
|
if (this.useBetterSqlite) {
|
|
@@ -146,8 +146,10 @@ function parseActiveParamsB(...values) {
|
|
|
146
146
|
|
|
147
147
|
function inferQuantization(...values) {
|
|
148
148
|
const text = values.map((value) => String(value || '')).join(' ');
|
|
149
|
-
|
|
150
|
-
|
|
149
|
+
// Note: F16/FP16/BF16 are PRECISIONS, not quantizations — they're handled by
|
|
150
|
+
// inferPrecision so a full-precision model isn't mislabeled as "quantized".
|
|
151
|
+
const ggufQuant = text.match(/\b(IQ\d(?:_[A-Z0-9]+)?|Q\d(?:_[A-Z0-9]+){0,2}|Q8_0)\b/i);
|
|
152
|
+
if (ggufQuant) return ggufQuant[1].toUpperCase();
|
|
151
153
|
|
|
152
154
|
const bitQuant = text.match(/\b([234568])\s*[-_ ]?bit\b/i);
|
|
153
155
|
if (bitQuant) return `${bitQuant[1]}bit`;
|
|
@@ -271,9 +273,16 @@ function getSiblingSizeBytes(sibling = {}) {
|
|
|
271
273
|
function isModelArtifactFile(filename) {
|
|
272
274
|
const lower = String(filename || '').toLowerCase();
|
|
273
275
|
if (!lower) return false;
|
|
276
|
+
// Exclude non-model weight files that would otherwise be ingested as standalone
|
|
277
|
+
// "models": LoRA/PEFT adapters (a few MB but inherit the repo's param count) and
|
|
278
|
+
// optimizer/training state.
|
|
279
|
+
if (/(^|[/_-])adapter[_-]?(model|config)/.test(lower)) return false;
|
|
280
|
+
if (/(^|[/_-])(lora|optimizer|scheduler|rng_state|trainer_state|training_args)/.test(lower)) return false;
|
|
274
281
|
if (lower.endsWith('.gguf')) return true;
|
|
275
282
|
if (lower.endsWith('.safetensors')) return true;
|
|
276
|
-
if (/pytorch_model.*\.bin$/.test(lower)) return true;
|
|
283
|
+
if (/pytorch_model.*\.(bin)$/.test(lower)) return true;
|
|
284
|
+
// Mistral-style consolidated weights (consolidated.00.pth) were being dropped.
|
|
285
|
+
if (/(^|[/])consolidated.*\.(pt|pth|bin)$/.test(lower)) return true;
|
|
277
286
|
if (/model.*\.(bin|pt|pth)$/.test(lower)) return true;
|
|
278
287
|
if (/ggml.*\.bin$/.test(lower)) return true;
|
|
279
288
|
return false;
|
|
@@ -398,11 +407,15 @@ function normalizeGpt4AllEntry(entry) {
|
|
|
398
407
|
|
|
399
408
|
const repoMatch = url.match(/huggingface\.co\/([^/]+\/[^/]+)\/resolve\/([^/]+)\/(.+)$/);
|
|
400
409
|
const repoId = repoMatch ? repoMatch[1] : `gpt4all/${name}`;
|
|
410
|
+
// When the download points at a Hugging Face repo, use that repo id as the
|
|
411
|
+
// canonical model id so the same model lines up across sources for dedup.
|
|
412
|
+
const canonicalModelId = repoMatch ? repoMatch[1] : name;
|
|
401
413
|
const filename = repoMatch ? decodeURIComponent(repoMatch[3]) : (filenameCandidate || url.split('/').filter(Boolean).pop());
|
|
402
414
|
const repoKey = makeScopedId('gpt4all', repoId);
|
|
403
415
|
const tags = ['gpt4all', entry.type, entry.quant].filter(Boolean);
|
|
404
416
|
const paramsB = parseParamsB(entry.parameters, name, filename);
|
|
405
|
-
|
|
417
|
+
// Sizes can arrive as comma-formatted strings ("8,000,000,000"); strip non-digits.
|
|
418
|
+
const sizeBytes = Number(String(entry.filesize ?? entry.fileSize ?? entry.size ?? 0).replace(/[^0-9.]/g, '')) || null;
|
|
406
419
|
const format = inferFormat(filename, tags);
|
|
407
420
|
|
|
408
421
|
return {
|
|
@@ -412,7 +425,7 @@ function normalizeGpt4AllEntry(entry) {
|
|
|
412
425
|
source_id: 'gpt4all',
|
|
413
426
|
repo_id: repoId,
|
|
414
427
|
namespace: repoId.includes('/') ? repoId.split('/')[0] : 'gpt4all',
|
|
415
|
-
canonical_model_id:
|
|
428
|
+
canonical_model_id: canonicalModelId,
|
|
416
429
|
display_name: name,
|
|
417
430
|
url: repoMatch ? `https://huggingface.co/${repoId}` : url,
|
|
418
431
|
license: entry.license || 'unknown',
|
|
@@ -434,7 +447,7 @@ function normalizeGpt4AllEntry(entry) {
|
|
|
434
447
|
source_id: 'gpt4all',
|
|
435
448
|
repo_key: repoKey,
|
|
436
449
|
repo_id: repoId,
|
|
437
|
-
canonical_model_id:
|
|
450
|
+
canonical_model_id: canonicalModelId,
|
|
438
451
|
artifact_name: filename || name,
|
|
439
452
|
filename: filename || '',
|
|
440
453
|
format,
|
|
@@ -746,6 +759,7 @@ module.exports = {
|
|
|
746
759
|
inferFormat,
|
|
747
760
|
inferQuantization,
|
|
748
761
|
inferRuntimeSupport,
|
|
762
|
+
isModelArtifactFile,
|
|
749
763
|
parseParamsB,
|
|
750
764
|
buildHuggingFaceDownloadUrl
|
|
751
765
|
};
|
|
@@ -173,7 +173,12 @@ function artifactToSelectorModel(row) {
|
|
|
173
173
|
.filter(Boolean)
|
|
174
174
|
.map((tag) => String(tag).toLowerCase());
|
|
175
175
|
|
|
176
|
-
|
|
176
|
+
// A sharded weight file's size is only ONE shard, not the whole model. Don't
|
|
177
|
+
// let it stand in for the model's memory (that made a 56B model look like
|
|
178
|
+
// ~4.6GB and "fit" tiny hardware); leave size unset so memory estimates from
|
|
179
|
+
// the (total) parameter count instead.
|
|
180
|
+
const rawSizeGB = Number(row.size_gb);
|
|
181
|
+
const sizeGB = (!shardedFile && Number.isFinite(rawSizeGB) && rawSizeGB > 0) ? rawSizeGB : NaN;
|
|
177
182
|
const sizeByQuant = Number.isFinite(sizeGB) && sizeGB > 0
|
|
178
183
|
? { [quant]: sizeGB }
|
|
179
184
|
: {};
|
|
@@ -240,6 +245,84 @@ function dedupeRecommendationPool(models) {
|
|
|
240
245
|
return [...deduped.values()];
|
|
241
246
|
}
|
|
242
247
|
|
|
248
|
+
// Tuning knobs for source diversity in the final recommendation list.

// Largest gap below the top score at which a not-yet-shown source can still
// earn a guaranteed slot.
const SOURCE_DIVERSITY_MARGIN = 15;

// Absolute minimum score: a model scoring lower is never surfaced purely for
// the sake of source diversity.
const SOURCE_DIVERSITY_FLOOR = 55;
|
|
252
|
+
|
|
253
|
+
/**
 * Build the grouping key used to treat variants of the SAME model as one.
 *
 * The key ignores quantization / shard / tag, so e.g. all
 * `qwen2.5-coder:7b-*` quants, or every `layers-N.safetensors` shard of one
 * HF repo, collapse together.
 *
 * @param {{meta?: object}} candidate - Scored candidate; only `meta.name`,
 *   `meta.model_identifier`, `meta.paramsB` and `meta.source` are read.
 * @returns {string} `name|params` when the parameter count is known, otherwise
 *   `name|na|source|identifier`.
 */
function modelDiversityKey(candidate) {
    const meta = (candidate && candidate.meta) || {};

    const rawName = String(meta.name || meta.model_identifier || '');
    const baseName = rawName
        .toLowerCase()
        .replace(/:.*$/, '') // drop an ollama :tag
        .replace(/\s+/g, ' ')
        .trim();

    const paramsB = Number(meta.paramsB);
    const hasKnownParams = Number.isFinite(paramsB) && paramsB > 0;

    if (!hasKnownParams) {
        // Unknown size: keep same-named models apart by source + identifier
        // instead of bucketing them together, which would silently drop
        // distinct models / sources.
        const source = String(meta.source || '').toLowerCase();
        const identifier = String(meta.model_identifier || meta.name || '').toLowerCase();
        return `${baseName}|na|${source}|${identifier}`;
    }

    // Round to one decimal so near-identical counts share a bucket.
    const roundedParams = Math.round(paramsB * 10) / 10;
    return `${baseName}|${roundedParams}`;
}
|
|
274
|
+
|
|
275
|
+
/**
 * Reduce quant/shard/tag variants of the same model to the single
 * best-scoring entry, so the top picks are DISTINCT models instead of a dozen
 * quants of one.
 *
 * @param {Array<object>} candidates - Scored candidates; a non-array is treated
 *   as empty and falsy entries are skipped.
 * @returns {Array<object>} One candidate per diversity key, highest score first.
 *   A missing or non-numeric `score` counts as 0.
 */
function collapseToDistinctModels(candidates) {
    const scoreOf = (entry) => Number(entry.score) || 0;
    const pool = Array.isArray(candidates) ? candidates : [];
    const winners = new Map();

    pool.forEach((entry) => {
        if (!entry) return;
        const groupKey = modelDiversityKey(entry);
        const incumbent = winners.get(groupKey);
        // Strictly greater: on a tie the first-seen variant keeps the slot.
        if (incumbent === undefined || scoreOf(entry) > scoreOf(incumbent)) {
            winners.set(groupKey, entry);
        }
    });

    return Array.from(winners.values()).sort((left, right) => scoreOf(right) - scoreOf(left));
}
|
|
287
|
+
|
|
288
|
+
/**
 * Pick the final `limit` recommendations while making sure each source with a
 * competitive candidate is visible, so Hugging Face / GPT4All artifacts show
 * up when they score close to Ollama.
 *
 * Roughly the first 60% of the slots always go to the best entries in list
 * order. Each remaining slot prefers the first unused candidate from a source
 * not yet shown, provided it clears the floor and margin gates; otherwise the
 * next unused candidate is taken. Diversity therefore never promotes a
 * clearly worse model.
 *
 * @param {Array<object>} distinctSorted - Candidates, expected to be sorted by
 *   score descending (the first entry is used as the top score).
 * @param {number} limit - Maximum results; a non-positive or non-numeric value
 *   falls back to 10.
 * @returns {Array<object>} At most `limit` candidates, highest score first.
 */
function applySourceDiversity(distinctSorted, limit) {
    if (!Array.isArray(distinctSorted) || distinctSorted.length === 0) return [];

    const ranked = distinctSorted;
    const requested = Number(limit);
    const max = requested > 0 ? requested : 10;
    if (ranked.length <= max) return ranked.slice(0, max);

    const scoreOf = (entry) => Number(entry.score) || 0;
    const sourceOf = (entry) => (entry.meta && entry.meta.source) || 'unknown';
    const byScoreDesc = (left, right) => scoreOf(right) - scoreOf(left);

    const topScore = scoreOf(ranked[0]);

    // Reserve most slots for the genuine best-by-score so diversity can never
    // displace several real top picks; only the tail (~40% of slots) is used
    // to surface competitive alternate sources.
    const guaranteed = Math.max(1, Math.ceil(max * 0.6));
    const picked = ranked.slice(0, guaranteed);
    const taken = new Set(picked);
    const shownSources = new Set(picked.map((entry) => sourceOf(entry)));

    const isUntaken = (entry) => !taken.has(entry);
    const isCompetitiveNewSource = (entry) => {
        if (taken.has(entry)) return false;
        if (shownSources.has(sourceOf(entry))) return false;
        const score = scoreOf(entry);
        return score >= SOURCE_DIVERSITY_FLOOR && score >= topScore - SOURCE_DIVERSITY_MARGIN;
    };

    while (picked.length < max) {
        // Best competitive candidate from an unseen source, else next best overall.
        const next = ranked.find(isCompetitiveNewSource) || ranked.find(isUntaken);
        if (!next) break;
        picked.push(next);
        taken.add(next);
        shownSources.add(sourceOf(next));
    }

    // Diversity picks may have been appended out of score order; restore it.
    return picked.sort(byScoreDesc).slice(0, max);
}
|
|
325
|
+
|
|
243
326
|
function candidateToRecommendation(candidate) {
|
|
244
327
|
const artifact = candidate.meta.artifact || {};
|
|
245
328
|
return {
|
|
@@ -360,9 +443,32 @@ class RegistryRecommender {
|
|
|
360
443
|
|
|
361
444
|
const selectorHardware = normalizeHardwareForSelector(options.hardware || {});
|
|
362
445
|
const normalizedRuntime = runtimeFilter || 'auto';
|
|
446
|
+
|
|
447
|
+
// No registry artifacts matched the filters: return an empty result rather
|
|
448
|
+
// than letting the deterministic selector silently substitute its built-in
|
|
449
|
+
// catalog (which would mislabel non-registry models as "registry" rows).
|
|
450
|
+
if (modelPool.length === 0) {
|
|
451
|
+
return {
|
|
452
|
+
category,
|
|
453
|
+
runtime: normalizedRuntime,
|
|
454
|
+
rows,
|
|
455
|
+
modelPool,
|
|
456
|
+
result: {
|
|
457
|
+
category,
|
|
458
|
+
optimizeFor: this.selector.normalizeOptimizationObjective(options.optimizeFor || 'balanced'),
|
|
459
|
+
runtime: normalizedRuntime,
|
|
460
|
+
candidates: [],
|
|
461
|
+
total_evaluated: 0,
|
|
462
|
+
timestamp: new Date().toISOString()
|
|
463
|
+
}
|
|
464
|
+
};
|
|
465
|
+
}
|
|
466
|
+
// Rank a wider window than requested so we can collapse model variants and
|
|
467
|
+
// apply source diversity before trimming to the caller's limit.
|
|
468
|
+
const rankWindow = Math.max(limit * 8, 200);
|
|
363
469
|
const result = runtimeFilter
|
|
364
470
|
? await this.selector.selectModels(category, {
|
|
365
|
-
topN:
|
|
471
|
+
topN: rankWindow,
|
|
366
472
|
enableProbe: false,
|
|
367
473
|
silent: true,
|
|
368
474
|
optimizeFor: options.optimizeFor || 'balanced',
|
|
@@ -374,13 +480,20 @@ class RegistryRecommender {
|
|
|
374
480
|
})
|
|
375
481
|
: this.scoreAutoRuntimePool({
|
|
376
482
|
category,
|
|
377
|
-
limit,
|
|
483
|
+
limit: rankWindow,
|
|
378
484
|
targetCtx,
|
|
379
485
|
optimizeFor: options.optimizeFor || 'balanced',
|
|
380
486
|
hardware: selectorHardware,
|
|
381
487
|
modelPool
|
|
382
488
|
});
|
|
383
489
|
|
|
490
|
+
// Collapse quant/shard variants to distinct models, then guarantee source
|
|
491
|
+
// diversity, and finally trim to the requested limit.
|
|
492
|
+
if (result && Array.isArray(result.candidates)) {
|
|
493
|
+
const distinct = collapseToDistinctModels(result.candidates);
|
|
494
|
+
result.candidates = applySourceDiversity(distinct, limit);
|
|
495
|
+
}
|
|
496
|
+
|
|
384
497
|
return {
|
|
385
498
|
category,
|
|
386
499
|
runtime: normalizedRuntime,
|
|
@@ -493,7 +606,9 @@ class RegistryRecommender {
|
|
|
493
606
|
optimizeFor: objective,
|
|
494
607
|
runtime: 'auto',
|
|
495
608
|
hardware: normalizedHardware,
|
|
496
|
-
|
|
609
|
+
// Return a wide sorted window; selectCategory collapses variants and
|
|
610
|
+
// applies source diversity before trimming to the caller's limit.
|
|
611
|
+
candidates: candidates.slice(0, Math.max(limit, 2000)),
|
|
497
612
|
total_evaluated: filtered.length,
|
|
498
613
|
timestamp: new Date().toISOString()
|
|
499
614
|
};
|
|
@@ -506,6 +621,9 @@ class RegistryRecommender {
|
|
|
506
621
|
|
|
507
622
|
module.exports = {
|
|
508
623
|
RegistryRecommender,
|
|
624
|
+
collapseToDistinctModels,
|
|
625
|
+
applySourceDiversity,
|
|
626
|
+
modelDiversityKey,
|
|
509
627
|
artifactToSelectorModel,
|
|
510
628
|
candidateToRecommendation,
|
|
511
629
|
normalizeHardwareForSelector,
|
package/src/data/seed/models.db
CHANGED
|
Binary file
|
|
@@ -243,13 +243,12 @@ class DeterministicModelSelector {
|
|
|
243
243
|
directVRAM ??
|
|
244
244
|
0;
|
|
245
245
|
|
|
246
|
-
// Multi-GPU
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
}
|
|
246
|
+
// Multi-GPU: only scale up when memory is known to be PER-GPU (vramPerGPU).
|
|
247
|
+
// A bare `vram`/`vramGB` is treated as the box total and never multiplied,
|
|
248
|
+
// so we don't double an already-total figure and falsely "fit" a model
|
|
249
|
+
// (e.g. a 2x24=48GB box must stay 48GB, not become 96GB).
|
|
250
|
+
if (!explicitTotalVRAM && gpuCount > 1 && vramPerGPU) {
|
|
251
|
+
vramGB = vramPerGPU * gpuCount;
|
|
253
252
|
}
|
|
254
253
|
|
|
255
254
|
let gpuType = gpu.type;
|
|
@@ -1152,6 +1151,17 @@ class DeterministicModelSelector {
|
|
|
1152
1151
|
return explicitParams;
|
|
1153
1152
|
}
|
|
1154
1153
|
|
|
1154
|
+
// Use the variant's OWN artifact size to DISAMBIGUATE the model-level size
|
|
1155
|
+
// list. A size-unknown variant (e.g. `:latest`) must not blindly inherit
|
|
1156
|
+
// model_sizes[0]: for qwen3 (model_sizes ["30b","235b"]) that mislabeled a
|
|
1157
|
+
// small qwen3:latest as 30B and poisoned the real qwen3:30b size map, making
|
|
1158
|
+
// a 19GB model falsely "fit" a 16GB machine.
|
|
1159
|
+
const artifactSizeGB = this.extractVariantSizeGB(variant, null);
|
|
1160
|
+
const artifactParamsB =
|
|
1161
|
+
(!this.isCloudVariantTag(variant.tag) && Number.isFinite(artifactSizeGB) && artifactSizeGB > 0)
|
|
1162
|
+
? this.inferParamsFromArtifactSizeGB(artifactSizeGB, quant)
|
|
1163
|
+
: null;
|
|
1164
|
+
|
|
1155
1165
|
const metadataCandidates = this.extractParameterCandidates(
|
|
1156
1166
|
ollamaModel.model_sizes,
|
|
1157
1167
|
ollamaModel.parameters,
|
|
@@ -1159,12 +1169,23 @@ class DeterministicModelSelector {
|
|
|
1159
1169
|
ollamaModel.parameter_count
|
|
1160
1170
|
);
|
|
1161
1171
|
if (metadataCandidates.length > 0) {
|
|
1172
|
+
if (Number.isFinite(artifactParamsB) && artifactParamsB > 0) {
|
|
1173
|
+
// Pick the listed size CLOSEST to what this variant's own artifact
|
|
1174
|
+
// implies; if even the closest is far off, trust the artifact size.
|
|
1175
|
+
let closest = metadataCandidates[0];
|
|
1176
|
+
let bestDiff = Math.abs(closest - artifactParamsB);
|
|
1177
|
+
for (const cand of metadataCandidates) {
|
|
1178
|
+
const diff = Math.abs(cand - artifactParamsB);
|
|
1179
|
+
if (diff < bestDiff) { bestDiff = diff; closest = cand; }
|
|
1180
|
+
}
|
|
1181
|
+
const tolerance = Math.max(2, closest * 0.5);
|
|
1182
|
+
return bestDiff <= tolerance ? closest : artifactParamsB;
|
|
1183
|
+
}
|
|
1162
1184
|
return metadataCandidates[0];
|
|
1163
1185
|
}
|
|
1164
1186
|
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
return this.inferParamsFromArtifactSizeGB(artifactSizeGB, quant);
|
|
1187
|
+
if (Number.isFinite(artifactParamsB) && artifactParamsB > 0) {
|
|
1188
|
+
return artifactParamsB;
|
|
1168
1189
|
}
|
|
1169
1190
|
|
|
1170
1191
|
const modelArtifactSizeGB = this.extractArtifactSizeGBFromValue(ollamaModel.main_size);
|
|
@@ -1512,28 +1533,35 @@ class DeterministicModelSelector {
|
|
|
1512
1533
|
return false;
|
|
1513
1534
|
}
|
|
1514
1535
|
|
|
1536
|
+
// Guard against malformed external pool rows (a missing tags/modalities
|
|
1537
|
+
// /name field used to throw and silently nuke the whole category).
|
|
1538
|
+
const tags = Array.isArray(model.tags) ? model.tags : [];
|
|
1539
|
+
const modalities = Array.isArray(model.modalities) ? model.modalities : [];
|
|
1540
|
+
const name = String(model.name || model.model_identifier || '').toLowerCase();
|
|
1541
|
+
const paramsB = Number(model.paramsB) || 0;
|
|
1542
|
+
|
|
1515
1543
|
switch (category) {
|
|
1516
1544
|
case 'coding':
|
|
1517
|
-
return
|
|
1518
|
-
|
|
1519
|
-
|
|
1545
|
+
return tags.some(tag => ['coder', 'code', 'instruct'].includes(tag)) ||
|
|
1546
|
+
name.includes('code');
|
|
1547
|
+
|
|
1520
1548
|
case 'multimodal':
|
|
1521
|
-
return
|
|
1522
|
-
|
|
1523
|
-
|
|
1549
|
+
return modalities.includes('vision') ||
|
|
1550
|
+
tags.includes('vision');
|
|
1551
|
+
|
|
1524
1552
|
case 'embeddings':
|
|
1525
|
-
return
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1553
|
+
return tags.includes('embedding') ||
|
|
1554
|
+
tags.includes('embeddings') ||
|
|
1555
|
+
name.includes('embed') ||
|
|
1556
|
+
name.includes('bge-') ||
|
|
1557
|
+
name.includes('nomic-embed') ||
|
|
1558
|
+
name.includes('all-minilm') ||
|
|
1531
1559
|
model.specialization === 'embeddings';
|
|
1532
|
-
|
|
1560
|
+
|
|
1533
1561
|
case 'reasoning':
|
|
1534
|
-
return
|
|
1535
|
-
|
|
1536
|
-
|
|
1562
|
+
return tags.includes('instruct') ||
|
|
1563
|
+
paramsB >= 7; // Prefer larger models for reasoning
|
|
1564
|
+
|
|
1537
1565
|
default: // general, reading, summarization
|
|
1538
1566
|
return true; // Most models can handle these
|
|
1539
1567
|
}
|
|
@@ -1711,15 +1739,19 @@ class DeterministicModelSelector {
|
|
|
1711
1739
|
: (Number.isFinite(directVariantMatch) && directVariantMatch > 0 ? directVariantMatch : null);
|
|
1712
1740
|
|
|
1713
1741
|
const parameterProfile = this.resolveMemoryParameterProfile(model);
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
const
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1742
|
+
// Weight memory must account for ALL resident parameters. For MoE under
|
|
1743
|
+
// Ollama / Metal / vLLM every expert is resident, so size the weights by
|
|
1744
|
+
// the TOTAL parameter count (not the active count). Active params drive
|
|
1745
|
+
// speed and KV-cache only. Sizing weights by active params used to make a
|
|
1746
|
+
// 236B MoE look like ~14GB and falsely "fit" small hardware.
|
|
1747
|
+
const weightParamsB =
|
|
1748
|
+
parameterProfile.isMoE && Number.isFinite(parameterProfile.totalParamsB) && parameterProfile.totalParamsB > 0
|
|
1749
|
+
? parameterProfile.totalParamsB
|
|
1750
|
+
: parameterProfile.effectiveParamsB;
|
|
1751
|
+
const modeledWeightGB = weightParamsB * bpp;
|
|
1752
|
+
// A real observed artifact size always wins for weight memory — never let
|
|
1753
|
+
// an MoE "sparse inference" assumption discard a measured on-disk size.
|
|
1754
|
+
const useObservedArtifactSize = Number.isFinite(observedWeightGB) && observedWeightGB > 0;
|
|
1723
1755
|
const modelMemGB = useObservedArtifactSize ? observedWeightGB : modeledWeightGB;
|
|
1724
1756
|
const effectiveCtx = Number.isFinite(Number(ctx)) && Number(ctx) > 0 ? Number(ctx) : 4096;
|
|
1725
1757
|
|
|
@@ -1729,9 +1761,10 @@ class DeterministicModelSelector {
|
|
|
1729
1761
|
|
|
1730
1762
|
// Runtime overhead (Metal/CUDA context, buffers)
|
|
1731
1763
|
const runtimeOverhead = useObservedArtifactSize ? 0.35 : 0.5;
|
|
1764
|
+
const usedMoeTotal = parameterProfile.isMoE && weightParamsB === parameterProfile.totalParamsB;
|
|
1732
1765
|
const memorySource = useObservedArtifactSize
|
|
1733
1766
|
? 'observed_artifact_size'
|
|
1734
|
-
: (
|
|
1767
|
+
: (usedMoeTotal ? 'moe_total_params' : 'estimated_from_params');
|
|
1735
1768
|
|
|
1736
1769
|
return {
|
|
1737
1770
|
parameterProfile,
|