llm-checker 3.5.15 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -8
- package/analyzer/compatibility.js +5 -0
- package/analyzer/performance.js +5 -4
- package/bin/cli.js +5 -39
- package/bin/enhanced_cli.js +449 -24
- package/bin/mcp-server.mjs +266 -101
- package/package.json +13 -8
- package/src/ai/multi-objective-selector.js +118 -11
- package/src/calibration/calibration-manager.js +4 -1
- package/src/data/model-database.js +489 -5
- package/src/data/registry-ingestors.js +751 -0
- package/src/data/registry-recommender.js +514 -0
- package/src/data/seed/README.md +11 -3
- package/src/data/seed/models.db +0 -0
- package/src/data/sync-manager.js +32 -18
- package/src/hardware/backends/apple-silicon.js +5 -1
- package/src/hardware/backends/cuda-detector.js +47 -19
- package/src/hardware/backends/intel-detector.js +6 -2
- package/src/hardware/backends/rocm-detector.js +6 -2
- package/src/hardware/detector.js +57 -30
- package/src/hardware/unified-detector.js +129 -25
- package/src/index.js +68 -4
- package/src/models/ai-check-selector.js +36 -5
- package/src/models/deterministic-selector.js +179 -18
- package/src/models/expanded_database.js +9 -5
- package/src/models/intelligent-selector.js +87 -1
- package/src/models/moe-assumptions.js +11 -0
- package/src/models/requirements.js +16 -11
- package/src/models/scoring-core.js +341 -0
- package/src/models/scoring-engine.js +9 -2
- package/src/ollama/capacity-planner.js +15 -2
- package/src/ollama/client.js +70 -30
- package/src/ollama/enhanced-client.js +20 -2
- package/src/ollama/manager.js +14 -2
- package/src/policy/cli-policy.js +8 -2
- package/src/policy/policy-engine.js +2 -1
- package/src/provenance/model-provenance.js +4 -1
- package/src/ui/cli-theme.js +47 -7
- package/src/ui/interactive-panel.js +162 -24
|
@@ -62,6 +62,25 @@ Respond with JSON only, no additional text.`;
|
|
|
62
62
|
/**
|
|
63
63
|
* Main AI-Check function
|
|
64
64
|
*/
|
|
65
|
+
/** Normalize the --models option (array, or comma/space-separated string) to a list. */
|
|
66
|
+
parseModelFilter(models) {
|
|
67
|
+
if (!models) return [];
|
|
68
|
+
const list = Array.isArray(models) ? models : String(models).split(/[,\s]+/);
|
|
69
|
+
return list.map((m) => String(m).trim().toLowerCase()).filter(Boolean);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/** True when an Ollama DB model matches a user-supplied name fragment. */
|
|
73
|
+
modelMatchesFilter(model, needle) {
|
|
74
|
+
const identifier = String(model?.model_identifier || '').toLowerCase();
|
|
75
|
+
const name = String(model?.model_name || '').toLowerCase();
|
|
76
|
+
return (
|
|
77
|
+
identifier === needle ||
|
|
78
|
+
name === needle ||
|
|
79
|
+
identifier.includes(needle) ||
|
|
80
|
+
name.includes(needle)
|
|
81
|
+
);
|
|
82
|
+
}
|
|
83
|
+
|
|
65
84
|
async aiCheck(options = {}) {
|
|
66
85
|
const {
|
|
67
86
|
category = 'general',
|
|
@@ -90,11 +109,23 @@ Respond with JSON only, no additional text.`;
|
|
|
90
109
|
const budget = hardware.gpu.unified ? hardware.usableMemGB :
|
|
91
110
|
(hardware.gpu.vramGB || hardware.usableMemGB);
|
|
92
111
|
|
|
93
|
-
//
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
112
|
+
// Optional explicit model filter (--models qwen2.5,llama3.1). When present
|
|
113
|
+
// it overrides the category filter: the user asked for specific models.
|
|
114
|
+
const modelFilter = this.parseModelFilter(options.models);
|
|
115
|
+
let categoryModels;
|
|
116
|
+
if (modelFilter.length > 0) {
|
|
117
|
+
categoryModels = allOllamaModels.filter((model) =>
|
|
118
|
+
modelFilter.some((needle) => this.modelMatchesFilter(model, needle))
|
|
119
|
+
);
|
|
120
|
+
if (!silent) {
|
|
121
|
+
console.log(chalk.cyan('│') + ` Restricted to ${categoryModels.length} model(s) matching --models`);
|
|
122
|
+
}
|
|
123
|
+
} else {
|
|
124
|
+
// Filter models by category first
|
|
125
|
+
categoryModels = this.filterOllamaModelsByCategory(allOllamaModels, category);
|
|
126
|
+
if (!silent) {
|
|
127
|
+
console.log(chalk.cyan('│') + ` ${categoryModels.length} models match ${category} category`);
|
|
128
|
+
}
|
|
98
129
|
}
|
|
99
130
|
|
|
100
131
|
// Evaluate each model using deterministic scoring
|
|
@@ -1556,10 +1556,21 @@ class DeterministicModelSelector {
|
|
|
1556
1556
|
const S = speedEstimate.score;
|
|
1557
1557
|
const F = this.calculateFitScore(requiredGB, budget);
|
|
1558
1558
|
const C = this.calculateContextScore(model, targetCtx);
|
|
1559
|
+
const capacityAdjustment = this.calculateHighCapacitySizeAdjustment(
|
|
1560
|
+
hardware,
|
|
1561
|
+
model,
|
|
1562
|
+
budget,
|
|
1563
|
+
category,
|
|
1564
|
+
optimizeFor
|
|
1565
|
+
);
|
|
1559
1566
|
|
|
1560
1567
|
// 4. Calculate final weighted score
|
|
1561
1568
|
const weights = this.getScoringWeights(category, optimizeFor);
|
|
1562
|
-
const
|
|
1569
|
+
const weightedScore = Q * weights[0] + S * weights[1] + F * weights[2] + C * weights[3];
|
|
1570
|
+
const score = Math.max(
|
|
1571
|
+
0,
|
|
1572
|
+
Math.min(100, Math.round((weightedScore + capacityAdjustment.score) * 10) / 10)
|
|
1573
|
+
);
|
|
1563
1574
|
|
|
1564
1575
|
// 5. Build rationale
|
|
1565
1576
|
const rationale = this.buildRationale(
|
|
@@ -1572,7 +1583,8 @@ class DeterministicModelSelector {
|
|
|
1572
1583
|
Q,
|
|
1573
1584
|
S,
|
|
1574
1585
|
memoryEstimate,
|
|
1575
|
-
speedEstimate
|
|
1586
|
+
speedEstimate,
|
|
1587
|
+
capacityAdjustment
|
|
1576
1588
|
);
|
|
1577
1589
|
|
|
1578
1590
|
return {
|
|
@@ -1599,7 +1611,8 @@ class DeterministicModelSelector {
|
|
|
1599
1611
|
runtime: speedEstimate.runtime,
|
|
1600
1612
|
moe: speedEstimate.moe
|
|
1601
1613
|
},
|
|
1602
|
-
components: { Q, S, F, C }
|
|
1614
|
+
components: { Q, S, F, C, H: capacityAdjustment.score },
|
|
1615
|
+
optimizeFor
|
|
1603
1616
|
};
|
|
1604
1617
|
}
|
|
1605
1618
|
|
|
@@ -1858,6 +1871,9 @@ class DeterministicModelSelector {
|
|
|
1858
1871
|
if (hardware.cpu.cores >= 8) base *= 1.1;
|
|
1859
1872
|
if (hardware.acceleration.supports_metal || hardware.acceleration.supports_cuda) base *= 1.2;
|
|
1860
1873
|
|
|
1874
|
+
const acceleratorScale = this.calculateAcceleratorSpeedScale(hardware, backend);
|
|
1875
|
+
base *= acceleratorScale.multiplier;
|
|
1876
|
+
|
|
1861
1877
|
const normalizedRuntime = normalizeMoERuntime(runtime);
|
|
1862
1878
|
const moe = estimateMoESpeedMultiplier({
|
|
1863
1879
|
model,
|
|
@@ -1880,7 +1896,46 @@ class DeterministicModelSelector {
|
|
|
1880
1896
|
estimatedTPS,
|
|
1881
1897
|
score,
|
|
1882
1898
|
runtime: normalizedRuntime,
|
|
1883
|
-
moe
|
|
1899
|
+
moe,
|
|
1900
|
+
acceleratorScale
|
|
1901
|
+
};
|
|
1902
|
+
}
|
|
1903
|
+
|
|
1904
|
+
calculateAcceleratorSpeedScale(hardware = {}, backend = 'cpu_x86') {
|
|
1905
|
+
if (backend !== 'cuda' && backend !== 'metal') {
|
|
1906
|
+
return { multiplier: 1, reason: null };
|
|
1907
|
+
}
|
|
1908
|
+
|
|
1909
|
+
const gpu = hardware.gpu || {};
|
|
1910
|
+
const memory = hardware.memory || {};
|
|
1911
|
+
const toFiniteNumber = (value, fallback = 0) => {
|
|
1912
|
+
const parsed = Number(value);
|
|
1913
|
+
return Number.isFinite(parsed) ? parsed : fallback;
|
|
1914
|
+
};
|
|
1915
|
+
const vramGB = toFiniteNumber(gpu.vramGB ?? gpu.vram ?? gpu.totalVRAM, 0);
|
|
1916
|
+
const ramGB = toFiniteNumber(memory.totalGB ?? memory.total, 0);
|
|
1917
|
+
const acceleratorMemoryGB = backend === 'metal' && Boolean(gpu.unified)
|
|
1918
|
+
? Math.max(vramGB, ramGB)
|
|
1919
|
+
: vramGB;
|
|
1920
|
+
const gpuCount = Math.max(1, toFiniteNumber(gpu.gpuCount ?? gpu.count, 1));
|
|
1921
|
+
|
|
1922
|
+
let multiplier = 1;
|
|
1923
|
+
if (acceleratorMemoryGB >= 160) multiplier *= 3.2;
|
|
1924
|
+
else if (acceleratorMemoryGB >= 96) multiplier *= 2.6;
|
|
1925
|
+
else if (acceleratorMemoryGB >= 80) multiplier *= 2.2;
|
|
1926
|
+
else if (acceleratorMemoryGB >= 48) multiplier *= 1.7;
|
|
1927
|
+
else if (acceleratorMemoryGB >= 24) multiplier *= 1.15;
|
|
1928
|
+
|
|
1929
|
+
if (backend === 'cuda' && gpuCount > 1) {
|
|
1930
|
+
multiplier *= Math.min(1.8, 1 + ((gpuCount - 1) * 0.25));
|
|
1931
|
+
}
|
|
1932
|
+
|
|
1933
|
+
const rounded = Math.round(multiplier * 100) / 100;
|
|
1934
|
+
return {
|
|
1935
|
+
multiplier: rounded,
|
|
1936
|
+
reason: rounded > 1
|
|
1937
|
+
? `${backend.toUpperCase()} capacity x${rounded}`
|
|
1938
|
+
: null
|
|
1884
1939
|
};
|
|
1885
1940
|
}
|
|
1886
1941
|
|
|
@@ -1888,13 +1943,79 @@ class DeterministicModelSelector {
|
|
|
1888
1943
|
const ratio = requiredGB / budgetGB;
|
|
1889
1944
|
if (ratio <= 0.9) return 100;
|
|
1890
1945
|
if (ratio <= 1.0) return 70;
|
|
1891
|
-
return 0; //
|
|
1946
|
+
return 0; // Unreachable in practice: evaluateModel drops requiredGB > budget.
|
|
1892
1947
|
}
|
|
1893
1948
|
|
|
1894
1949
|
calculateContextScore(model, targetCtx) {
|
|
1895
|
-
|
|
1896
|
-
if (
|
|
1897
|
-
|
|
1950
|
+
const ctxMax = Number(model?.ctxMax) || 0;
|
|
1951
|
+
if (ctxMax >= targetCtx) return 100;
|
|
1952
|
+
if (ctxMax >= targetCtx * 0.5) return 70;
|
|
1953
|
+
// Context is NOT pre-filtered: a model that cannot serve the requested
|
|
1954
|
+
// context still scores here (0 for this component) and stays eligible,
|
|
1955
|
+
// weighted down rather than excluded.
|
|
1956
|
+
return 0;
|
|
1957
|
+
}
|
|
1958
|
+
|
|
1959
|
+
getHighCapacitySizeTarget(budgetGB, hardware = {}) {
|
|
1960
|
+
if (!Number.isFinite(budgetGB) || budgetGB < 32) return null;
|
|
1961
|
+
|
|
1962
|
+
const isMultiGPU = Boolean(hardware?.gpu?.isMultiGPU);
|
|
1963
|
+
if (budgetGB >= 128) return { minParamsB: 30, sweetSpotParamsB: 70 };
|
|
1964
|
+
if (budgetGB >= 80) return { minParamsB: 30, sweetSpotParamsB: 70 };
|
|
1965
|
+
if (budgetGB >= 48) return { minParamsB: 20, sweetSpotParamsB: 34 };
|
|
1966
|
+
if (budgetGB >= 32 && isMultiGPU) return { minParamsB: 30, sweetSpotParamsB: 30 };
|
|
1967
|
+
if (budgetGB >= 32) return { minParamsB: 13, sweetSpotParamsB: 30 };
|
|
1968
|
+
return null;
|
|
1969
|
+
}
|
|
1970
|
+
|
|
1971
|
+
calculateHighCapacitySizeAdjustment(hardware, model, budgetGB, category, optimizeFor = 'balanced') {
|
|
1972
|
+
const objective = this.normalizeOptimizationObjective(optimizeFor);
|
|
1973
|
+
if (objective === 'speed' || category === 'embeddings') {
|
|
1974
|
+
return { score: 0, reason: null };
|
|
1975
|
+
}
|
|
1976
|
+
|
|
1977
|
+
const normalizedHardware = this.normalizeHardwareProfile(hardware || {});
|
|
1978
|
+
const tier = this.mapHardwareTier(normalizedHardware);
|
|
1979
|
+
const highCapacityTiers = new Set(['very_high', 'ultra_high', 'extreme', 'flagship']);
|
|
1980
|
+
const target = this.getHighCapacitySizeTarget(budgetGB, normalizedHardware);
|
|
1981
|
+
const hasHighCapacitySignal =
|
|
1982
|
+
Boolean(target) ||
|
|
1983
|
+
highCapacityTiers.has(tier) ||
|
|
1984
|
+
Number(normalizedHardware?.gpu?.vramGB || 0) >= 48;
|
|
1985
|
+
|
|
1986
|
+
if (!hasHighCapacitySignal || !target) {
|
|
1987
|
+
return { score: 0, reason: null };
|
|
1988
|
+
}
|
|
1989
|
+
|
|
1990
|
+
const params = this.parseBillionsValue(model?.paramsB);
|
|
1991
|
+
if (!Number.isFinite(params) || params <= 0) {
|
|
1992
|
+
return { score: 0, reason: null };
|
|
1993
|
+
}
|
|
1994
|
+
|
|
1995
|
+
const categoryMultiplier = category === 'multimodal' ? 0.6 : 1;
|
|
1996
|
+
if (params < target.minParamsB) {
|
|
1997
|
+
const deficitRatio = (target.minParamsB - params) / target.minParamsB;
|
|
1998
|
+
const penalty = -Math.min(24, deficitRatio * 24) * categoryMultiplier;
|
|
1999
|
+
const roundedPenalty = Math.round(penalty * 10) / 10;
|
|
2000
|
+
return {
|
|
2001
|
+
score: roundedPenalty,
|
|
2002
|
+
reason: `below ${target.minParamsB}B high-capacity floor`
|
|
2003
|
+
};
|
|
2004
|
+
}
|
|
2005
|
+
|
|
2006
|
+
const distanceRatio = Math.min(
|
|
2007
|
+
1,
|
|
2008
|
+
Math.abs(params - target.sweetSpotParamsB) / target.sweetSpotParamsB
|
|
2009
|
+
);
|
|
2010
|
+
const bonus = Math.max(0, 12 * (1 - distanceRatio)) * categoryMultiplier;
|
|
2011
|
+
const roundedBonus = Math.round(bonus * 10) / 10;
|
|
2012
|
+
|
|
2013
|
+
return {
|
|
2014
|
+
score: roundedBonus,
|
|
2015
|
+
reason: roundedBonus > 0
|
|
2016
|
+
? `${target.sweetSpotParamsB}B high-capacity target`
|
|
2017
|
+
: null
|
|
2018
|
+
};
|
|
1898
2019
|
}
|
|
1899
2020
|
|
|
1900
2021
|
estimatePracticalMaxParamsForBudget(budgetGB) {
|
|
@@ -1994,7 +2115,19 @@ class DeterministicModelSelector {
|
|
|
1994
2115
|
return highCapacityPromoted;
|
|
1995
2116
|
}
|
|
1996
2117
|
|
|
1997
|
-
buildRationale(
|
|
2118
|
+
buildRationale(
|
|
2119
|
+
hardware,
|
|
2120
|
+
model,
|
|
2121
|
+
quant,
|
|
2122
|
+
requiredGB,
|
|
2123
|
+
budget,
|
|
2124
|
+
category,
|
|
2125
|
+
Q,
|
|
2126
|
+
S,
|
|
2127
|
+
memoryEstimate = null,
|
|
2128
|
+
speedEstimate = null,
|
|
2129
|
+
capacityAdjustment = null
|
|
2130
|
+
) {
|
|
1998
2131
|
const parts = [];
|
|
1999
2132
|
|
|
2000
2133
|
// Memory fit
|
|
@@ -2027,6 +2160,14 @@ class DeterministicModelSelector {
|
|
|
2027
2160
|
const multiplier = Number(speedEstimate.moe.multiplier || 1).toFixed(2);
|
|
2028
2161
|
parts.push(`MoE speed x${multiplier} (${runtimeLabel})`);
|
|
2029
2162
|
}
|
|
2163
|
+
|
|
2164
|
+
if (speedEstimate?.acceleratorScale?.multiplier > 1) {
|
|
2165
|
+
parts.push(speedEstimate.acceleratorScale.reason);
|
|
2166
|
+
}
|
|
2167
|
+
|
|
2168
|
+
if (capacityAdjustment?.reason) {
|
|
2169
|
+
parts.push(capacityAdjustment.reason);
|
|
2170
|
+
}
|
|
2030
2171
|
|
|
2031
2172
|
// Size sweet spot
|
|
2032
2173
|
if (model.paramsB >= 7 && model.paramsB <= 13) {
|
|
@@ -2114,14 +2255,21 @@ class DeterministicModelSelector {
|
|
|
2114
2255
|
|
|
2115
2256
|
updateCandidateWithMeasuredSpeed(candidate, measuredTPS, category) {
|
|
2116
2257
|
const normalizedS = this.normalizeTPSToScore(measuredTPS, category);
|
|
2117
|
-
|
|
2118
|
-
//
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2258
|
+
|
|
2259
|
+
// Re-score with the measured speed using the SAME weighting source as
|
|
2260
|
+
// evaluateModel: getScoringWeights honours the user's optimizeFor profile and
|
|
2261
|
+
// falls back to the general weights for categories (e.g. 'talking') that have
|
|
2262
|
+
// no entry in DETERMINISTIC_WEIGHTS — indexing this.categoryWeights[category]
|
|
2263
|
+
// directly threw a TypeError for those. We also re-add the stored capacity
|
|
2264
|
+
// adjustment (H) and clamp, so a probed score stays comparable to a
|
|
2265
|
+
// non-probed one instead of being silently lower.
|
|
2266
|
+
const weights = this.getScoringWeights(category, candidate.optimizeFor || 'balanced');
|
|
2267
|
+
const { Q, F, C, H = 0 } = candidate.components;
|
|
2268
|
+
|
|
2122
2269
|
candidate.estTPS = measuredTPS;
|
|
2123
2270
|
candidate.components.S = normalizedS;
|
|
2124
|
-
|
|
2271
|
+
const weighted = Q * weights[0] + normalizedS * weights[1] + F * weights[2] + C * weights[3];
|
|
2272
|
+
candidate.score = Math.max(0, Math.min(100, Math.round((weighted + H) * 10) / 10));
|
|
2125
2273
|
}
|
|
2126
2274
|
|
|
2127
2275
|
normalizeTPSToScore(tps, category) {
|
|
@@ -2194,6 +2342,9 @@ class DeterministicModelSelector {
|
|
|
2194
2342
|
estimatedRAM: candidate.requiredGB,
|
|
2195
2343
|
reasoning: candidate.rationale,
|
|
2196
2344
|
runtime: candidate.runtime || candidate.speed?.runtime || 'ollama',
|
|
2345
|
+
installCommand: candidate.meta.installCommand || provenance.install_command || '',
|
|
2346
|
+
downloadUrl: candidate.meta.downloadUrl || provenance.download_url || '',
|
|
2347
|
+
artifactFormat: candidate.meta.artifact?.format || '',
|
|
2197
2348
|
memoryAssumptionSource: candidate.memory?.assumptionSource || 'dense_params',
|
|
2198
2349
|
speedAssumptions: candidate.speed?.moe ? {
|
|
2199
2350
|
applied: Boolean(candidate.speed.moe.applied),
|
|
@@ -2375,19 +2526,24 @@ class DeterministicModelSelector {
|
|
|
2375
2526
|
Object.entries(recommendations).forEach(([category, data]) => {
|
|
2376
2527
|
const bestModel = data.bestModels[0];
|
|
2377
2528
|
if (bestModel) {
|
|
2529
|
+
const command = bestModel.installCommand ||
|
|
2530
|
+
bestModel.provenance?.install_command ||
|
|
2531
|
+
`ollama pull ${bestModel.model_identifier}`;
|
|
2378
2532
|
summary.by_category[category] = {
|
|
2379
2533
|
name: bestModel.model_name || bestModel.name,
|
|
2380
2534
|
identifier: bestModel.model_identifier,
|
|
2381
2535
|
score: Math.round(bestModel.categoryScore || bestModel.score),
|
|
2382
|
-
command
|
|
2536
|
+
command,
|
|
2383
2537
|
size: this.formatModelSize(bestModel),
|
|
2384
2538
|
quantization: bestModel.quantization || bestModel.quant || 'Q4_K_M',
|
|
2539
|
+
runtime: bestModel.runtime || bestModel.provenance?.runtime || 'ollama',
|
|
2385
2540
|
pulls: bestModel.pulls || 0,
|
|
2386
2541
|
source: bestModel.source || bestModel.provenance?.source || 'unknown',
|
|
2387
2542
|
registry: bestModel.registry || bestModel.provenance?.registry || 'unknown',
|
|
2388
2543
|
version: bestModel.version || bestModel.provenance?.version || 'unknown',
|
|
2389
2544
|
license: bestModel.license || bestModel.provenance?.license || 'unknown',
|
|
2390
2545
|
digest: bestModel.digest || bestModel.provenance?.digest || 'unknown',
|
|
2546
|
+
download_url: bestModel.downloadUrl || bestModel.provenance?.download_url || '',
|
|
2391
2547
|
provenance: bestModel.provenance || {
|
|
2392
2548
|
source: bestModel.source || 'unknown',
|
|
2393
2549
|
registry: bestModel.registry || 'unknown',
|
|
@@ -2397,7 +2553,7 @@ class DeterministicModelSelector {
|
|
|
2397
2553
|
}
|
|
2398
2554
|
};
|
|
2399
2555
|
|
|
2400
|
-
summary.quick_commands.push(
|
|
2556
|
+
summary.quick_commands.push(command);
|
|
2401
2557
|
|
|
2402
2558
|
const isGeneralCategory = ['general', 'coding', 'talking', 'reading'].includes(category);
|
|
2403
2559
|
const score = bestModel.categoryScore || bestModel.score || 0;
|
|
@@ -2411,18 +2567,23 @@ class DeterministicModelSelector {
|
|
|
2411
2567
|
});
|
|
2412
2568
|
|
|
2413
2569
|
if (bestOverallModel) {
|
|
2570
|
+
const command = bestOverallModel.installCommand ||
|
|
2571
|
+
bestOverallModel.provenance?.install_command ||
|
|
2572
|
+
`ollama pull ${bestOverallModel.model_identifier}`;
|
|
2414
2573
|
summary.best_overall = {
|
|
2415
2574
|
name: bestOverallModel.model_name || bestOverallModel.name,
|
|
2416
2575
|
identifier: bestOverallModel.model_identifier,
|
|
2417
2576
|
category: bestOverallCategory,
|
|
2418
2577
|
score: Math.round(bestOverallScore),
|
|
2419
|
-
command
|
|
2578
|
+
command,
|
|
2420
2579
|
quantization: bestOverallModel.quantization || bestOverallModel.quant || 'Q4_K_M',
|
|
2580
|
+
runtime: bestOverallModel.runtime || bestOverallModel.provenance?.runtime || 'ollama',
|
|
2421
2581
|
source: bestOverallModel.source || bestOverallModel.provenance?.source || 'unknown',
|
|
2422
2582
|
registry: bestOverallModel.registry || bestOverallModel.provenance?.registry || 'unknown',
|
|
2423
2583
|
version: bestOverallModel.version || bestOverallModel.provenance?.version || 'unknown',
|
|
2424
2584
|
license: bestOverallModel.license || bestOverallModel.provenance?.license || 'unknown',
|
|
2425
2585
|
digest: bestOverallModel.digest || bestOverallModel.provenance?.digest || 'unknown',
|
|
2586
|
+
download_url: bestOverallModel.downloadUrl || bestOverallModel.provenance?.download_url || '',
|
|
2426
2587
|
provenance: bestOverallModel.provenance || {
|
|
2427
2588
|
source: bestOverallModel.source || 'unknown',
|
|
2428
2589
|
registry: bestOverallModel.registry || 'unknown',
|
|
@@ -1007,18 +1007,22 @@ class ExpandedModelsDatabase {
|
|
|
1007
1007
|
}
|
|
1008
1008
|
|
|
1009
1009
|
estimateMemoryUsage(model) {
|
|
1010
|
-
|
|
1010
|
+
// Derive footprint from parameter count, not by stripping the unit off the
|
|
1011
|
+
// size string and treating the bare number as gigabytes — that read a 774M
|
|
1012
|
+
// model ("774M") as ~774 GB and a 22M model as ~22 GB. ~0.7 GB per 1B params
|
|
1013
|
+
// is a reasonable quantized-runtime footprint baseline.
|
|
1014
|
+
const sizeGB = this.extractModelParams(model) * 0.7;
|
|
1011
1015
|
|
|
1012
1016
|
// Rough estimates including model loading overhead
|
|
1013
1017
|
return {
|
|
1014
|
-
minimal: Math.round(sizeGB * 1.2), // With quantization
|
|
1015
|
-
typical: Math.round(sizeGB * 1.5), // Standard loading
|
|
1016
|
-
maximum: Math.round(sizeGB * 2.0) // With full context
|
|
1018
|
+
minimal: Math.max(1, Math.round(sizeGB * 1.2)), // With quantization
|
|
1019
|
+
typical: Math.max(1, Math.round(sizeGB * 1.5)), // Standard loading
|
|
1020
|
+
maximum: Math.max(1, Math.round(sizeGB * 2.0)) // With full context
|
|
1017
1021
|
};
|
|
1018
1022
|
}
|
|
1019
1023
|
|
|
1020
1024
|
estimatePowerConsumption(model, hardware) {
|
|
1021
|
-
const sizeGB =
|
|
1025
|
+
const sizeGB = this.extractModelParams(model) * 0.7;
|
|
1022
1026
|
const tier = this.getHardwareTier(hardware);
|
|
1023
1027
|
|
|
1024
1028
|
const basePower = {
|
|
@@ -10,6 +10,7 @@ const ScoringEngine = require('./scoring-engine');
|
|
|
10
10
|
const UnifiedDetector = require('../hardware/unified-detector');
|
|
11
11
|
const PolicyManager = require('../policy/policy-manager');
|
|
12
12
|
const PolicyEngine = require('../policy/policy-engine');
|
|
13
|
+
const { rankModels } = require('./scoring-core');
|
|
13
14
|
|
|
14
15
|
function isPlainObject(value) {
|
|
15
16
|
return typeof value === 'object' && value !== null && !Array.isArray(value);
|
|
@@ -66,7 +67,9 @@ class IntelligentSelector {
|
|
|
66
67
|
// Apply filters
|
|
67
68
|
const filtered = this.applyFilters(variants, opts, hardware);
|
|
68
69
|
|
|
69
|
-
// Score all filtered variants
|
|
70
|
+
// Score all filtered variants. ScoringEngine still produces the
|
|
71
|
+
// per-variant `score` objects (final/components/meta) consumed by the
|
|
72
|
+
// smart-recommend display, but the RANKING is unified below.
|
|
70
73
|
const scored = this.scoring.filterAndScore(filtered, hardware, {
|
|
71
74
|
useCase: opts.useCase,
|
|
72
75
|
targetContext: opts.targetContext,
|
|
@@ -75,6 +78,12 @@ class IntelligentSelector {
|
|
|
75
78
|
headroom: opts.headroom || 2
|
|
76
79
|
});
|
|
77
80
|
|
|
81
|
+
// Unify ranking with the canonical scoring core (issue #88): re-order
|
|
82
|
+
// the scored list and rewrite each item's final score using the shared
|
|
83
|
+
// DeterministicModelSelector so smart-recommend agrees with
|
|
84
|
+
// `check`/`recommend` and inherits the PR #89 high-capacity floor.
|
|
85
|
+
await this.applyUnifiedRanking(scored, hardware, opts);
|
|
86
|
+
|
|
78
87
|
const policyEngine = this.resolvePolicyEngine(opts);
|
|
79
88
|
const scoredWithPolicy = policyEngine.evaluateScoredVariants(
|
|
80
89
|
scored,
|
|
@@ -114,6 +123,83 @@ class IntelligentSelector {
|
|
|
114
123
|
};
|
|
115
124
|
}
|
|
116
125
|
|
|
126
|
+
/**
|
|
127
|
+
* Re-rank the ScoringEngine-scored variants using the canonical scoring
|
|
128
|
+
* core so smart-recommend's ordering and headline scores match
|
|
129
|
+
* `check`/`recommend` and inherit the high-capacity right-sizing floor.
|
|
130
|
+
*
|
|
131
|
+
* Mutates `scored` in place: it is sorted by the unified score and each
|
|
132
|
+
* item's `score.final` is overwritten with the canonical 0-100 score.
|
|
133
|
+
* Component/meta sub-scores are left intact so the existing display (which
|
|
134
|
+
* shows Q/S/F and estimated TPS) keeps working. If the core cannot rank a
|
|
135
|
+
* variant (or throws), that item keeps its original ScoringEngine score and
|
|
136
|
+
* sorts after the unified ones, preserving a sensible fallback ordering.
|
|
137
|
+
*/
|
|
138
|
+
async applyUnifiedRanking(scored, hardware, opts = {}) {
|
|
139
|
+
if (!Array.isArray(scored) || scored.length === 0) return scored;
|
|
140
|
+
|
|
141
|
+
let ranking;
|
|
142
|
+
try {
|
|
143
|
+
ranking = await rankModels(
|
|
144
|
+
scored.map((item) => item.variant),
|
|
145
|
+
hardware,
|
|
146
|
+
{
|
|
147
|
+
category: opts.useCase || 'general',
|
|
148
|
+
optimizeFor: opts.optimizeFor || opts.optimize || 'balanced',
|
|
149
|
+
runtime: opts.runtime || 'ollama',
|
|
150
|
+
topN: scored.length
|
|
151
|
+
}
|
|
152
|
+
);
|
|
153
|
+
} catch (error) {
|
|
154
|
+
return scored; // Defensive: keep original ScoringEngine ordering.
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
if (!ranking || !Array.isArray(ranking.candidates)) return scored;
|
|
158
|
+
|
|
159
|
+
// Map each source variant -> its unified score + ordering index.
|
|
160
|
+
const unifiedByVariant = new Map();
|
|
161
|
+
ranking.candidates.forEach((candidate, index) => {
|
|
162
|
+
const source = candidate?.meta?.__source;
|
|
163
|
+
if (!source) return;
|
|
164
|
+
unifiedByVariant.set(source, {
|
|
165
|
+
unifiedScore: Math.round(candidate.score * 10) / 10,
|
|
166
|
+
rank: index,
|
|
167
|
+
quant: candidate.quant,
|
|
168
|
+
estimatedTPS: candidate.estTPS
|
|
169
|
+
});
|
|
170
|
+
});
|
|
171
|
+
|
|
172
|
+
for (const item of scored) {
|
|
173
|
+
const unified = unifiedByVariant.get(item.variant);
|
|
174
|
+
if (!unified) {
|
|
175
|
+
// Not ranked by the core (e.g. filtered out): sort last and tag
|
|
176
|
+
// so any downstream tie-breaks are deterministic.
|
|
177
|
+
item.__unifiedRank = Number.MAX_SAFE_INTEGER;
|
|
178
|
+
continue;
|
|
179
|
+
}
|
|
180
|
+
item.__unifiedRank = unified.rank;
|
|
181
|
+
if (item.score) {
|
|
182
|
+
item.score.final = Math.min(100, Math.max(0, Math.round(unified.unifiedScore)));
|
|
183
|
+
if (item.score.meta) {
|
|
184
|
+
item.score.meta.unifiedScore = unified.unifiedScore;
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
scored.sort((a, b) => {
|
|
190
|
+
const ra = Number.isFinite(a.__unifiedRank) ? a.__unifiedRank : Number.MAX_SAFE_INTEGER;
|
|
191
|
+
const rb = Number.isFinite(b.__unifiedRank) ? b.__unifiedRank : Number.MAX_SAFE_INTEGER;
|
|
192
|
+
if (ra !== rb) return ra - rb;
|
|
193
|
+
return (b.score?.final || 0) - (a.score?.final || 0);
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
for (const item of scored) {
|
|
197
|
+
delete item.__unifiedRank;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
return scored;
|
|
201
|
+
}
|
|
202
|
+
|
|
117
203
|
/**
|
|
118
204
|
* Resolve policy engine from explicit options, in-memory policy, or policy file.
|
|
119
205
|
*/
|
|
@@ -24,6 +24,14 @@ const MOE_RUNTIME_PROFILES = Object.freeze({
|
|
|
24
24
|
maxEffectiveGain: 2.65,
|
|
25
25
|
notes: ['optimized scheduler', 'better expert batching', 'lower offload pressure']
|
|
26
26
|
}),
|
|
27
|
+
transformers: Object.freeze({
|
|
28
|
+
runtime: 'transformers',
|
|
29
|
+
routingOverhead: 0.15,
|
|
30
|
+
communicationOverhead: 0.10,
|
|
31
|
+
offloadOverhead: 0.06,
|
|
32
|
+
maxEffectiveGain: 2.45,
|
|
33
|
+
notes: ['general Hugging Face path', 'broad architecture support', 'higher Python overhead than vLLM']
|
|
34
|
+
}),
|
|
27
35
|
mlx: Object.freeze({
|
|
28
36
|
runtime: 'mlx',
|
|
29
37
|
routingOverhead: 0.16,
|
|
@@ -45,6 +53,9 @@ const MOE_RUNTIME_PROFILES = Object.freeze({
|
|
|
45
53
|
const RUNTIME_ALIASES = Object.freeze({
|
|
46
54
|
ollama: 'ollama',
|
|
47
55
|
vllm: 'vllm',
|
|
56
|
+
transformers: 'transformers',
|
|
57
|
+
'huggingface-transformers': 'transformers',
|
|
58
|
+
hf: 'transformers',
|
|
48
59
|
mlx: 'mlx',
|
|
49
60
|
'mlx-lm': 'mlx',
|
|
50
61
|
mlx_lm: 'mlx',
|
|
@@ -134,17 +134,22 @@ class RequirementsCalculator {
|
|
|
134
134
|
}
|
|
135
135
|
|
|
136
136
|
parseModelSize(sizeString) {
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
137
|
+
// Anchor the number to its unit instead of globally stripping every char
|
|
138
|
+
// that isn't 0-9.kmb: the old approach kept stray k/m/b from model words, so
|
|
139
|
+
// "Llama 3.2 3B" normalized to "m3.23b" and parsed as 0.003B, and unit-only
|
|
140
|
+
// inputs produced NaN. Prefer a number that carries a B/M/K unit (the real
|
|
141
|
+
// size token, "3B") over a bare number (a version like "3.2").
|
|
142
|
+
const text = String(sizeString || '');
|
|
143
|
+
const match = text.match(/(\d+(?:\.\d+)?)\s*([kmb])\b/i) || text.match(/(\d+(?:\.\d+)?)/);
|
|
144
|
+
if (!match) return 1;
|
|
145
|
+
|
|
146
|
+
const value = parseFloat(match[1]);
|
|
147
|
+
if (!Number.isFinite(value)) return 1;
|
|
148
|
+
|
|
149
|
+
const unit = (match[2] || 'b').toLowerCase();
|
|
150
|
+
if (unit === 'k') return value / 1_000_000; // thousands of params -> billions
|
|
151
|
+
if (unit === 'm') return value / 1000; // millions of params -> billions
|
|
152
|
+
return value; // billions
|
|
148
153
|
}
|
|
149
154
|
|
|
150
155
|
getContextMultiplier(contextLength) {
|