llm-checker 3.2.5 → 3.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +27 -9
- package/README.md +72 -8
- package/bin/enhanced_cli.js +13 -2
- package/package.json +2 -2
- package/src/hardware/backends/rocm-detector.js +20 -1
- package/src/hardware/detector.js +75 -10
- package/src/hardware/unified-detector.js +49 -10
- package/src/index.js +19 -4
- package/src/models/deterministic-selector.js +712 -38
- package/src/models/intelligent-selector.js +2 -0
- package/src/models/moe-assumptions.js +311 -0
- package/src/models/scoring-engine.js +38 -13
|
@@ -10,6 +10,14 @@ const path = require('path');
|
|
|
10
10
|
const os = require('os');
|
|
11
11
|
const { spawn } = require('child_process');
|
|
12
12
|
const { DETERMINISTIC_WEIGHTS } = require('./scoring-config');
|
|
13
|
+
const {
|
|
14
|
+
parseBillionsValue: parseMoEBillionsValue,
|
|
15
|
+
parsePositiveNumber: parseMoEPositiveNumber,
|
|
16
|
+
normalizeMoERuntime,
|
|
17
|
+
extractMoEMetadata: extractCanonicalMoEMetadata,
|
|
18
|
+
resolveMoEParameterProfile,
|
|
19
|
+
estimateMoESpeedMultiplier
|
|
20
|
+
} = require('./moe-assumptions');
|
|
13
21
|
|
|
14
22
|
class DeterministicModelSelector {
|
|
15
23
|
constructor() {
|
|
@@ -33,6 +41,7 @@ class DeterministicModelSelector {
|
|
|
33
41
|
// Family quality bumps
|
|
34
42
|
this.familyBumps = {
|
|
35
43
|
'qwen2.5': 2,
|
|
44
|
+
'qwen3': 4,
|
|
36
45
|
'deepseek': 3,
|
|
37
46
|
'mistral': 1,
|
|
38
47
|
'llama3.1': 1,
|
|
@@ -101,6 +110,28 @@ class DeterministicModelSelector {
|
|
|
101
110
|
|
|
102
111
|
// Category scoring weights [Q, S, F, C] from centralized config
|
|
103
112
|
this.categoryWeights = DETERMINISTIC_WEIGHTS;
|
|
113
|
+
|
|
114
|
+
// User optimization profile overrides [Q, S, F, C]
|
|
115
|
+
this.optimizationProfiles = {
|
|
116
|
+
balanced: null,
|
|
117
|
+
speed: [0.25, 0.55, 0.15, 0.05],
|
|
118
|
+
quality: [0.65, 0.10, 0.15, 0.10],
|
|
119
|
+
context: [0.30, 0.10, 0.20, 0.40],
|
|
120
|
+
coding: [0.55, 0.25, 0.10, 0.10]
|
|
121
|
+
};
|
|
122
|
+
|
|
123
|
+
this.freshnessThresholds = {
|
|
124
|
+
staleDays: 365,
|
|
125
|
+
veryStaleDays: 730,
|
|
126
|
+
indexCadenceDays: 14
|
|
127
|
+
};
|
|
128
|
+
|
|
129
|
+
this.modelIndexStatus = {
|
|
130
|
+
source: 'unknown',
|
|
131
|
+
ageDays: null,
|
|
132
|
+
isStale: false,
|
|
133
|
+
cachedAt: null
|
|
134
|
+
};
|
|
104
135
|
}
|
|
105
136
|
|
|
106
137
|
// ============================================================================
|
|
@@ -148,6 +179,7 @@ class DeterministicModelSelector {
|
|
|
148
179
|
const gpu = input.gpu || {};
|
|
149
180
|
const memory = input.memory || {};
|
|
150
181
|
const acceleration = input.acceleration || {};
|
|
182
|
+
const gpuEntries = Array.isArray(gpu.all) ? gpu.all : [];
|
|
151
183
|
|
|
152
184
|
const totalMemGB =
|
|
153
185
|
toNumber(memory.totalGB) ??
|
|
@@ -156,21 +188,62 @@ class DeterministicModelSelector {
|
|
|
156
188
|
toNumber(input.memoryGB) ??
|
|
157
189
|
8;
|
|
158
190
|
|
|
191
|
+
const modelHints = `${gpu.model || ''} ${gpu.vendor || ''} ${gpu.type || ''}`.toLowerCase();
|
|
192
|
+
const inferredUnified =
|
|
193
|
+
Boolean(gpu.unified) ||
|
|
194
|
+
/apple|m1|m2|m3|m4|unified/.test(modelHints);
|
|
195
|
+
|
|
196
|
+
const utilizationFactor = inferredUnified ? 0.85 : 0.8;
|
|
197
|
+
const memoryHeadroomGB = inferredUnified ? 1.5 : 2;
|
|
159
198
|
const usableMemGB =
|
|
160
199
|
toNumber(input.usableMemGB) ??
|
|
161
|
-
Math.max(1, Math.min(
|
|
200
|
+
Math.max(1, Math.min(utilizationFactor * totalMemGB, totalMemGB - memoryHeadroomGB));
|
|
201
|
+
|
|
202
|
+
const gpuCount =
|
|
203
|
+
toNumber(gpu.gpuCount) ??
|
|
204
|
+
toNumber(gpu.count) ??
|
|
205
|
+
(gpuEntries.length > 0 ? gpuEntries.length : null) ??
|
|
206
|
+
toNumber(input.gpuCount) ??
|
|
207
|
+
1;
|
|
208
|
+
|
|
209
|
+
const vramPerGPU =
|
|
210
|
+
toNumber(gpu.vramPerGPU) ??
|
|
211
|
+
toNumber(input.vramPerGPU) ??
|
|
212
|
+
null;
|
|
213
|
+
|
|
214
|
+
const summedEntryVRAM = gpuEntries.reduce((sum, entry) => {
|
|
215
|
+
return sum + (
|
|
216
|
+
toNumber(entry?.vramGB) ??
|
|
217
|
+
toNumber(entry?.vram) ??
|
|
218
|
+
toNumber(entry?.totalVRAM) ??
|
|
219
|
+
0
|
|
220
|
+
);
|
|
221
|
+
}, 0);
|
|
162
222
|
|
|
163
|
-
const
|
|
223
|
+
const explicitTotalVRAM =
|
|
224
|
+
toNumber(gpu.totalVRAM) ??
|
|
225
|
+
toNumber(input.totalVRAM) ??
|
|
226
|
+
toNumber(input.gpuTotalVRAM) ??
|
|
227
|
+
(summedEntryVRAM > 0 ? summedEntryVRAM : null);
|
|
228
|
+
|
|
229
|
+
const directVRAM =
|
|
164
230
|
toNumber(gpu.vramGB) ??
|
|
165
231
|
toNumber(gpu.vram) ??
|
|
166
|
-
|
|
167
|
-
|
|
232
|
+
null;
|
|
233
|
+
|
|
234
|
+
let vramGB =
|
|
235
|
+
explicitTotalVRAM ??
|
|
236
|
+
directVRAM ??
|
|
168
237
|
0;
|
|
169
238
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
239
|
+
// Multi-GPU fallback when only per-GPU memory is known.
|
|
240
|
+
if (!explicitTotalVRAM && gpuCount > 1) {
|
|
241
|
+
if (vramPerGPU) {
|
|
242
|
+
vramGB = vramPerGPU * gpuCount;
|
|
243
|
+
} else if (directVRAM && Boolean(gpu.isMultiGPU || input.isMultiGPU)) {
|
|
244
|
+
vramGB = Math.max(directVRAM, directVRAM * gpuCount);
|
|
245
|
+
}
|
|
246
|
+
}
|
|
174
247
|
|
|
175
248
|
let gpuType = gpu.type;
|
|
176
249
|
if (!gpuType) {
|
|
@@ -206,6 +279,9 @@ class DeterministicModelSelector {
|
|
|
206
279
|
...gpu,
|
|
207
280
|
type: gpuType,
|
|
208
281
|
vramGB,
|
|
282
|
+
vramPerGPU: vramPerGPU ?? (gpuCount > 0 ? (vramGB > 0 ? vramGB / gpuCount : 0) : 0),
|
|
283
|
+
gpuCount,
|
|
284
|
+
isMultiGPU: Boolean(gpu.isMultiGPU || gpuCount > 1),
|
|
209
285
|
unified: inferredUnified
|
|
210
286
|
},
|
|
211
287
|
memory: {
|
|
@@ -217,6 +293,43 @@ class DeterministicModelSelector {
|
|
|
217
293
|
};
|
|
218
294
|
}
|
|
219
295
|
|
|
296
|
+
normalizeOptimizationObjective(objective) {
|
|
297
|
+
if (!objective) return 'balanced';
|
|
298
|
+
const normalized = String(objective).toLowerCase().trim();
|
|
299
|
+
if (['balanced', 'default', 'auto'].includes(normalized)) return 'balanced';
|
|
300
|
+
if (['speed', 'fast', 'latency', 'throughput'].includes(normalized)) return 'speed';
|
|
301
|
+
if (['quality', 'accurate', 'accuracy'].includes(normalized)) return 'quality';
|
|
302
|
+
if (['context', 'long-context', 'long_context', 'memory'].includes(normalized)) return 'context';
|
|
303
|
+
if (['coding', 'code', 'developer'].includes(normalized)) return 'coding';
|
|
304
|
+
return 'balanced';
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
getScoringWeights(category, optimizeFor = 'balanced') {
|
|
308
|
+
const base = this.categoryWeights[category] || this.categoryWeights.general;
|
|
309
|
+
const objective = this.normalizeOptimizationObjective(optimizeFor);
|
|
310
|
+
const objectiveWeights = this.optimizationProfiles[objective];
|
|
311
|
+
|
|
312
|
+
if (!objectiveWeights) {
|
|
313
|
+
return base;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// Blend category semantics with requested profile, but keep explicit
|
|
317
|
+
// user intent dominant (especially for quality/context priorities).
|
|
318
|
+
const objectivePriorities = {
|
|
319
|
+
speed: 0.8,
|
|
320
|
+
quality: 0.95,
|
|
321
|
+
context: 0.85,
|
|
322
|
+
coding: 0.8
|
|
323
|
+
};
|
|
324
|
+
const objectivePriority = objectivePriorities[objective] || 0.75;
|
|
325
|
+
const categoryPriority = 1 - objectivePriority;
|
|
326
|
+
|
|
327
|
+
return base.map((weight, idx) => {
|
|
328
|
+
const blended = (weight * categoryPriority) + (objectiveWeights[idx] * objectivePriority);
|
|
329
|
+
return Math.round(blended * 1000) / 1000;
|
|
330
|
+
});
|
|
331
|
+
}
|
|
332
|
+
|
|
220
333
|
async getCPUInfo() {
|
|
221
334
|
const os = require('os');
|
|
222
335
|
return {
|
|
@@ -511,7 +624,8 @@ class DeterministicModelSelector {
|
|
|
511
624
|
if (!fs.existsSync(cachePath)) continue;
|
|
512
625
|
const raw = JSON.parse(fs.readFileSync(cachePath, 'utf8'));
|
|
513
626
|
const sourceModels = Array.isArray(raw) ? raw : (raw.models || []);
|
|
514
|
-
const
|
|
627
|
+
const indexMeta = this.extractModelIndexMetadata(raw, cachePath);
|
|
628
|
+
const normalized = this.normalizeExternalModels(sourceModels, { indexMeta });
|
|
515
629
|
if (normalized.length > 0) return normalized;
|
|
516
630
|
} catch (error) {
|
|
517
631
|
// Ignore broken cache files and keep trying fallbacks
|
|
@@ -520,8 +634,28 @@ class DeterministicModelSelector {
|
|
|
520
634
|
return [];
|
|
521
635
|
}
|
|
522
636
|
|
|
523
|
-
|
|
637
|
+
extractModelIndexMetadata(raw, sourcePath = '') {
|
|
638
|
+
const cachedAtRaw = raw?.cached_at || raw?.generated_at || raw?.last_updated || null;
|
|
639
|
+
const cachedAt = this.parseDateSafe(cachedAtRaw);
|
|
640
|
+
const ageDays = cachedAt
|
|
641
|
+
? Math.max(0, (Date.now() - cachedAt.getTime()) / (1000 * 60 * 60 * 24))
|
|
642
|
+
: null;
|
|
643
|
+
const isStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.indexCadenceDays;
|
|
644
|
+
|
|
645
|
+
const status = {
|
|
646
|
+
source: sourcePath || 'cache',
|
|
647
|
+
ageDays: Number.isFinite(ageDays) ? Math.round(ageDays * 10) / 10 : null,
|
|
648
|
+
isStale: Boolean(isStale),
|
|
649
|
+
cachedAt: cachedAt ? cachedAt.toISOString() : null
|
|
650
|
+
};
|
|
651
|
+
|
|
652
|
+
this.modelIndexStatus = status;
|
|
653
|
+
return status;
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
normalizeExternalModels(models = [], context = {}) {
|
|
524
657
|
const normalized = [];
|
|
658
|
+
const indexMeta = context.indexMeta || this.modelIndexStatus || {};
|
|
525
659
|
|
|
526
660
|
for (const model of models) {
|
|
527
661
|
if (!model || typeof model !== 'object') continue;
|
|
@@ -531,17 +665,23 @@ class DeterministicModelSelector {
|
|
|
531
665
|
typeof model.ctxMax === 'number' &&
|
|
532
666
|
model.model_identifier;
|
|
533
667
|
|
|
668
|
+
const freshness = this.computeFreshnessMetadata(model, indexMeta);
|
|
669
|
+
const quantizations = this.extractAvailableQuantizations(model, model.variants || []);
|
|
670
|
+
|
|
534
671
|
if (alreadyNormalized) {
|
|
535
672
|
normalized.push({
|
|
536
673
|
...model,
|
|
537
674
|
tags: Array.isArray(model.tags) ? model.tags : [],
|
|
538
675
|
modalities: Array.isArray(model.modalities) ? model.modalities : ['text'],
|
|
539
676
|
installed: Boolean(model.installed),
|
|
677
|
+
availableQuantizations: model.availableQuantizations || quantizations,
|
|
678
|
+
sizeByQuant: model.sizeByQuant || {},
|
|
540
679
|
source: model.source || 'ollama_database',
|
|
541
680
|
registry: model.registry || 'ollama.com',
|
|
542
681
|
version: model.version || model.model_identifier,
|
|
543
682
|
license: model.license || 'unknown',
|
|
544
683
|
digest: model.digest || 'unknown',
|
|
684
|
+
...freshness,
|
|
545
685
|
provenance: model.provenance || {
|
|
546
686
|
source: model.source || 'ollama_database',
|
|
547
687
|
registry: model.registry || 'ollama.com',
|
|
@@ -553,7 +693,7 @@ class DeterministicModelSelector {
|
|
|
553
693
|
continue;
|
|
554
694
|
}
|
|
555
695
|
|
|
556
|
-
const converted = this.convertOllamaModelToDeterministicModels(model);
|
|
696
|
+
const converted = this.convertOllamaModelToDeterministicModels(model, { indexMeta });
|
|
557
697
|
normalized.push(...converted);
|
|
558
698
|
}
|
|
559
699
|
|
|
@@ -567,12 +707,14 @@ class DeterministicModelSelector {
|
|
|
567
707
|
return [...deduped.values()];
|
|
568
708
|
}
|
|
569
709
|
|
|
570
|
-
convertOllamaModelToDeterministicModels(ollamaModel) {
|
|
710
|
+
convertOllamaModelToDeterministicModels(ollamaModel, context = {}) {
|
|
571
711
|
const baseIdentifier = ollamaModel.model_identifier || ollamaModel.model_name || 'unknown';
|
|
572
712
|
const fallbackTag = `${baseIdentifier}:latest`;
|
|
573
713
|
const variants = Array.isArray(ollamaModel.variants) && ollamaModel.variants.length > 0
|
|
574
714
|
? ollamaModel.variants
|
|
575
715
|
: [{ tag: ollamaModel.model_identifier || fallbackTag }];
|
|
716
|
+
const indexMeta = context.indexMeta || this.modelIndexStatus || {};
|
|
717
|
+
const freshness = this.computeFreshnessMetadata(ollamaModel, indexMeta);
|
|
576
718
|
|
|
577
719
|
const contextLength = this.parseContextLength(
|
|
578
720
|
ollamaModel.context_length ||
|
|
@@ -614,6 +756,7 @@ class DeterministicModelSelector {
|
|
|
614
756
|
ollamaModel.main_size,
|
|
615
757
|
ollamaModel.model_identifier
|
|
616
758
|
);
|
|
759
|
+
const moeMetadata = this.extractMoEMetadata(ollamaModel, variant, paramsB, baseText);
|
|
617
760
|
const quant = this.normalizeQuantization(
|
|
618
761
|
variant.quantization ||
|
|
619
762
|
this.extractQuantizationFromTag(variantTag) ||
|
|
@@ -623,17 +766,67 @@ class DeterministicModelSelector {
|
|
|
623
766
|
const variantSizeGB = this.extractVariantSizeGB(variant, paramsB);
|
|
624
767
|
const modalities = this.inferModalities(ollamaModel, variantTag);
|
|
625
768
|
const modelTags = this.inferTagsForVariant(derivedTags, variant, variantTag);
|
|
769
|
+
const sizeByQuant = {};
|
|
770
|
+
|
|
771
|
+
for (const sibling of variants) {
|
|
772
|
+
const siblingParams = this.extractParamsFromString(
|
|
773
|
+
sibling.size,
|
|
774
|
+
sibling.tag,
|
|
775
|
+
ollamaModel.main_size,
|
|
776
|
+
ollamaModel.model_identifier
|
|
777
|
+
);
|
|
778
|
+
|
|
779
|
+
// Keep quantization map parameter-aware: don't blend 8B/70B/405B sizes.
|
|
780
|
+
if (Math.abs(siblingParams - paramsB) > 0.25) continue;
|
|
781
|
+
|
|
782
|
+
const siblingQuant = this.normalizeQuantization(
|
|
783
|
+
sibling.quantization ||
|
|
784
|
+
this.extractQuantizationFromTag(sibling.tag || '') ||
|
|
785
|
+
quant
|
|
786
|
+
);
|
|
787
|
+
const siblingSize = this.extractVariantSizeGB(sibling, siblingParams);
|
|
788
|
+
if (!Number.isFinite(sizeByQuant[siblingQuant]) || siblingSize < sizeByQuant[siblingQuant]) {
|
|
789
|
+
sizeByQuant[siblingQuant] = siblingSize;
|
|
790
|
+
}
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
const availableQuantizations = this.getQuantizationCandidates({
|
|
794
|
+
availableQuantizations: this.extractAvailableQuantizations(ollamaModel, variants),
|
|
795
|
+
sizeByQuant
|
|
796
|
+
});
|
|
626
797
|
|
|
627
798
|
const source = ollamaModel.source || 'ollama_database';
|
|
628
799
|
const registry = ollamaModel.registry || 'ollama.com';
|
|
629
800
|
const version = ollamaModel.version || variantTag;
|
|
630
801
|
const license = ollamaModel.license || 'unknown';
|
|
631
802
|
const digest = ollamaModel.digest || 'unknown';
|
|
803
|
+
const normalizedExpertCount = Number.isFinite(moeMetadata.expertCount) && moeMetadata.expertCount > 0
|
|
804
|
+
? Math.round(moeMetadata.expertCount)
|
|
805
|
+
: null;
|
|
806
|
+
const normalizedExpertsActive = Number.isFinite(moeMetadata.expertsActivePerToken) && moeMetadata.expertsActivePerToken > 0
|
|
807
|
+
? moeMetadata.expertsActivePerToken
|
|
808
|
+
: null;
|
|
809
|
+
const normalizedTotalParamsB = Number.isFinite(moeMetadata.totalParamsB) && moeMetadata.totalParamsB > 0
|
|
810
|
+
? moeMetadata.totalParamsB
|
|
811
|
+
: null;
|
|
812
|
+
const normalizedActiveParamsB = Number.isFinite(moeMetadata.activeParamsB) && moeMetadata.activeParamsB > 0
|
|
813
|
+
? moeMetadata.activeParamsB
|
|
814
|
+
: null;
|
|
632
815
|
|
|
633
816
|
return {
|
|
634
817
|
name: variantTag,
|
|
635
818
|
family: this.extractFamily(baseIdentifier),
|
|
636
819
|
paramsB,
|
|
820
|
+
isMoE: Boolean(moeMetadata.isMoE),
|
|
821
|
+
is_moe: Boolean(moeMetadata.isMoE),
|
|
822
|
+
totalParamsB: normalizedTotalParamsB,
|
|
823
|
+
activeParamsB: normalizedActiveParamsB,
|
|
824
|
+
expertCount: normalizedExpertCount,
|
|
825
|
+
expertsActivePerToken: normalizedExpertsActive,
|
|
826
|
+
total_params_b: normalizedTotalParamsB,
|
|
827
|
+
active_params_b: normalizedActiveParamsB,
|
|
828
|
+
expert_count: normalizedExpertCount,
|
|
829
|
+
experts_active_per_token: normalizedExpertsActive,
|
|
637
830
|
ctxMax: contextLength,
|
|
638
831
|
quant,
|
|
639
832
|
sizeGB: variantSizeGB,
|
|
@@ -642,6 +835,9 @@ class DeterministicModelSelector {
|
|
|
642
835
|
model_identifier: variantTag,
|
|
643
836
|
installed: false,
|
|
644
837
|
pulls: ollamaModel.actual_pulls || ollamaModel.pulls || 0,
|
|
838
|
+
availableQuantizations,
|
|
839
|
+
sizeByQuant,
|
|
840
|
+
...freshness,
|
|
645
841
|
source,
|
|
646
842
|
registry,
|
|
647
843
|
version,
|
|
@@ -658,6 +854,134 @@ class DeterministicModelSelector {
|
|
|
658
854
|
});
|
|
659
855
|
}
|
|
660
856
|
|
|
857
|
+
parseBillionsValue(rawValue) {
|
|
858
|
+
return parseMoEBillionsValue(rawValue);
|
|
859
|
+
}
|
|
860
|
+
|
|
861
|
+
parsePositiveNumber(rawValue) {
|
|
862
|
+
return parseMoEPositiveNumber(rawValue);
|
|
863
|
+
}
|
|
864
|
+
|
|
865
|
+
extractMoEMetadata(model = {}, variant = {}, paramsB = null, baseText = '') {
|
|
866
|
+
return extractCanonicalMoEMetadata({
|
|
867
|
+
model,
|
|
868
|
+
variant,
|
|
869
|
+
paramsB,
|
|
870
|
+
baseText
|
|
871
|
+
});
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
parseDateSafe(value) {
|
|
875
|
+
if (!value || typeof value !== 'string') return null;
|
|
876
|
+
const parsed = new Date(value);
|
|
877
|
+
if (Number.isNaN(parsed.getTime())) return null;
|
|
878
|
+
return parsed;
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
extractAvailableQuantizations(model, variants = []) {
|
|
882
|
+
const quantSet = new Set();
|
|
883
|
+
const candidateStrings = [];
|
|
884
|
+
|
|
885
|
+
if (Array.isArray(model?.quantizations)) {
|
|
886
|
+
candidateStrings.push(...model.quantizations);
|
|
887
|
+
}
|
|
888
|
+
if (typeof model?.quantization === 'string') {
|
|
889
|
+
candidateStrings.push(model.quantization);
|
|
890
|
+
}
|
|
891
|
+
for (const variant of variants) {
|
|
892
|
+
if (variant?.quantization) candidateStrings.push(variant.quantization);
|
|
893
|
+
if (variant?.tag) candidateStrings.push(variant.tag);
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
for (const value of candidateStrings) {
|
|
897
|
+
const inferred = this.normalizeQuantization(
|
|
898
|
+
this.extractQuantizationFromTag(String(value)) || String(value)
|
|
899
|
+
);
|
|
900
|
+
if (inferred) quantSet.add(inferred);
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
if (quantSet.size === 0 && model?.quant) {
|
|
904
|
+
quantSet.add(this.normalizeQuantization(model.quant));
|
|
905
|
+
}
|
|
906
|
+
if (quantSet.size === 0) {
|
|
907
|
+
quantSet.add('Q4_K_M');
|
|
908
|
+
}
|
|
909
|
+
|
|
910
|
+
return [...quantSet].sort((a, b) => {
|
|
911
|
+
const aIdx = this.quantHierarchy.indexOf(a);
|
|
912
|
+
const bIdx = this.quantHierarchy.indexOf(b);
|
|
913
|
+
const safeA = aIdx === -1 ? Number.MAX_SAFE_INTEGER : aIdx;
|
|
914
|
+
const safeB = bIdx === -1 ? Number.MAX_SAFE_INTEGER : bIdx;
|
|
915
|
+
return safeA - safeB;
|
|
916
|
+
});
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
computeFreshnessMetadata(model = {}, indexMeta = {}) {
|
|
920
|
+
const dateCandidates = [
|
|
921
|
+
model.last_updated,
|
|
922
|
+
model.lastUpdated,
|
|
923
|
+
model.updated_at,
|
|
924
|
+
model.updatedAt,
|
|
925
|
+
model.release_date,
|
|
926
|
+
model.released_at,
|
|
927
|
+
model.created_at,
|
|
928
|
+
model.detailed_scraped_at
|
|
929
|
+
];
|
|
930
|
+
|
|
931
|
+
const updatedAt = dateCandidates
|
|
932
|
+
.map((value) => this.parseDateSafe(value))
|
|
933
|
+
.find(Boolean);
|
|
934
|
+
|
|
935
|
+
const ageDays = updatedAt
|
|
936
|
+
? Math.max(0, (Date.now() - updatedAt.getTime()) / (1000 * 60 * 60 * 24))
|
|
937
|
+
: null;
|
|
938
|
+
|
|
939
|
+
let freshnessScore = 55; // neutral fallback when timestamp is unknown
|
|
940
|
+
if (Number.isFinite(ageDays)) {
|
|
941
|
+
if (ageDays <= 30) freshnessScore = 100;
|
|
942
|
+
else if (ageDays <= 90) freshnessScore = 90;
|
|
943
|
+
else if (ageDays <= 180) freshnessScore = 75;
|
|
944
|
+
else if (ageDays <= 365) freshnessScore = 60;
|
|
945
|
+
else if (ageDays <= 540) freshnessScore = 40;
|
|
946
|
+
else if (ageDays <= 720) freshnessScore = 25;
|
|
947
|
+
else freshnessScore = 10;
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
const textBlob = [
|
|
951
|
+
model.model_identifier,
|
|
952
|
+
model.model_name,
|
|
953
|
+
model.name,
|
|
954
|
+
model.description,
|
|
955
|
+
model.detailed_description,
|
|
956
|
+
model.status,
|
|
957
|
+
...(Array.isArray(model.tags) ? model.tags : [])
|
|
958
|
+
]
|
|
959
|
+
.filter(Boolean)
|
|
960
|
+
.join(' ')
|
|
961
|
+
.toLowerCase();
|
|
962
|
+
|
|
963
|
+
const isDeprecatedByText =
|
|
964
|
+
/\bdeprecated\b|\bobsolete\b|\blegacy\b|\barchived\b|\breplaced by\b|\buse .+ instead\b/.test(textBlob);
|
|
965
|
+
const isDeprecated = Boolean(model.deprecated || model.is_deprecated || model.archived || isDeprecatedByText);
|
|
966
|
+
const isStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.staleDays;
|
|
967
|
+
const veryStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.veryStaleDays;
|
|
968
|
+
const indexStale = Boolean(indexMeta?.isStale);
|
|
969
|
+
|
|
970
|
+
if (isDeprecated) freshnessScore = Math.min(freshnessScore, 15);
|
|
971
|
+
if (veryStale) freshnessScore = Math.min(freshnessScore, 20);
|
|
972
|
+
if (indexStale && !updatedAt) freshnessScore = Math.max(0, freshnessScore - 10);
|
|
973
|
+
|
|
974
|
+
return {
|
|
975
|
+
lastUpdatedAt: updatedAt ? updatedAt.toISOString() : null,
|
|
976
|
+
modelAgeDays: Number.isFinite(ageDays) ? Math.round(ageDays * 10) / 10 : null,
|
|
977
|
+
freshnessScore,
|
|
978
|
+
isStale,
|
|
979
|
+
isDeprecated,
|
|
980
|
+
indexAgeDays: Number.isFinite(indexMeta?.ageDays) ? indexMeta.ageDays : null,
|
|
981
|
+
indexStale
|
|
982
|
+
};
|
|
983
|
+
}
|
|
984
|
+
|
|
661
985
|
parseContextLength(contextValue) {
|
|
662
986
|
if (typeof contextValue === 'number' && Number.isFinite(contextValue) && contextValue > 0) {
|
|
663
987
|
return Math.round(contextValue);
|
|
@@ -766,7 +1090,7 @@ class DeterministicModelSelector {
|
|
|
766
1090
|
extractFamily(modelName) {
|
|
767
1091
|
const name = modelName.toLowerCase();
|
|
768
1092
|
if (name.includes('qwen2.5')) return 'qwen2.5';
|
|
769
|
-
if (name.includes('qwen3')) return '
|
|
1093
|
+
if (name.includes('qwen3')) return 'qwen3';
|
|
770
1094
|
if (name.includes('qwen')) return 'qwen2.5';
|
|
771
1095
|
if (name.includes('deepseek')) return 'deepseek';
|
|
772
1096
|
if (name.includes('llama3.2') || name.includes('llama3.3')) return 'llama3.2';
|
|
@@ -889,13 +1213,22 @@ class DeterministicModelSelector {
|
|
|
889
1213
|
topN = 5,
|
|
890
1214
|
enableProbe = false,
|
|
891
1215
|
silent = false,
|
|
1216
|
+
optimizeFor = 'balanced',
|
|
1217
|
+
runtime = 'ollama',
|
|
892
1218
|
hardware: providedHardware = null,
|
|
893
1219
|
installedModels = null,
|
|
894
1220
|
modelPool = null
|
|
895
1221
|
} = options;
|
|
1222
|
+
const normalizedRuntime = normalizeMoERuntime(runtime);
|
|
1223
|
+
const optimizationObjective = this.normalizeOptimizationObjective(
|
|
1224
|
+
options.optimize || options.objective || optimizeFor
|
|
1225
|
+
);
|
|
896
1226
|
|
|
897
1227
|
if (!silent) {
|
|
898
1228
|
console.log(`🔍 Selecting models for category: ${category}`);
|
|
1229
|
+
if (optimizationObjective !== 'balanced') {
|
|
1230
|
+
console.log(`⚙️ Optimization profile: ${optimizationObjective}`);
|
|
1231
|
+
}
|
|
899
1232
|
}
|
|
900
1233
|
|
|
901
1234
|
// Phase 0: Gather data
|
|
@@ -931,7 +1264,15 @@ class DeterministicModelSelector {
|
|
|
931
1264
|
const budget = isUnified ? usableMem : (vram || usableMem);
|
|
932
1265
|
|
|
933
1266
|
for (const model of filtered) {
|
|
934
|
-
const result = this.evaluateModel(
|
|
1267
|
+
const result = this.evaluateModel(
|
|
1268
|
+
model,
|
|
1269
|
+
hardware,
|
|
1270
|
+
category,
|
|
1271
|
+
targetCtx,
|
|
1272
|
+
budget,
|
|
1273
|
+
optimizationObjective,
|
|
1274
|
+
normalizedRuntime
|
|
1275
|
+
);
|
|
935
1276
|
if (result) {
|
|
936
1277
|
candidates.push(result);
|
|
937
1278
|
}
|
|
@@ -939,7 +1280,14 @@ class DeterministicModelSelector {
|
|
|
939
1280
|
|
|
940
1281
|
// Sort by score
|
|
941
1282
|
candidates.sort((a, b) => b.score - a.score);
|
|
942
|
-
|
|
1283
|
+
let topCandidates = candidates.slice(0, topN);
|
|
1284
|
+
topCandidates = this.ensureFeasibleMidTierCoverage(
|
|
1285
|
+
topCandidates,
|
|
1286
|
+
candidates,
|
|
1287
|
+
category,
|
|
1288
|
+
hardware,
|
|
1289
|
+
optimizationObjective
|
|
1290
|
+
);
|
|
943
1291
|
|
|
944
1292
|
if (!silent) {
|
|
945
1293
|
console.log(`✨ Selected ${topCandidates.length} top candidates`);
|
|
@@ -957,6 +1305,8 @@ class DeterministicModelSelector {
|
|
|
957
1305
|
|
|
958
1306
|
return {
|
|
959
1307
|
category,
|
|
1308
|
+
optimizeFor: optimizationObjective,
|
|
1309
|
+
runtime: normalizedRuntime,
|
|
960
1310
|
hardware,
|
|
961
1311
|
candidates: topCandidates,
|
|
962
1312
|
total_evaluated: filtered.length,
|
|
@@ -1008,42 +1358,115 @@ class DeterministicModelSelector {
|
|
|
1008
1358
|
});
|
|
1009
1359
|
}
|
|
1010
1360
|
|
|
1011
|
-
evaluateModel(model, hardware, category, targetCtx, budget) {
|
|
1361
|
+
evaluateModel(model, hardware, category, targetCtx, budget, optimizeFor = 'balanced', runtime = 'ollama') {
|
|
1012
1362
|
// 1. Select best fitting quantization
|
|
1013
1363
|
const bestQuant = this.selectBestQuantization(model, budget, targetCtx);
|
|
1014
1364
|
if (!bestQuant) return null;
|
|
1015
1365
|
|
|
1016
1366
|
// 2. Calculate required memory
|
|
1017
|
-
const
|
|
1367
|
+
const memoryEstimate = this.estimateMemoryBreakdown(model, bestQuant.quant, targetCtx);
|
|
1368
|
+
const requiredGB = memoryEstimate.requiredGB;
|
|
1018
1369
|
if (requiredGB > budget) return null;
|
|
1019
1370
|
|
|
1020
1371
|
// 3. Calculate component scores
|
|
1021
1372
|
const Q = this.calculateQualityPrior(model, bestQuant.quant, category);
|
|
1022
|
-
const
|
|
1373
|
+
const speedEstimate = this.estimateSpeedProfile(hardware, model, bestQuant.quant, category, runtime);
|
|
1374
|
+
const S = speedEstimate.score;
|
|
1023
1375
|
const F = this.calculateFitScore(requiredGB, budget);
|
|
1024
1376
|
const C = this.calculateContextScore(model, targetCtx);
|
|
1025
1377
|
|
|
1026
1378
|
// 4. Calculate final weighted score
|
|
1027
|
-
const weights = this.
|
|
1379
|
+
const weights = this.getScoringWeights(category, optimizeFor);
|
|
1028
1380
|
const score = Math.round((Q * weights[0] + S * weights[1] + F * weights[2] + C * weights[3]) * 10) / 10;
|
|
1029
1381
|
|
|
1030
1382
|
// 5. Build rationale
|
|
1031
|
-
const rationale = this.buildRationale(
|
|
1383
|
+
const rationale = this.buildRationale(
|
|
1384
|
+
hardware,
|
|
1385
|
+
model,
|
|
1386
|
+
bestQuant.quant,
|
|
1387
|
+
requiredGB,
|
|
1388
|
+
budget,
|
|
1389
|
+
category,
|
|
1390
|
+
Q,
|
|
1391
|
+
S,
|
|
1392
|
+
memoryEstimate,
|
|
1393
|
+
speedEstimate
|
|
1394
|
+
);
|
|
1032
1395
|
|
|
1033
1396
|
return {
|
|
1034
1397
|
meta: model,
|
|
1035
1398
|
quant: bestQuant.quant,
|
|
1036
1399
|
requiredGB: Math.round(requiredGB * 10) / 10,
|
|
1037
|
-
estTPS:
|
|
1400
|
+
estTPS: speedEstimate.estimatedTPS,
|
|
1038
1401
|
score,
|
|
1402
|
+
runtime: speedEstimate.runtime,
|
|
1039
1403
|
rationale,
|
|
1404
|
+
memory: {
|
|
1405
|
+
modelMemGB: Math.round(memoryEstimate.modelMemGB * 100) / 100,
|
|
1406
|
+
kvCacheGB: Math.round(memoryEstimate.kvCacheGB * 100) / 100,
|
|
1407
|
+
runtimeOverheadGB: Math.round(memoryEstimate.runtimeOverheadGB * 100) / 100,
|
|
1408
|
+
memorySource: memoryEstimate.memorySource,
|
|
1409
|
+
assumptionSource: memoryEstimate.parameterProfile.assumptionSource,
|
|
1410
|
+
isMoE: memoryEstimate.parameterProfile.isMoE,
|
|
1411
|
+
effectiveParamsB: Math.round(memoryEstimate.parameterProfile.effectiveParamsB * 1000) / 1000
|
|
1412
|
+
},
|
|
1413
|
+
speed: {
|
|
1414
|
+
backend: speedEstimate.backend,
|
|
1415
|
+
targetTPS: speedEstimate.targetTPS,
|
|
1416
|
+
estimatedTPS: speedEstimate.estimatedTPS,
|
|
1417
|
+
runtime: speedEstimate.runtime,
|
|
1418
|
+
moe: speedEstimate.moe
|
|
1419
|
+
},
|
|
1040
1420
|
components: { Q, S, F, C }
|
|
1041
1421
|
};
|
|
1042
1422
|
}
|
|
1043
1423
|
|
|
1424
|
+
getQuantizationCandidates(model) {
|
|
1425
|
+
const normalizedAvailable = Array.isArray(model?.availableQuantizations)
|
|
1426
|
+
? model.availableQuantizations.map((quant) => this.normalizeQuantization(quant))
|
|
1427
|
+
: [];
|
|
1428
|
+
const fromSizeMap = model?.sizeByQuant && typeof model.sizeByQuant === 'object'
|
|
1429
|
+
? Object.keys(model.sizeByQuant).map((quant) => this.normalizeQuantization(quant))
|
|
1430
|
+
: [];
|
|
1431
|
+
|
|
1432
|
+
const seeded = (fromSizeMap.length > 0
|
|
1433
|
+
? [...new Set(fromSizeMap)]
|
|
1434
|
+
: [...new Set(normalizedAvailable)])
|
|
1435
|
+
.filter(Boolean);
|
|
1436
|
+
|
|
1437
|
+
let candidates = seeded.length > 0 ? seeded : [...this.quantHierarchy];
|
|
1438
|
+
|
|
1439
|
+
// If we have at least one known quantization, allow extrapolating to
|
|
1440
|
+
// *more compressed* levels as an explicit feasibility assumption.
|
|
1441
|
+
if (seeded.length > 0) {
|
|
1442
|
+
const expanded = new Set();
|
|
1443
|
+
for (const quant of seeded) {
|
|
1444
|
+
const idx = this.quantHierarchy.indexOf(quant);
|
|
1445
|
+
if (idx === -1) {
|
|
1446
|
+
expanded.add(quant);
|
|
1447
|
+
continue;
|
|
1448
|
+
}
|
|
1449
|
+
for (let i = idx; i < this.quantHierarchy.length; i++) {
|
|
1450
|
+
expanded.add(this.quantHierarchy[i]);
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
candidates = [...expanded];
|
|
1454
|
+
}
|
|
1455
|
+
|
|
1456
|
+
return candidates.sort((a, b) => {
|
|
1457
|
+
const aIdx = this.quantHierarchy.indexOf(a);
|
|
1458
|
+
const bIdx = this.quantHierarchy.indexOf(b);
|
|
1459
|
+
const safeA = aIdx === -1 ? Number.MAX_SAFE_INTEGER : aIdx;
|
|
1460
|
+
const safeB = bIdx === -1 ? Number.MAX_SAFE_INTEGER : bIdx;
|
|
1461
|
+
return safeA - safeB;
|
|
1462
|
+
});
|
|
1463
|
+
}
|
|
1464
|
+
|
|
1044
1465
|
selectBestQuantization(model, budget, targetCtx) {
|
|
1466
|
+
const quantizationCandidates = this.getQuantizationCandidates(model);
|
|
1467
|
+
|
|
1045
1468
|
// Try quantizations from best to worst quality
|
|
1046
|
-
for (const quant of
|
|
1469
|
+
for (const quant of quantizationCandidates) {
|
|
1047
1470
|
const requiredGB = this.estimateRequiredGB(model, quant, targetCtx);
|
|
1048
1471
|
if (requiredGB <= budget) {
|
|
1049
1472
|
return { quant, sizeGB: requiredGB };
|
|
@@ -1053,7 +1476,7 @@ class DeterministicModelSelector {
|
|
|
1053
1476
|
// If nothing fits at target context, try halving context once
|
|
1054
1477
|
const halfCtx = Math.floor(targetCtx / 2);
|
|
1055
1478
|
if (halfCtx >= 1024) {
|
|
1056
|
-
for (const quant of
|
|
1479
|
+
for (const quant of quantizationCandidates) {
|
|
1057
1480
|
const requiredGB = this.estimateRequiredGB(model, quant, halfCtx);
|
|
1058
1481
|
if (requiredGB <= budget) {
|
|
1059
1482
|
return { quant, sizeGB: requiredGB };
|
|
@@ -1064,7 +1487,11 @@ class DeterministicModelSelector {
|
|
|
1064
1487
|
return null; // Model doesn't fit
|
|
1065
1488
|
}
|
|
1066
1489
|
|
|
1067
|
-
|
|
1490
|
+
resolveMemoryParameterProfile(model = {}) {
|
|
1491
|
+
return resolveMoEParameterProfile(model);
|
|
1492
|
+
}
|
|
1493
|
+
|
|
1494
|
+
estimateMemoryBreakdown(model, quant, ctx) {
|
|
1068
1495
|
// Bytes per parameter by quantization level (calibrated to real Ollama sizes)
|
|
1069
1496
|
// 7B Q4_K_M=~4.5GB, 14B Q4_K_M=~9GB, 32B Q4_K_M=~19GB
|
|
1070
1497
|
const bytesPerParam = {
|
|
@@ -1075,17 +1502,54 @@ class DeterministicModelSelector {
|
|
|
1075
1502
|
'Q3_K': 0.48,
|
|
1076
1503
|
'Q2_K': 0.37
|
|
1077
1504
|
};
|
|
1078
|
-
const
|
|
1079
|
-
const
|
|
1505
|
+
const normalizedQuant = this.normalizeQuantization(quant);
|
|
1506
|
+
const bpp = bytesPerParam[normalizedQuant] || 0.63;
|
|
1507
|
+
const sizeByQuant = model?.sizeByQuant && typeof model.sizeByQuant === 'object' ? model.sizeByQuant : {};
|
|
1508
|
+
const observedFromSizeMap = Number(sizeByQuant[normalizedQuant]);
|
|
1509
|
+
const directVariantMatch =
|
|
1510
|
+
this.normalizeQuantization(model?.quant || '') === normalizedQuant
|
|
1511
|
+
? Number(model?.sizeGB ?? model?.size)
|
|
1512
|
+
: NaN;
|
|
1513
|
+
|
|
1514
|
+
const observedWeightGB = Number.isFinite(observedFromSizeMap) && observedFromSizeMap > 0
|
|
1515
|
+
? observedFromSizeMap
|
|
1516
|
+
: (Number.isFinite(directVariantMatch) && directVariantMatch > 0 ? directVariantMatch : null);
|
|
1517
|
+
|
|
1518
|
+
const parameterProfile = this.resolveMemoryParameterProfile(model);
|
|
1519
|
+
const modeledWeightGB = parameterProfile.effectiveParamsB * bpp;
|
|
1520
|
+
const preferSparseInferenceParams =
|
|
1521
|
+
parameterProfile.isMoE &&
|
|
1522
|
+
(parameterProfile.assumptionSource === 'moe_active_metadata' ||
|
|
1523
|
+
parameterProfile.assumptionSource === 'moe_derived_expert_ratio');
|
|
1524
|
+
const useObservedArtifactSize =
|
|
1525
|
+
!preferSparseInferenceParams &&
|
|
1526
|
+
Number.isFinite(observedWeightGB) &&
|
|
1527
|
+
observedWeightGB > 0;
|
|
1528
|
+
const modelMemGB = useObservedArtifactSize ? observedWeightGB : modeledWeightGB;
|
|
1529
|
+
const effectiveCtx = Number.isFinite(Number(ctx)) && Number(ctx) > 0 ? Number(ctx) : 4096;
|
|
1080
1530
|
|
|
1081
1531
|
// KV cache: ~2 * numLayers * hiddenDim * 2bytes * ctx / 1e9
|
|
1082
1532
|
// Simplified: ~0.000008 GB per billion params per context token
|
|
1083
|
-
const kvCacheGB = 0.000008 *
|
|
1533
|
+
const kvCacheGB = 0.000008 * parameterProfile.effectiveParamsB * effectiveCtx;
|
|
1084
1534
|
|
|
1085
1535
|
// Runtime overhead (Metal/CUDA context, buffers)
|
|
1086
|
-
const runtimeOverhead = 0.5;
|
|
1536
|
+
const runtimeOverhead = useObservedArtifactSize ? 0.35 : 0.5;
|
|
1537
|
+
const memorySource = useObservedArtifactSize
|
|
1538
|
+
? 'observed_artifact_size'
|
|
1539
|
+
: (preferSparseInferenceParams ? 'moe_sparse_inference_params' : 'estimated_from_params');
|
|
1087
1540
|
|
|
1088
|
-
return
|
|
1541
|
+
return {
|
|
1542
|
+
parameterProfile,
|
|
1543
|
+
memorySource,
|
|
1544
|
+
modelMemGB,
|
|
1545
|
+
kvCacheGB,
|
|
1546
|
+
runtimeOverheadGB: runtimeOverhead,
|
|
1547
|
+
requiredGB: modelMemGB + kvCacheGB + runtimeOverhead
|
|
1548
|
+
};
|
|
1549
|
+
}
|
|
1550
|
+
|
|
1551
|
+
estimateRequiredGB(model, quant, ctx) {
|
|
1552
|
+
return this.estimateMemoryBreakdown(model, quant, ctx).requiredGB;
|
|
1089
1553
|
}
|
|
1090
1554
|
|
|
1091
1555
|
calculateQualityPrior(model, quant, category) {
|
|
@@ -1099,6 +1563,10 @@ class DeterministicModelSelector {
|
|
|
1099
1563
|
// Quantization penalty
|
|
1100
1564
|
const quantPenalty = this.quantPenalties[quant] || -5;
|
|
1101
1565
|
Q += quantPenalty;
|
|
1566
|
+
|
|
1567
|
+
// Freshness/deprecation adjustment
|
|
1568
|
+
const freshnessAdjustment = this.calculateFreshnessAdjustment(model);
|
|
1569
|
+
Q += freshnessAdjustment;
|
|
1102
1570
|
|
|
1103
1571
|
// Task alignment bump
|
|
1104
1572
|
const taskBump = this.getTaskAlignmentBump(model, category);
|
|
@@ -1155,7 +1623,28 @@ class DeterministicModelSelector {
|
|
|
1155
1623
|
}
|
|
1156
1624
|
}
|
|
1157
1625
|
|
|
1158
|
-
|
|
1626
|
+
calculateFreshnessAdjustment(model = {}) {
|
|
1627
|
+
const freshnessScore = Number.isFinite(model.freshnessScore) ? model.freshnessScore : 55;
|
|
1628
|
+
const ageDays = Number.isFinite(model.modelAgeDays) ? model.modelAgeDays : null;
|
|
1629
|
+
const isDeprecated = Boolean(model.isDeprecated);
|
|
1630
|
+
const isStale = Boolean(model.isStale);
|
|
1631
|
+
|
|
1632
|
+
if (isDeprecated) return -12;
|
|
1633
|
+
if (ageDays !== null && ageDays > this.freshnessThresholds.veryStaleDays) return -8;
|
|
1634
|
+
if (ageDays !== null && ageDays > this.freshnessThresholds.staleDays) return -4;
|
|
1635
|
+
if (isStale) return -3;
|
|
1636
|
+
if (freshnessScore >= 90) return 3;
|
|
1637
|
+
if (freshnessScore >= 75) return 2;
|
|
1638
|
+
if (freshnessScore >= 60) return 1;
|
|
1639
|
+
if (freshnessScore <= 25) return -4;
|
|
1640
|
+
return 0;
|
|
1641
|
+
}
|
|
1642
|
+
|
|
1643
|
+
estimateSpeed(hardware, model, quant, category, runtime = 'ollama') {
|
|
1644
|
+
return this.estimateSpeedProfile(hardware, model, quant, category, runtime).score;
|
|
1645
|
+
}
|
|
1646
|
+
|
|
1647
|
+
estimateSpeedProfile(hardware, model, quant, category, runtime = 'ollama') {
|
|
1159
1648
|
// Determine backend
|
|
1160
1649
|
let backend = 'cpu_x86';
|
|
1161
1650
|
if (hardware.acceleration.supports_metal) backend = 'metal';
|
|
@@ -1164,7 +1653,14 @@ class DeterministicModelSelector {
|
|
|
1164
1653
|
|
|
1165
1654
|
// Base speed calculation
|
|
1166
1655
|
const K = this.backendK[backend];
|
|
1167
|
-
|
|
1656
|
+
const denseParamsB = Number.isFinite(this.parseBillionsValue(model.paramsB))
|
|
1657
|
+
? this.parseBillionsValue(model.paramsB)
|
|
1658
|
+
: 1;
|
|
1659
|
+
const parameterProfile = this.resolveMemoryParameterProfile(model);
|
|
1660
|
+
const effectiveParamsB = Number.isFinite(parameterProfile.effectiveParamsB) && parameterProfile.effectiveParamsB > 0
|
|
1661
|
+
? parameterProfile.effectiveParamsB
|
|
1662
|
+
: denseParamsB;
|
|
1663
|
+
let base = K / effectiveParamsB;
|
|
1168
1664
|
|
|
1169
1665
|
// Quantization multiplier
|
|
1170
1666
|
const quantMultiplier = this.quantSpeedMultipliers[quant] || 1.0;
|
|
@@ -1173,10 +1669,31 @@ class DeterministicModelSelector {
|
|
|
1173
1669
|
// Threading multiplier
|
|
1174
1670
|
if (hardware.cpu.cores >= 8) base *= 1.1;
|
|
1175
1671
|
if (hardware.acceleration.supports_metal || hardware.acceleration.supports_cuda) base *= 1.2;
|
|
1672
|
+
|
|
1673
|
+
const normalizedRuntime = normalizeMoERuntime(runtime);
|
|
1674
|
+
const moe = estimateMoESpeedMultiplier({
|
|
1675
|
+
model,
|
|
1676
|
+
runtime: normalizedRuntime,
|
|
1677
|
+
denseParamsB,
|
|
1678
|
+
parameterProfile
|
|
1679
|
+
});
|
|
1680
|
+
if (moe.applied) {
|
|
1681
|
+
base *= moe.multiplier;
|
|
1682
|
+
}
|
|
1176
1683
|
|
|
1177
1684
|
// Normalize to 0-100 score
|
|
1178
1685
|
const target = this.targetSpeeds[category] || this.targetSpeeds.general;
|
|
1179
|
-
|
|
1686
|
+
const estimatedTPS = Math.max(1, Math.round(base * 10) / 10);
|
|
1687
|
+
const score = Math.min(100, Math.round((100 * estimatedTPS / target) * 10) / 10);
|
|
1688
|
+
|
|
1689
|
+
return {
|
|
1690
|
+
backend,
|
|
1691
|
+
targetTPS: target,
|
|
1692
|
+
estimatedTPS,
|
|
1693
|
+
score,
|
|
1694
|
+
runtime: normalizedRuntime,
|
|
1695
|
+
moe
|
|
1696
|
+
};
|
|
1180
1697
|
}
|
|
1181
1698
|
|
|
1182
1699
|
calculateFitScore(requiredGB, budgetGB) {
|
|
@@ -1192,7 +1709,104 @@ class DeterministicModelSelector {
|
|
|
1192
1709
|
return 0; // Should be filtered out earlier
|
|
1193
1710
|
}
|
|
1194
1711
|
|
|
1195
|
-
|
|
1712
|
+
estimatePracticalMaxParamsForBudget(budgetGB) {
|
|
1713
|
+
if (!Number.isFinite(budgetGB) || budgetGB <= 0) return 4;
|
|
1714
|
+
if (budgetGB >= 80) return 70;
|
|
1715
|
+
if (budgetGB >= 48) return 46;
|
|
1716
|
+
if (budgetGB >= 32) return 30;
|
|
1717
|
+
if (budgetGB >= 24) return 14;
|
|
1718
|
+
if (budgetGB >= 16) return 8;
|
|
1719
|
+
return 4;
|
|
1720
|
+
}
|
|
1721
|
+
|
|
1722
|
+
ensureFeasibleMidTierCoverage(selectedCandidates, allCandidates, category, hardware, optimizeFor = 'balanced') {
|
|
1723
|
+
if (!Array.isArray(selectedCandidates) || selectedCandidates.length === 0) {
|
|
1724
|
+
return selectedCandidates;
|
|
1725
|
+
}
|
|
1726
|
+
|
|
1727
|
+
const objective = this.normalizeOptimizationObjective(optimizeFor);
|
|
1728
|
+
if (objective === 'speed') {
|
|
1729
|
+
return selectedCandidates;
|
|
1730
|
+
}
|
|
1731
|
+
|
|
1732
|
+
const enforceCategories = new Set(['general', 'talking', 'reading', 'coding', 'reasoning', 'multimodal']);
|
|
1733
|
+
if (!enforceCategories.has(category)) {
|
|
1734
|
+
return selectedCandidates;
|
|
1735
|
+
}
|
|
1736
|
+
|
|
1737
|
+
const normalizedHardware = this.normalizeHardwareProfile(hardware || {});
|
|
1738
|
+
const budget = normalizedHardware.gpu.unified
|
|
1739
|
+
? normalizedHardware.usableMemGB
|
|
1740
|
+
: (normalizedHardware.gpu.vramGB || normalizedHardware.usableMemGB);
|
|
1741
|
+
|
|
1742
|
+
if (!Number.isFinite(budget) || budget < 16) {
|
|
1743
|
+
return selectedCandidates;
|
|
1744
|
+
}
|
|
1745
|
+
|
|
1746
|
+
const candidatePool = Array.isArray(allCandidates) && allCandidates.length > 0
|
|
1747
|
+
? allCandidates
|
|
1748
|
+
: selectedCandidates;
|
|
1749
|
+
let promoted = [...selectedCandidates];
|
|
1750
|
+
|
|
1751
|
+
const minMidTierParams = budget >= 24 ? 7 : 6;
|
|
1752
|
+
const alreadyHasMidTier = promoted.some((candidate) => (candidate?.meta?.paramsB || 0) >= minMidTierParams);
|
|
1753
|
+
if (!alreadyHasMidTier) {
|
|
1754
|
+
const practicalSpeedFloor = normalizedHardware.gpu.unified ? 25 : 20;
|
|
1755
|
+
const feasibleMidTier = candidatePool.find((candidate) => {
|
|
1756
|
+
const params = candidate?.meta?.paramsB || 0;
|
|
1757
|
+
const speedScore = candidate?.components?.S ?? candidate?.estTPS ?? 0;
|
|
1758
|
+
return params >= minMidTierParams && speedScore >= practicalSpeedFloor;
|
|
1759
|
+
});
|
|
1760
|
+
|
|
1761
|
+
if (
|
|
1762
|
+
feasibleMidTier &&
|
|
1763
|
+
!promoted.some((candidate) => candidate?.meta?.model_identifier === feasibleMidTier?.meta?.model_identifier)
|
|
1764
|
+
) {
|
|
1765
|
+
promoted[promoted.length - 1] = feasibleMidTier;
|
|
1766
|
+
promoted.sort((a, b) => b.score - a.score);
|
|
1767
|
+
}
|
|
1768
|
+
}
|
|
1769
|
+
|
|
1770
|
+
const practicalMaxParams = this.estimatePracticalMaxParamsForBudget(budget);
|
|
1771
|
+
const shouldEnforceThirtyBCoverage =
|
|
1772
|
+
Boolean(normalizedHardware?.gpu?.isMultiGPU) &&
|
|
1773
|
+
!Boolean(normalizedHardware?.gpu?.unified) &&
|
|
1774
|
+
practicalMaxParams >= 30;
|
|
1775
|
+
|
|
1776
|
+
if (!shouldEnforceThirtyBCoverage || objective === 'speed') {
|
|
1777
|
+
return promoted;
|
|
1778
|
+
}
|
|
1779
|
+
|
|
1780
|
+
const alreadyHasThirtyB = promoted.some((candidate) => (candidate?.meta?.paramsB || 0) >= 30);
|
|
1781
|
+
if (alreadyHasThirtyB) {
|
|
1782
|
+
return promoted;
|
|
1783
|
+
}
|
|
1784
|
+
|
|
1785
|
+
const largeModelSpeedFloor = Math.max(
|
|
1786
|
+
8,
|
|
1787
|
+
Math.round((this.targetSpeeds[category] || this.targetSpeeds.general) * 0.2)
|
|
1788
|
+
);
|
|
1789
|
+
const feasibleThirtyB = candidatePool.find((candidate) => {
|
|
1790
|
+
const params = candidate?.meta?.paramsB || 0;
|
|
1791
|
+
const estTPS = candidate?.estTPS ?? candidate?.speed?.estimatedTPS ?? 0;
|
|
1792
|
+
return params >= 30 && estTPS >= largeModelSpeedFloor;
|
|
1793
|
+
});
|
|
1794
|
+
|
|
1795
|
+
if (!feasibleThirtyB) {
|
|
1796
|
+
return promoted;
|
|
1797
|
+
}
|
|
1798
|
+
|
|
1799
|
+
if (promoted.some((candidate) => candidate?.meta?.model_identifier === feasibleThirtyB?.meta?.model_identifier)) {
|
|
1800
|
+
return promoted;
|
|
1801
|
+
}
|
|
1802
|
+
|
|
1803
|
+
const highCapacityPromoted = [...promoted];
|
|
1804
|
+
highCapacityPromoted[highCapacityPromoted.length - 1] = feasibleThirtyB;
|
|
1805
|
+
highCapacityPromoted.sort((a, b) => b.score - a.score);
|
|
1806
|
+
return highCapacityPromoted;
|
|
1807
|
+
}
|
|
1808
|
+
|
|
1809
|
+
buildRationale(hardware, model, quant, requiredGB, budget, category, Q, S, memoryEstimate = null, speedEstimate = null) {
|
|
1196
1810
|
const parts = [];
|
|
1197
1811
|
|
|
1198
1812
|
// Memory fit
|
|
@@ -1204,6 +1818,27 @@ class DeterministicModelSelector {
|
|
|
1204
1818
|
// Special attributes
|
|
1205
1819
|
if (model.tags.includes('coder')) parts.push('coder-tuned');
|
|
1206
1820
|
if (model.modalities.includes('vision')) parts.push('vision-capable');
|
|
1821
|
+
if (model.isDeprecated) parts.push('deprecated penalized');
|
|
1822
|
+
else if (model.isStale) parts.push('stale penalized');
|
|
1823
|
+
else if (model.freshnessScore >= 90) parts.push('fresh release');
|
|
1824
|
+
|
|
1825
|
+
const memoryProfile = memoryEstimate?.parameterProfile;
|
|
1826
|
+
if (memoryProfile?.isMoE) {
|
|
1827
|
+
const assumptionLabels = {
|
|
1828
|
+
moe_active_metadata: 'MoE active params',
|
|
1829
|
+
moe_derived_expert_ratio: 'MoE derived active ratio',
|
|
1830
|
+
moe_fallback_total_params: 'MoE fallback total params',
|
|
1831
|
+
moe_fallback_model_params: 'MoE fallback model params',
|
|
1832
|
+
moe_fallback_default: 'MoE fallback default'
|
|
1833
|
+
};
|
|
1834
|
+
parts.push(assumptionLabels[memoryProfile.assumptionSource] || memoryProfile.assumptionSource);
|
|
1835
|
+
}
|
|
1836
|
+
|
|
1837
|
+
if (speedEstimate?.moe?.applied) {
|
|
1838
|
+
const runtimeLabel = speedEstimate.runtime || 'ollama';
|
|
1839
|
+
const multiplier = Number(speedEstimate.moe.multiplier || 1).toFixed(2);
|
|
1840
|
+
parts.push(`MoE speed x${multiplier} (${runtimeLabel})`);
|
|
1841
|
+
}
|
|
1207
1842
|
|
|
1208
1843
|
// Size sweet spot
|
|
1209
1844
|
if (model.paramsB >= 7 && model.paramsB <= 13) {
|
|
@@ -1380,6 +2015,16 @@ class DeterministicModelSelector {
|
|
|
1380
2015
|
quantization: candidate.quant,
|
|
1381
2016
|
estimatedRAM: candidate.requiredGB,
|
|
1382
2017
|
reasoning: candidate.rationale,
|
|
2018
|
+
runtime: candidate.runtime || candidate.speed?.runtime || 'ollama',
|
|
2019
|
+
memoryAssumptionSource: candidate.memory?.assumptionSource || 'dense_params',
|
|
2020
|
+
speedAssumptions: candidate.speed?.moe ? {
|
|
2021
|
+
applied: Boolean(candidate.speed.moe.applied),
|
|
2022
|
+
runtime: candidate.speed.runtime || candidate.runtime || 'ollama',
|
|
2023
|
+
multiplier: Number.isFinite(candidate.speed.moe.multiplier) ? candidate.speed.moe.multiplier : 1,
|
|
2024
|
+
theoreticalSpeedup: Number.isFinite(candidate.speed.moe.theoreticalSpeedup) ? candidate.speed.moe.theoreticalSpeedup : 1,
|
|
2025
|
+
overheadMultiplier: Number.isFinite(candidate.speed.moe.overheadMultiplier) ? candidate.speed.moe.overheadMultiplier : 1,
|
|
2026
|
+
assumptionSource: candidate.speed.moe.assumptionSource || candidate.memory?.assumptionSource || 'dense_params'
|
|
2027
|
+
} : null,
|
|
1383
2028
|
source: provenance.source,
|
|
1384
2029
|
registry: provenance.registry,
|
|
1385
2030
|
version: provenance.version,
|
|
@@ -1410,9 +2055,23 @@ class DeterministicModelSelector {
|
|
|
1410
2055
|
cores = 4;
|
|
1411
2056
|
}
|
|
1412
2057
|
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
2058
|
+
const gpu = hardware?.gpu || {};
|
|
2059
|
+
const gpuCount =
|
|
2060
|
+
(Number.isFinite(Number(gpu.gpuCount)) ? Number(gpu.gpuCount) : null) ??
|
|
2061
|
+
(Number.isFinite(Number(hardware?.gpuCount)) ? Number(hardware.gpuCount) : null) ??
|
|
2062
|
+
1;
|
|
2063
|
+
const totalVRAM =
|
|
2064
|
+
(Number.isFinite(Number(gpu.vramGB)) ? Number(gpu.vramGB) : null) ??
|
|
2065
|
+
(Number.isFinite(Number(gpu.vram)) ? Number(gpu.vram) : null) ??
|
|
2066
|
+
(Number.isFinite(Number(gpu.totalVRAM)) ? Number(gpu.totalVRAM) : null) ??
|
|
2067
|
+
0;
|
|
2068
|
+
const unifiedGPU = Boolean(gpu.unified) || gpu.type === 'apple_silicon';
|
|
2069
|
+
const effectiveAcceleratorMem = unifiedGPU ? Math.max(totalVRAM, ram) : totalVRAM;
|
|
2070
|
+
|
|
2071
|
+
if (effectiveAcceleratorMem >= 80 || (ram >= 64 && cores >= 16)) return 'extreme';
|
|
2072
|
+
if (effectiveAcceleratorMem >= 48 || (ram >= 32 && cores >= 12)) return 'very_high';
|
|
2073
|
+
if (effectiveAcceleratorMem >= 24 || (ram >= 16 && cores >= 8)) return 'high';
|
|
2074
|
+
if (gpuCount >= 2 && effectiveAcceleratorMem >= 20) return 'high';
|
|
1416
2075
|
if (ram >= 8 && cores >= 4) return 'medium';
|
|
1417
2076
|
return 'low';
|
|
1418
2077
|
}
|
|
@@ -1451,12 +2110,16 @@ class DeterministicModelSelector {
|
|
|
1451
2110
|
/**
|
|
1452
2111
|
* Generate recommendations by category (main API, replaces EnhancedModelSelector)
|
|
1453
2112
|
*/
|
|
1454
|
-
async getBestModelsForHardware(hardware, allModels) {
|
|
2113
|
+
async getBestModelsForHardware(hardware, allModels, options = {}) {
|
|
1455
2114
|
const categories = ['coding', 'reasoning', 'multimodal', 'creative', 'talking', 'reading', 'general'];
|
|
1456
2115
|
const recommendations = {};
|
|
1457
2116
|
const normalizedPool = this.normalizeExternalModels(Array.isArray(allModels) ? allModels : []);
|
|
1458
2117
|
const installedModels = await this.getInstalledModels();
|
|
1459
2118
|
const normalizedHardware = this.normalizeHardwareProfile(hardware || await this.getHardware());
|
|
2119
|
+
const runtime = normalizeMoERuntime(options.runtime || 'ollama');
|
|
2120
|
+
const optimizationObjective = this.normalizeOptimizationObjective(
|
|
2121
|
+
options.optimizeFor || options.optimize || options.objective
|
|
2122
|
+
);
|
|
1460
2123
|
|
|
1461
2124
|
for (const category of categories) {
|
|
1462
2125
|
try {
|
|
@@ -1464,6 +2127,8 @@ class DeterministicModelSelector {
|
|
|
1464
2127
|
topN: 3,
|
|
1465
2128
|
enableProbe: false,
|
|
1466
2129
|
silent: true,
|
|
2130
|
+
optimizeFor: optimizationObjective,
|
|
2131
|
+
runtime,
|
|
1467
2132
|
hardware: normalizedHardware,
|
|
1468
2133
|
installedModels,
|
|
1469
2134
|
modelPool: normalizedPool
|
|
@@ -1471,6 +2136,8 @@ class DeterministicModelSelector {
|
|
|
1471
2136
|
|
|
1472
2137
|
recommendations[category] = {
|
|
1473
2138
|
tier: this.mapHardwareTier(normalizedHardware),
|
|
2139
|
+
optimizeFor: optimizationObjective,
|
|
2140
|
+
runtime,
|
|
1474
2141
|
bestModels: result.candidates.map(candidate => this.mapCandidateToLegacyFormat(candidate)),
|
|
1475
2142
|
totalEvaluated: result.total_evaluated,
|
|
1476
2143
|
category: this.getCategoryInfo(category)
|
|
@@ -1478,6 +2145,8 @@ class DeterministicModelSelector {
|
|
|
1478
2145
|
} catch (error) {
|
|
1479
2146
|
recommendations[category] = {
|
|
1480
2147
|
tier: this.mapHardwareTier(normalizedHardware),
|
|
2148
|
+
optimizeFor: optimizationObjective,
|
|
2149
|
+
runtime,
|
|
1481
2150
|
bestModels: [],
|
|
1482
2151
|
totalEvaluated: 0,
|
|
1483
2152
|
category: this.getCategoryInfo(category)
|
|
@@ -1491,9 +2160,12 @@ class DeterministicModelSelector {
|
|
|
1491
2160
|
/**
|
|
1492
2161
|
* Generate recommendation summary
|
|
1493
2162
|
*/
|
|
1494
|
-
generateRecommendationSummary(recommendations, hardware) {
|
|
2163
|
+
generateRecommendationSummary(recommendations, hardware, options = {}) {
|
|
1495
2164
|
const summary = {
|
|
1496
2165
|
hardware_tier: this.mapHardwareTier(hardware),
|
|
2166
|
+
optimize_for: this.normalizeOptimizationObjective(
|
|
2167
|
+
options.optimizeFor || options.optimize || options.objective
|
|
2168
|
+
),
|
|
1497
2169
|
total_categories: Object.keys(recommendations).length,
|
|
1498
2170
|
best_overall: null,
|
|
1499
2171
|
by_category: {},
|
|
@@ -1513,6 +2185,7 @@ class DeterministicModelSelector {
|
|
|
1513
2185
|
score: Math.round(bestModel.categoryScore || bestModel.score),
|
|
1514
2186
|
command: `ollama pull ${bestModel.model_identifier}`,
|
|
1515
2187
|
size: this.formatModelSize(bestModel),
|
|
2188
|
+
quantization: bestModel.quantization || bestModel.quant || 'Q4_K_M',
|
|
1516
2189
|
pulls: bestModel.pulls || 0,
|
|
1517
2190
|
source: bestModel.source || bestModel.provenance?.source || 'unknown',
|
|
1518
2191
|
registry: bestModel.registry || bestModel.provenance?.registry || 'unknown',
|
|
@@ -1548,6 +2221,7 @@ class DeterministicModelSelector {
|
|
|
1548
2221
|
category: bestOverallCategory,
|
|
1549
2222
|
score: Math.round(bestOverallScore),
|
|
1550
2223
|
command: `ollama pull ${bestOverallModel.model_identifier}`,
|
|
2224
|
+
quantization: bestOverallModel.quantization || bestOverallModel.quant || 'Q4_K_M',
|
|
1551
2225
|
source: bestOverallModel.source || bestOverallModel.provenance?.source || 'unknown',
|
|
1552
2226
|
registry: bestOverallModel.registry || bestOverallModel.provenance?.registry || 'unknown',
|
|
1553
2227
|
version: bestOverallModel.version || bestOverallModel.provenance?.version || 'unknown',
|