llm-checker 3.2.5 → 3.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,14 @@ const path = require('path');
10
10
  const os = require('os');
11
11
  const { spawn } = require('child_process');
12
12
  const { DETERMINISTIC_WEIGHTS } = require('./scoring-config');
13
+ const {
14
+ parseBillionsValue: parseMoEBillionsValue,
15
+ parsePositiveNumber: parseMoEPositiveNumber,
16
+ normalizeMoERuntime,
17
+ extractMoEMetadata: extractCanonicalMoEMetadata,
18
+ resolveMoEParameterProfile,
19
+ estimateMoESpeedMultiplier
20
+ } = require('./moe-assumptions');
13
21
 
14
22
  class DeterministicModelSelector {
15
23
  constructor() {
@@ -33,6 +41,7 @@ class DeterministicModelSelector {
33
41
  // Family quality bumps
34
42
  this.familyBumps = {
35
43
  'qwen2.5': 2,
44
+ 'qwen3': 4,
36
45
  'deepseek': 3,
37
46
  'mistral': 1,
38
47
  'llama3.1': 1,
@@ -101,6 +110,28 @@ class DeterministicModelSelector {
101
110
 
102
111
  // Category scoring weights [Q, S, F, C] from centralized config
103
112
  this.categoryWeights = DETERMINISTIC_WEIGHTS;
113
+
114
+ // User optimization profile overrides [Q, S, F, C]
115
+ this.optimizationProfiles = {
116
+ balanced: null,
117
+ speed: [0.25, 0.55, 0.15, 0.05],
118
+ quality: [0.65, 0.10, 0.15, 0.10],
119
+ context: [0.30, 0.10, 0.20, 0.40],
120
+ coding: [0.55, 0.25, 0.10, 0.10]
121
+ };
122
+
123
+ this.freshnessThresholds = {
124
+ staleDays: 365,
125
+ veryStaleDays: 730,
126
+ indexCadenceDays: 14
127
+ };
128
+
129
+ this.modelIndexStatus = {
130
+ source: 'unknown',
131
+ ageDays: null,
132
+ isStale: false,
133
+ cachedAt: null
134
+ };
104
135
  }
105
136
 
106
137
  // ============================================================================
@@ -148,6 +179,7 @@ class DeterministicModelSelector {
148
179
  const gpu = input.gpu || {};
149
180
  const memory = input.memory || {};
150
181
  const acceleration = input.acceleration || {};
182
+ const gpuEntries = Array.isArray(gpu.all) ? gpu.all : [];
151
183
 
152
184
  const totalMemGB =
153
185
  toNumber(memory.totalGB) ??
@@ -156,21 +188,62 @@ class DeterministicModelSelector {
156
188
  toNumber(input.memoryGB) ??
157
189
  8;
158
190
 
191
+ const modelHints = `${gpu.model || ''} ${gpu.vendor || ''} ${gpu.type || ''}`.toLowerCase();
192
+ const inferredUnified =
193
+ Boolean(gpu.unified) ||
194
+ /apple|m1|m2|m3|m4|unified/.test(modelHints);
195
+
196
+ const utilizationFactor = inferredUnified ? 0.85 : 0.8;
197
+ const memoryHeadroomGB = inferredUnified ? 1.5 : 2;
159
198
  const usableMemGB =
160
199
  toNumber(input.usableMemGB) ??
161
- Math.max(1, Math.min(0.8 * totalMemGB, totalMemGB - 2));
200
+ Math.max(1, Math.min(utilizationFactor * totalMemGB, totalMemGB - memoryHeadroomGB));
201
+
202
+ const gpuCount =
203
+ toNumber(gpu.gpuCount) ??
204
+ toNumber(gpu.count) ??
205
+ (gpuEntries.length > 0 ? gpuEntries.length : null) ??
206
+ toNumber(input.gpuCount) ??
207
+ 1;
208
+
209
+ const vramPerGPU =
210
+ toNumber(gpu.vramPerGPU) ??
211
+ toNumber(input.vramPerGPU) ??
212
+ null;
213
+
214
+ const summedEntryVRAM = gpuEntries.reduce((sum, entry) => {
215
+ return sum + (
216
+ toNumber(entry?.vramGB) ??
217
+ toNumber(entry?.vram) ??
218
+ toNumber(entry?.totalVRAM) ??
219
+ 0
220
+ );
221
+ }, 0);
162
222
 
163
- const vramGB =
223
+ const explicitTotalVRAM =
224
+ toNumber(gpu.totalVRAM) ??
225
+ toNumber(input.totalVRAM) ??
226
+ toNumber(input.gpuTotalVRAM) ??
227
+ (summedEntryVRAM > 0 ? summedEntryVRAM : null);
228
+
229
+ const directVRAM =
164
230
  toNumber(gpu.vramGB) ??
165
231
  toNumber(gpu.vram) ??
166
- toNumber(gpu.totalVRAM) ??
167
- toNumber(gpu.vramPerGPU) ??
232
+ null;
233
+
234
+ let vramGB =
235
+ explicitTotalVRAM ??
236
+ directVRAM ??
168
237
  0;
169
238
 
170
- const modelHints = `${gpu.model || ''} ${gpu.vendor || ''} ${gpu.type || ''}`.toLowerCase();
171
- const inferredUnified =
172
- Boolean(gpu.unified) ||
173
- /apple|m1|m2|m3|m4|unified/.test(modelHints);
239
+ // Multi-GPU fallback when only per-GPU memory is known.
240
+ if (!explicitTotalVRAM && gpuCount > 1) {
241
+ if (vramPerGPU) {
242
+ vramGB = vramPerGPU * gpuCount;
243
+ } else if (directVRAM && Boolean(gpu.isMultiGPU || input.isMultiGPU)) {
244
+ vramGB = Math.max(directVRAM, directVRAM * gpuCount);
245
+ }
246
+ }
174
247
 
175
248
  let gpuType = gpu.type;
176
249
  if (!gpuType) {
@@ -206,6 +279,9 @@ class DeterministicModelSelector {
206
279
  ...gpu,
207
280
  type: gpuType,
208
281
  vramGB,
282
+ vramPerGPU: vramPerGPU ?? (gpuCount > 0 ? (vramGB > 0 ? vramGB / gpuCount : 0) : 0),
283
+ gpuCount,
284
+ isMultiGPU: Boolean(gpu.isMultiGPU || gpuCount > 1),
209
285
  unified: inferredUnified
210
286
  },
211
287
  memory: {
@@ -217,6 +293,43 @@ class DeterministicModelSelector {
217
293
  };
218
294
  }
219
295
 
296
+ normalizeOptimizationObjective(objective) {
297
+ if (!objective) return 'balanced';
298
+ const normalized = String(objective).toLowerCase().trim();
299
+ if (['balanced', 'default', 'auto'].includes(normalized)) return 'balanced';
300
+ if (['speed', 'fast', 'latency', 'throughput'].includes(normalized)) return 'speed';
301
+ if (['quality', 'accurate', 'accuracy'].includes(normalized)) return 'quality';
302
+ if (['context', 'long-context', 'long_context', 'memory'].includes(normalized)) return 'context';
303
+ if (['coding', 'code', 'developer'].includes(normalized)) return 'coding';
304
+ return 'balanced';
305
+ }
306
+
307
+ getScoringWeights(category, optimizeFor = 'balanced') {
308
+ const base = this.categoryWeights[category] || this.categoryWeights.general;
309
+ const objective = this.normalizeOptimizationObjective(optimizeFor);
310
+ const objectiveWeights = this.optimizationProfiles[objective];
311
+
312
+ if (!objectiveWeights) {
313
+ return base;
314
+ }
315
+
316
+ // Blend category semantics with requested profile, but keep explicit
317
+ // user intent dominant (especially for quality/context priorities).
318
+ const objectivePriorities = {
319
+ speed: 0.8,
320
+ quality: 0.95,
321
+ context: 0.85,
322
+ coding: 0.8
323
+ };
324
+ const objectivePriority = objectivePriorities[objective] || 0.75;
325
+ const categoryPriority = 1 - objectivePriority;
326
+
327
+ return base.map((weight, idx) => {
328
+ const blended = (weight * categoryPriority) + (objectiveWeights[idx] * objectivePriority);
329
+ return Math.round(blended * 1000) / 1000;
330
+ });
331
+ }
332
+
220
333
  async getCPUInfo() {
221
334
  const os = require('os');
222
335
  return {
@@ -511,7 +624,8 @@ class DeterministicModelSelector {
511
624
  if (!fs.existsSync(cachePath)) continue;
512
625
  const raw = JSON.parse(fs.readFileSync(cachePath, 'utf8'));
513
626
  const sourceModels = Array.isArray(raw) ? raw : (raw.models || []);
514
- const normalized = this.normalizeExternalModels(sourceModels);
627
+ const indexMeta = this.extractModelIndexMetadata(raw, cachePath);
628
+ const normalized = this.normalizeExternalModels(sourceModels, { indexMeta });
515
629
  if (normalized.length > 0) return normalized;
516
630
  } catch (error) {
517
631
  // Ignore broken cache files and keep trying fallbacks
@@ -520,8 +634,28 @@ class DeterministicModelSelector {
520
634
  return [];
521
635
  }
522
636
 
523
- normalizeExternalModels(models = []) {
637
+ extractModelIndexMetadata(raw, sourcePath = '') {
638
+ const cachedAtRaw = raw?.cached_at || raw?.generated_at || raw?.last_updated || null;
639
+ const cachedAt = this.parseDateSafe(cachedAtRaw);
640
+ const ageDays = cachedAt
641
+ ? Math.max(0, (Date.now() - cachedAt.getTime()) / (1000 * 60 * 60 * 24))
642
+ : null;
643
+ const isStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.indexCadenceDays;
644
+
645
+ const status = {
646
+ source: sourcePath || 'cache',
647
+ ageDays: Number.isFinite(ageDays) ? Math.round(ageDays * 10) / 10 : null,
648
+ isStale: Boolean(isStale),
649
+ cachedAt: cachedAt ? cachedAt.toISOString() : null
650
+ };
651
+
652
+ this.modelIndexStatus = status;
653
+ return status;
654
+ }
655
+
656
+ normalizeExternalModels(models = [], context = {}) {
524
657
  const normalized = [];
658
+ const indexMeta = context.indexMeta || this.modelIndexStatus || {};
525
659
 
526
660
  for (const model of models) {
527
661
  if (!model || typeof model !== 'object') continue;
@@ -531,17 +665,23 @@ class DeterministicModelSelector {
531
665
  typeof model.ctxMax === 'number' &&
532
666
  model.model_identifier;
533
667
 
668
+ const freshness = this.computeFreshnessMetadata(model, indexMeta);
669
+ const quantizations = this.extractAvailableQuantizations(model, model.variants || []);
670
+
534
671
  if (alreadyNormalized) {
535
672
  normalized.push({
536
673
  ...model,
537
674
  tags: Array.isArray(model.tags) ? model.tags : [],
538
675
  modalities: Array.isArray(model.modalities) ? model.modalities : ['text'],
539
676
  installed: Boolean(model.installed),
677
+ availableQuantizations: model.availableQuantizations || quantizations,
678
+ sizeByQuant: model.sizeByQuant || {},
540
679
  source: model.source || 'ollama_database',
541
680
  registry: model.registry || 'ollama.com',
542
681
  version: model.version || model.model_identifier,
543
682
  license: model.license || 'unknown',
544
683
  digest: model.digest || 'unknown',
684
+ ...freshness,
545
685
  provenance: model.provenance || {
546
686
  source: model.source || 'ollama_database',
547
687
  registry: model.registry || 'ollama.com',
@@ -553,7 +693,7 @@ class DeterministicModelSelector {
553
693
  continue;
554
694
  }
555
695
 
556
- const converted = this.convertOllamaModelToDeterministicModels(model);
696
+ const converted = this.convertOllamaModelToDeterministicModels(model, { indexMeta });
557
697
  normalized.push(...converted);
558
698
  }
559
699
 
@@ -567,12 +707,14 @@ class DeterministicModelSelector {
567
707
  return [...deduped.values()];
568
708
  }
569
709
 
570
- convertOllamaModelToDeterministicModels(ollamaModel) {
710
+ convertOllamaModelToDeterministicModels(ollamaModel, context = {}) {
571
711
  const baseIdentifier = ollamaModel.model_identifier || ollamaModel.model_name || 'unknown';
572
712
  const fallbackTag = `${baseIdentifier}:latest`;
573
713
  const variants = Array.isArray(ollamaModel.variants) && ollamaModel.variants.length > 0
574
714
  ? ollamaModel.variants
575
715
  : [{ tag: ollamaModel.model_identifier || fallbackTag }];
716
+ const indexMeta = context.indexMeta || this.modelIndexStatus || {};
717
+ const freshness = this.computeFreshnessMetadata(ollamaModel, indexMeta);
576
718
 
577
719
  const contextLength = this.parseContextLength(
578
720
  ollamaModel.context_length ||
@@ -614,6 +756,7 @@ class DeterministicModelSelector {
614
756
  ollamaModel.main_size,
615
757
  ollamaModel.model_identifier
616
758
  );
759
+ const moeMetadata = this.extractMoEMetadata(ollamaModel, variant, paramsB, baseText);
617
760
  const quant = this.normalizeQuantization(
618
761
  variant.quantization ||
619
762
  this.extractQuantizationFromTag(variantTag) ||
@@ -623,17 +766,67 @@ class DeterministicModelSelector {
623
766
  const variantSizeGB = this.extractVariantSizeGB(variant, paramsB);
624
767
  const modalities = this.inferModalities(ollamaModel, variantTag);
625
768
  const modelTags = this.inferTagsForVariant(derivedTags, variant, variantTag);
769
+ const sizeByQuant = {};
770
+
771
+ for (const sibling of variants) {
772
+ const siblingParams = this.extractParamsFromString(
773
+ sibling.size,
774
+ sibling.tag,
775
+ ollamaModel.main_size,
776
+ ollamaModel.model_identifier
777
+ );
778
+
779
+ // Keep quantization map parameter-aware: don't blend 8B/70B/405B sizes.
780
+ if (Math.abs(siblingParams - paramsB) > 0.25) continue;
781
+
782
+ const siblingQuant = this.normalizeQuantization(
783
+ sibling.quantization ||
784
+ this.extractQuantizationFromTag(sibling.tag || '') ||
785
+ quant
786
+ );
787
+ const siblingSize = this.extractVariantSizeGB(sibling, siblingParams);
788
+ if (!Number.isFinite(sizeByQuant[siblingQuant]) || siblingSize < sizeByQuant[siblingQuant]) {
789
+ sizeByQuant[siblingQuant] = siblingSize;
790
+ }
791
+ }
792
+
793
+ const availableQuantizations = this.getQuantizationCandidates({
794
+ availableQuantizations: this.extractAvailableQuantizations(ollamaModel, variants),
795
+ sizeByQuant
796
+ });
626
797
 
627
798
  const source = ollamaModel.source || 'ollama_database';
628
799
  const registry = ollamaModel.registry || 'ollama.com';
629
800
  const version = ollamaModel.version || variantTag;
630
801
  const license = ollamaModel.license || 'unknown';
631
802
  const digest = ollamaModel.digest || 'unknown';
803
+ const normalizedExpertCount = Number.isFinite(moeMetadata.expertCount) && moeMetadata.expertCount > 0
804
+ ? Math.round(moeMetadata.expertCount)
805
+ : null;
806
+ const normalizedExpertsActive = Number.isFinite(moeMetadata.expertsActivePerToken) && moeMetadata.expertsActivePerToken > 0
807
+ ? moeMetadata.expertsActivePerToken
808
+ : null;
809
+ const normalizedTotalParamsB = Number.isFinite(moeMetadata.totalParamsB) && moeMetadata.totalParamsB > 0
810
+ ? moeMetadata.totalParamsB
811
+ : null;
812
+ const normalizedActiveParamsB = Number.isFinite(moeMetadata.activeParamsB) && moeMetadata.activeParamsB > 0
813
+ ? moeMetadata.activeParamsB
814
+ : null;
632
815
 
633
816
  return {
634
817
  name: variantTag,
635
818
  family: this.extractFamily(baseIdentifier),
636
819
  paramsB,
820
+ isMoE: Boolean(moeMetadata.isMoE),
821
+ is_moe: Boolean(moeMetadata.isMoE),
822
+ totalParamsB: normalizedTotalParamsB,
823
+ activeParamsB: normalizedActiveParamsB,
824
+ expertCount: normalizedExpertCount,
825
+ expertsActivePerToken: normalizedExpertsActive,
826
+ total_params_b: normalizedTotalParamsB,
827
+ active_params_b: normalizedActiveParamsB,
828
+ expert_count: normalizedExpertCount,
829
+ experts_active_per_token: normalizedExpertsActive,
637
830
  ctxMax: contextLength,
638
831
  quant,
639
832
  sizeGB: variantSizeGB,
@@ -642,6 +835,9 @@ class DeterministicModelSelector {
642
835
  model_identifier: variantTag,
643
836
  installed: false,
644
837
  pulls: ollamaModel.actual_pulls || ollamaModel.pulls || 0,
838
+ availableQuantizations,
839
+ sizeByQuant,
840
+ ...freshness,
645
841
  source,
646
842
  registry,
647
843
  version,
@@ -658,6 +854,134 @@ class DeterministicModelSelector {
658
854
  });
659
855
  }
660
856
 
857
+ parseBillionsValue(rawValue) {
858
+ return parseMoEBillionsValue(rawValue);
859
+ }
860
+
861
+ parsePositiveNumber(rawValue) {
862
+ return parseMoEPositiveNumber(rawValue);
863
+ }
864
+
865
+ extractMoEMetadata(model = {}, variant = {}, paramsB = null, baseText = '') {
866
+ return extractCanonicalMoEMetadata({
867
+ model,
868
+ variant,
869
+ paramsB,
870
+ baseText
871
+ });
872
+ }
873
+
874
+ parseDateSafe(value) {
875
+ if (!value || typeof value !== 'string') return null;
876
+ const parsed = new Date(value);
877
+ if (Number.isNaN(parsed.getTime())) return null;
878
+ return parsed;
879
+ }
880
+
881
+ extractAvailableQuantizations(model, variants = []) {
882
+ const quantSet = new Set();
883
+ const candidateStrings = [];
884
+
885
+ if (Array.isArray(model?.quantizations)) {
886
+ candidateStrings.push(...model.quantizations);
887
+ }
888
+ if (typeof model?.quantization === 'string') {
889
+ candidateStrings.push(model.quantization);
890
+ }
891
+ for (const variant of variants) {
892
+ if (variant?.quantization) candidateStrings.push(variant.quantization);
893
+ if (variant?.tag) candidateStrings.push(variant.tag);
894
+ }
895
+
896
+ for (const value of candidateStrings) {
897
+ const inferred = this.normalizeQuantization(
898
+ this.extractQuantizationFromTag(String(value)) || String(value)
899
+ );
900
+ if (inferred) quantSet.add(inferred);
901
+ }
902
+
903
+ if (quantSet.size === 0 && model?.quant) {
904
+ quantSet.add(this.normalizeQuantization(model.quant));
905
+ }
906
+ if (quantSet.size === 0) {
907
+ quantSet.add('Q4_K_M');
908
+ }
909
+
910
+ return [...quantSet].sort((a, b) => {
911
+ const aIdx = this.quantHierarchy.indexOf(a);
912
+ const bIdx = this.quantHierarchy.indexOf(b);
913
+ const safeA = aIdx === -1 ? Number.MAX_SAFE_INTEGER : aIdx;
914
+ const safeB = bIdx === -1 ? Number.MAX_SAFE_INTEGER : bIdx;
915
+ return safeA - safeB;
916
+ });
917
+ }
918
+
919
+ computeFreshnessMetadata(model = {}, indexMeta = {}) {
920
+ const dateCandidates = [
921
+ model.last_updated,
922
+ model.lastUpdated,
923
+ model.updated_at,
924
+ model.updatedAt,
925
+ model.release_date,
926
+ model.released_at,
927
+ model.created_at,
928
+ model.detailed_scraped_at
929
+ ];
930
+
931
+ const updatedAt = dateCandidates
932
+ .map((value) => this.parseDateSafe(value))
933
+ .find(Boolean);
934
+
935
+ const ageDays = updatedAt
936
+ ? Math.max(0, (Date.now() - updatedAt.getTime()) / (1000 * 60 * 60 * 24))
937
+ : null;
938
+
939
+ let freshnessScore = 55; // neutral fallback when timestamp is unknown
940
+ if (Number.isFinite(ageDays)) {
941
+ if (ageDays <= 30) freshnessScore = 100;
942
+ else if (ageDays <= 90) freshnessScore = 90;
943
+ else if (ageDays <= 180) freshnessScore = 75;
944
+ else if (ageDays <= 365) freshnessScore = 60;
945
+ else if (ageDays <= 540) freshnessScore = 40;
946
+ else if (ageDays <= 720) freshnessScore = 25;
947
+ else freshnessScore = 10;
948
+ }
949
+
950
+ const textBlob = [
951
+ model.model_identifier,
952
+ model.model_name,
953
+ model.name,
954
+ model.description,
955
+ model.detailed_description,
956
+ model.status,
957
+ ...(Array.isArray(model.tags) ? model.tags : [])
958
+ ]
959
+ .filter(Boolean)
960
+ .join(' ')
961
+ .toLowerCase();
962
+
963
+ const isDeprecatedByText =
964
+ /\bdeprecated\b|\bobsolete\b|\blegacy\b|\barchived\b|\breplaced by\b|\buse .+ instead\b/.test(textBlob);
965
+ const isDeprecated = Boolean(model.deprecated || model.is_deprecated || model.archived || isDeprecatedByText);
966
+ const isStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.staleDays;
967
+ const veryStale = Number.isFinite(ageDays) && ageDays > this.freshnessThresholds.veryStaleDays;
968
+ const indexStale = Boolean(indexMeta?.isStale);
969
+
970
+ if (isDeprecated) freshnessScore = Math.min(freshnessScore, 15);
971
+ if (veryStale) freshnessScore = Math.min(freshnessScore, 20);
972
+ if (indexStale && !updatedAt) freshnessScore = Math.max(0, freshnessScore - 10);
973
+
974
+ return {
975
+ lastUpdatedAt: updatedAt ? updatedAt.toISOString() : null,
976
+ modelAgeDays: Number.isFinite(ageDays) ? Math.round(ageDays * 10) / 10 : null,
977
+ freshnessScore,
978
+ isStale,
979
+ isDeprecated,
980
+ indexAgeDays: Number.isFinite(indexMeta?.ageDays) ? indexMeta.ageDays : null,
981
+ indexStale
982
+ };
983
+ }
984
+
661
985
  parseContextLength(contextValue) {
662
986
  if (typeof contextValue === 'number' && Number.isFinite(contextValue) && contextValue > 0) {
663
987
  return Math.round(contextValue);
@@ -766,7 +1090,7 @@ class DeterministicModelSelector {
766
1090
  extractFamily(modelName) {
767
1091
  const name = modelName.toLowerCase();
768
1092
  if (name.includes('qwen2.5')) return 'qwen2.5';
769
- if (name.includes('qwen3')) return 'qwen2.5';
1093
+ if (name.includes('qwen3')) return 'qwen3';
770
1094
  if (name.includes('qwen')) return 'qwen2.5';
771
1095
  if (name.includes('deepseek')) return 'deepseek';
772
1096
  if (name.includes('llama3.2') || name.includes('llama3.3')) return 'llama3.2';
@@ -889,13 +1213,22 @@ class DeterministicModelSelector {
889
1213
  topN = 5,
890
1214
  enableProbe = false,
891
1215
  silent = false,
1216
+ optimizeFor = 'balanced',
1217
+ runtime = 'ollama',
892
1218
  hardware: providedHardware = null,
893
1219
  installedModels = null,
894
1220
  modelPool = null
895
1221
  } = options;
1222
+ const normalizedRuntime = normalizeMoERuntime(runtime);
1223
+ const optimizationObjective = this.normalizeOptimizationObjective(
1224
+ options.optimize || options.objective || optimizeFor
1225
+ );
896
1226
 
897
1227
  if (!silent) {
898
1228
  console.log(`🔍 Selecting models for category: ${category}`);
1229
+ if (optimizationObjective !== 'balanced') {
1230
+ console.log(`⚙️ Optimization profile: ${optimizationObjective}`);
1231
+ }
899
1232
  }
900
1233
 
901
1234
  // Phase 0: Gather data
@@ -931,7 +1264,15 @@ class DeterministicModelSelector {
931
1264
  const budget = isUnified ? usableMem : (vram || usableMem);
932
1265
 
933
1266
  for (const model of filtered) {
934
- const result = this.evaluateModel(model, hardware, category, targetCtx, budget);
1267
+ const result = this.evaluateModel(
1268
+ model,
1269
+ hardware,
1270
+ category,
1271
+ targetCtx,
1272
+ budget,
1273
+ optimizationObjective,
1274
+ normalizedRuntime
1275
+ );
935
1276
  if (result) {
936
1277
  candidates.push(result);
937
1278
  }
@@ -939,7 +1280,14 @@ class DeterministicModelSelector {
939
1280
 
940
1281
  // Sort by score
941
1282
  candidates.sort((a, b) => b.score - a.score);
942
- const topCandidates = candidates.slice(0, topN);
1283
+ let topCandidates = candidates.slice(0, topN);
1284
+ topCandidates = this.ensureFeasibleMidTierCoverage(
1285
+ topCandidates,
1286
+ candidates,
1287
+ category,
1288
+ hardware,
1289
+ optimizationObjective
1290
+ );
943
1291
 
944
1292
  if (!silent) {
945
1293
  console.log(`✨ Selected ${topCandidates.length} top candidates`);
@@ -957,6 +1305,8 @@ class DeterministicModelSelector {
957
1305
 
958
1306
  return {
959
1307
  category,
1308
+ optimizeFor: optimizationObjective,
1309
+ runtime: normalizedRuntime,
960
1310
  hardware,
961
1311
  candidates: topCandidates,
962
1312
  total_evaluated: filtered.length,
@@ -1008,42 +1358,115 @@ class DeterministicModelSelector {
1008
1358
  });
1009
1359
  }
1010
1360
 
1011
- evaluateModel(model, hardware, category, targetCtx, budget) {
1361
+ evaluateModel(model, hardware, category, targetCtx, budget, optimizeFor = 'balanced', runtime = 'ollama') {
1012
1362
  // 1. Select best fitting quantization
1013
1363
  const bestQuant = this.selectBestQuantization(model, budget, targetCtx);
1014
1364
  if (!bestQuant) return null;
1015
1365
 
1016
1366
  // 2. Calculate required memory
1017
- const requiredGB = this.estimateRequiredGB(model, bestQuant.quant, targetCtx);
1367
+ const memoryEstimate = this.estimateMemoryBreakdown(model, bestQuant.quant, targetCtx);
1368
+ const requiredGB = memoryEstimate.requiredGB;
1018
1369
  if (requiredGB > budget) return null;
1019
1370
 
1020
1371
  // 3. Calculate component scores
1021
1372
  const Q = this.calculateQualityPrior(model, bestQuant.quant, category);
1022
- const S = this.estimateSpeed(hardware, model, bestQuant.quant, category);
1373
+ const speedEstimate = this.estimateSpeedProfile(hardware, model, bestQuant.quant, category, runtime);
1374
+ const S = speedEstimate.score;
1023
1375
  const F = this.calculateFitScore(requiredGB, budget);
1024
1376
  const C = this.calculateContextScore(model, targetCtx);
1025
1377
 
1026
1378
  // 4. Calculate final weighted score
1027
- const weights = this.categoryWeights[category] || this.categoryWeights.general;
1379
+ const weights = this.getScoringWeights(category, optimizeFor);
1028
1380
  const score = Math.round((Q * weights[0] + S * weights[1] + F * weights[2] + C * weights[3]) * 10) / 10;
1029
1381
 
1030
1382
  // 5. Build rationale
1031
- const rationale = this.buildRationale(hardware, model, bestQuant.quant, requiredGB, budget, category, Q, S);
1383
+ const rationale = this.buildRationale(
1384
+ hardware,
1385
+ model,
1386
+ bestQuant.quant,
1387
+ requiredGB,
1388
+ budget,
1389
+ category,
1390
+ Q,
1391
+ S,
1392
+ memoryEstimate,
1393
+ speedEstimate
1394
+ );
1032
1395
 
1033
1396
  return {
1034
1397
  meta: model,
1035
1398
  quant: bestQuant.quant,
1036
1399
  requiredGB: Math.round(requiredGB * 10) / 10,
1037
- estTPS: S,
1400
+ estTPS: speedEstimate.estimatedTPS,
1038
1401
  score,
1402
+ runtime: speedEstimate.runtime,
1039
1403
  rationale,
1404
+ memory: {
1405
+ modelMemGB: Math.round(memoryEstimate.modelMemGB * 100) / 100,
1406
+ kvCacheGB: Math.round(memoryEstimate.kvCacheGB * 100) / 100,
1407
+ runtimeOverheadGB: Math.round(memoryEstimate.runtimeOverheadGB * 100) / 100,
1408
+ memorySource: memoryEstimate.memorySource,
1409
+ assumptionSource: memoryEstimate.parameterProfile.assumptionSource,
1410
+ isMoE: memoryEstimate.parameterProfile.isMoE,
1411
+ effectiveParamsB: Math.round(memoryEstimate.parameterProfile.effectiveParamsB * 1000) / 1000
1412
+ },
1413
+ speed: {
1414
+ backend: speedEstimate.backend,
1415
+ targetTPS: speedEstimate.targetTPS,
1416
+ estimatedTPS: speedEstimate.estimatedTPS,
1417
+ runtime: speedEstimate.runtime,
1418
+ moe: speedEstimate.moe
1419
+ },
1040
1420
  components: { Q, S, F, C }
1041
1421
  };
1042
1422
  }
1043
1423
 
1424
+ getQuantizationCandidates(model) {
1425
+ const normalizedAvailable = Array.isArray(model?.availableQuantizations)
1426
+ ? model.availableQuantizations.map((quant) => this.normalizeQuantization(quant))
1427
+ : [];
1428
+ const fromSizeMap = model?.sizeByQuant && typeof model.sizeByQuant === 'object'
1429
+ ? Object.keys(model.sizeByQuant).map((quant) => this.normalizeQuantization(quant))
1430
+ : [];
1431
+
1432
+ const seeded = (fromSizeMap.length > 0
1433
+ ? [...new Set(fromSizeMap)]
1434
+ : [...new Set(normalizedAvailable)])
1435
+ .filter(Boolean);
1436
+
1437
+ let candidates = seeded.length > 0 ? seeded : [...this.quantHierarchy];
1438
+
1439
+ // If we have at least one known quantization, allow extrapolating to
1440
+ // *more compressed* levels as an explicit feasibility assumption.
1441
+ if (seeded.length > 0) {
1442
+ const expanded = new Set();
1443
+ for (const quant of seeded) {
1444
+ const idx = this.quantHierarchy.indexOf(quant);
1445
+ if (idx === -1) {
1446
+ expanded.add(quant);
1447
+ continue;
1448
+ }
1449
+ for (let i = idx; i < this.quantHierarchy.length; i++) {
1450
+ expanded.add(this.quantHierarchy[i]);
1451
+ }
1452
+ }
1453
+ candidates = [...expanded];
1454
+ }
1455
+
1456
+ return candidates.sort((a, b) => {
1457
+ const aIdx = this.quantHierarchy.indexOf(a);
1458
+ const bIdx = this.quantHierarchy.indexOf(b);
1459
+ const safeA = aIdx === -1 ? Number.MAX_SAFE_INTEGER : aIdx;
1460
+ const safeB = bIdx === -1 ? Number.MAX_SAFE_INTEGER : bIdx;
1461
+ return safeA - safeB;
1462
+ });
1463
+ }
1464
+
1044
1465
  selectBestQuantization(model, budget, targetCtx) {
1466
+ const quantizationCandidates = this.getQuantizationCandidates(model);
1467
+
1045
1468
  // Try quantizations from best to worst quality
1046
- for (const quant of this.quantHierarchy) {
1469
+ for (const quant of quantizationCandidates) {
1047
1470
  const requiredGB = this.estimateRequiredGB(model, quant, targetCtx);
1048
1471
  if (requiredGB <= budget) {
1049
1472
  return { quant, sizeGB: requiredGB };
@@ -1053,7 +1476,7 @@ class DeterministicModelSelector {
1053
1476
  // If nothing fits at target context, try halving context once
1054
1477
  const halfCtx = Math.floor(targetCtx / 2);
1055
1478
  if (halfCtx >= 1024) {
1056
- for (const quant of this.quantHierarchy) {
1479
+ for (const quant of quantizationCandidates) {
1057
1480
  const requiredGB = this.estimateRequiredGB(model, quant, halfCtx);
1058
1481
  if (requiredGB <= budget) {
1059
1482
  return { quant, sizeGB: requiredGB };
@@ -1064,7 +1487,11 @@ class DeterministicModelSelector {
1064
1487
  return null; // Model doesn't fit
1065
1488
  }
1066
1489
 
1067
- estimateRequiredGB(model, quant, ctx) {
1490
+ resolveMemoryParameterProfile(model = {}) {
1491
+ return resolveMoEParameterProfile(model);
1492
+ }
1493
+
1494
+ estimateMemoryBreakdown(model, quant, ctx) {
1068
1495
  // Bytes per parameter by quantization level (calibrated to real Ollama sizes)
1069
1496
  // 7B Q4_K_M=~4.5GB, 14B Q4_K_M=~9GB, 32B Q4_K_M=~19GB
1070
1497
  const bytesPerParam = {
@@ -1075,17 +1502,54 @@ class DeterministicModelSelector {
1075
1502
  'Q3_K': 0.48,
1076
1503
  'Q2_K': 0.37
1077
1504
  };
1078
- const bpp = bytesPerParam[quant] || 0.63;
1079
- const modelMemGB = model.paramsB * bpp;
1505
+ const normalizedQuant = this.normalizeQuantization(quant);
1506
+ const bpp = bytesPerParam[normalizedQuant] || 0.63;
1507
+ const sizeByQuant = model?.sizeByQuant && typeof model.sizeByQuant === 'object' ? model.sizeByQuant : {};
1508
+ const observedFromSizeMap = Number(sizeByQuant[normalizedQuant]);
1509
+ const directVariantMatch =
1510
+ this.normalizeQuantization(model?.quant || '') === normalizedQuant
1511
+ ? Number(model?.sizeGB ?? model?.size)
1512
+ : NaN;
1513
+
1514
+ const observedWeightGB = Number.isFinite(observedFromSizeMap) && observedFromSizeMap > 0
1515
+ ? observedFromSizeMap
1516
+ : (Number.isFinite(directVariantMatch) && directVariantMatch > 0 ? directVariantMatch : null);
1517
+
1518
+ const parameterProfile = this.resolveMemoryParameterProfile(model);
1519
+ const modeledWeightGB = parameterProfile.effectiveParamsB * bpp;
1520
+ const preferSparseInferenceParams =
1521
+ parameterProfile.isMoE &&
1522
+ (parameterProfile.assumptionSource === 'moe_active_metadata' ||
1523
+ parameterProfile.assumptionSource === 'moe_derived_expert_ratio');
1524
+ const useObservedArtifactSize =
1525
+ !preferSparseInferenceParams &&
1526
+ Number.isFinite(observedWeightGB) &&
1527
+ observedWeightGB > 0;
1528
+ const modelMemGB = useObservedArtifactSize ? observedWeightGB : modeledWeightGB;
1529
+ const effectiveCtx = Number.isFinite(Number(ctx)) && Number(ctx) > 0 ? Number(ctx) : 4096;
1080
1530
 
1081
1531
  // KV cache: ~2 * numLayers * hiddenDim * 2bytes * ctx / 1e9
1082
1532
  // Simplified: ~0.000008 GB per billion params per context token
1083
- const kvCacheGB = 0.000008 * model.paramsB * ctx;
1533
+ const kvCacheGB = 0.000008 * parameterProfile.effectiveParamsB * effectiveCtx;
1084
1534
 
1085
1535
  // Runtime overhead (Metal/CUDA context, buffers)
1086
- const runtimeOverhead = 0.5;
1536
+ const runtimeOverhead = useObservedArtifactSize ? 0.35 : 0.5;
1537
+ const memorySource = useObservedArtifactSize
1538
+ ? 'observed_artifact_size'
1539
+ : (preferSparseInferenceParams ? 'moe_sparse_inference_params' : 'estimated_from_params');
1087
1540
 
1088
- return modelMemGB + kvCacheGB + runtimeOverhead;
1541
+ return {
1542
+ parameterProfile,
1543
+ memorySource,
1544
+ modelMemGB,
1545
+ kvCacheGB,
1546
+ runtimeOverheadGB: runtimeOverhead,
1547
+ requiredGB: modelMemGB + kvCacheGB + runtimeOverhead
1548
+ };
1549
+ }
1550
+
1551
+ estimateRequiredGB(model, quant, ctx) {
1552
+ return this.estimateMemoryBreakdown(model, quant, ctx).requiredGB;
1089
1553
  }
1090
1554
 
1091
1555
  calculateQualityPrior(model, quant, category) {
@@ -1099,6 +1563,10 @@ class DeterministicModelSelector {
1099
1563
  // Quantization penalty
1100
1564
  const quantPenalty = this.quantPenalties[quant] || -5;
1101
1565
  Q += quantPenalty;
1566
+
1567
+ // Freshness/deprecation adjustment
1568
+ const freshnessAdjustment = this.calculateFreshnessAdjustment(model);
1569
+ Q += freshnessAdjustment;
1102
1570
 
1103
1571
  // Task alignment bump
1104
1572
  const taskBump = this.getTaskAlignmentBump(model, category);
@@ -1155,7 +1623,28 @@ class DeterministicModelSelector {
1155
1623
  }
1156
1624
  }
1157
1625
 
1158
- estimateSpeed(hardware, model, quant, category) {
1626
+ calculateFreshnessAdjustment(model = {}) {
1627
+ const freshnessScore = Number.isFinite(model.freshnessScore) ? model.freshnessScore : 55;
1628
+ const ageDays = Number.isFinite(model.modelAgeDays) ? model.modelAgeDays : null;
1629
+ const isDeprecated = Boolean(model.isDeprecated);
1630
+ const isStale = Boolean(model.isStale);
1631
+
1632
+ if (isDeprecated) return -12;
1633
+ if (ageDays !== null && ageDays > this.freshnessThresholds.veryStaleDays) return -8;
1634
+ if (ageDays !== null && ageDays > this.freshnessThresholds.staleDays) return -4;
1635
+ if (isStale) return -3;
1636
+ if (freshnessScore >= 90) return 3;
1637
+ if (freshnessScore >= 75) return 2;
1638
+ if (freshnessScore >= 60) return 1;
1639
+ if (freshnessScore <= 25) return -4;
1640
+ return 0;
1641
+ }
1642
+
1643
+ estimateSpeed(hardware, model, quant, category, runtime = 'ollama') {
1644
+ return this.estimateSpeedProfile(hardware, model, quant, category, runtime).score;
1645
+ }
1646
+
1647
+ estimateSpeedProfile(hardware, model, quant, category, runtime = 'ollama') {
1159
1648
  // Determine backend
1160
1649
  let backend = 'cpu_x86';
1161
1650
  if (hardware.acceleration.supports_metal) backend = 'metal';
@@ -1164,7 +1653,14 @@ class DeterministicModelSelector {
1164
1653
 
1165
1654
  // Base speed calculation
1166
1655
  const K = this.backendK[backend];
1167
- let base = K / model.paramsB;
1656
+ const denseParamsB = Number.isFinite(this.parseBillionsValue(model.paramsB))
1657
+ ? this.parseBillionsValue(model.paramsB)
1658
+ : 1;
1659
+ const parameterProfile = this.resolveMemoryParameterProfile(model);
1660
+ const effectiveParamsB = Number.isFinite(parameterProfile.effectiveParamsB) && parameterProfile.effectiveParamsB > 0
1661
+ ? parameterProfile.effectiveParamsB
1662
+ : denseParamsB;
1663
+ let base = K / effectiveParamsB;
1168
1664
 
1169
1665
  // Quantization multiplier
1170
1666
  const quantMultiplier = this.quantSpeedMultipliers[quant] || 1.0;
@@ -1173,10 +1669,31 @@ class DeterministicModelSelector {
1173
1669
  // Threading multiplier
1174
1670
  if (hardware.cpu.cores >= 8) base *= 1.1;
1175
1671
  if (hardware.acceleration.supports_metal || hardware.acceleration.supports_cuda) base *= 1.2;
1672
+
1673
+ const normalizedRuntime = normalizeMoERuntime(runtime);
1674
+ const moe = estimateMoESpeedMultiplier({
1675
+ model,
1676
+ runtime: normalizedRuntime,
1677
+ denseParamsB,
1678
+ parameterProfile
1679
+ });
1680
+ if (moe.applied) {
1681
+ base *= moe.multiplier;
1682
+ }
1176
1683
 
1177
1684
  // Normalize to 0-100 score
1178
1685
  const target = this.targetSpeeds[category] || this.targetSpeeds.general;
1179
- return Math.min(100, Math.round((100 * base / target) * 10) / 10);
1686
+ const estimatedTPS = Math.max(1, Math.round(base * 10) / 10);
1687
+ const score = Math.min(100, Math.round((100 * estimatedTPS / target) * 10) / 10);
1688
+
1689
+ return {
1690
+ backend,
1691
+ targetTPS: target,
1692
+ estimatedTPS,
1693
+ score,
1694
+ runtime: normalizedRuntime,
1695
+ moe
1696
+ };
1180
1697
  }
1181
1698
 
1182
1699
  calculateFitScore(requiredGB, budgetGB) {
@@ -1192,7 +1709,104 @@ class DeterministicModelSelector {
1192
1709
  return 0; // Should be filtered out earlier
1193
1710
  }
1194
1711
 
1195
- buildRationale(hardware, model, quant, requiredGB, budget, category, Q, S) {
1712
+ estimatePracticalMaxParamsForBudget(budgetGB) {
1713
+ if (!Number.isFinite(budgetGB) || budgetGB <= 0) return 4;
1714
+ if (budgetGB >= 80) return 70;
1715
+ if (budgetGB >= 48) return 46;
1716
+ if (budgetGB >= 32) return 30;
1717
+ if (budgetGB >= 24) return 14;
1718
+ if (budgetGB >= 16) return 8;
1719
+ return 4;
1720
+ }
1721
+
1722
+ ensureFeasibleMidTierCoverage(selectedCandidates, allCandidates, category, hardware, optimizeFor = 'balanced') {
1723
+ if (!Array.isArray(selectedCandidates) || selectedCandidates.length === 0) {
1724
+ return selectedCandidates;
1725
+ }
1726
+
1727
+ const objective = this.normalizeOptimizationObjective(optimizeFor);
1728
+ if (objective === 'speed') {
1729
+ return selectedCandidates;
1730
+ }
1731
+
1732
+ const enforceCategories = new Set(['general', 'talking', 'reading', 'coding', 'reasoning', 'multimodal']);
1733
+ if (!enforceCategories.has(category)) {
1734
+ return selectedCandidates;
1735
+ }
1736
+
1737
+ const normalizedHardware = this.normalizeHardwareProfile(hardware || {});
1738
+ const budget = normalizedHardware.gpu.unified
1739
+ ? normalizedHardware.usableMemGB
1740
+ : (normalizedHardware.gpu.vramGB || normalizedHardware.usableMemGB);
1741
+
1742
+ if (!Number.isFinite(budget) || budget < 16) {
1743
+ return selectedCandidates;
1744
+ }
1745
+
1746
+ const candidatePool = Array.isArray(allCandidates) && allCandidates.length > 0
1747
+ ? allCandidates
1748
+ : selectedCandidates;
1749
+ let promoted = [...selectedCandidates];
1750
+
1751
+ const minMidTierParams = budget >= 24 ? 7 : 6;
1752
+ const alreadyHasMidTier = promoted.some((candidate) => (candidate?.meta?.paramsB || 0) >= minMidTierParams);
1753
+ if (!alreadyHasMidTier) {
1754
+ const practicalSpeedFloor = normalizedHardware.gpu.unified ? 25 : 20;
1755
+ const feasibleMidTier = candidatePool.find((candidate) => {
1756
+ const params = candidate?.meta?.paramsB || 0;
1757
+ const speedScore = candidate?.components?.S ?? candidate?.estTPS ?? 0;
1758
+ return params >= minMidTierParams && speedScore >= practicalSpeedFloor;
1759
+ });
1760
+
1761
+ if (
1762
+ feasibleMidTier &&
1763
+ !promoted.some((candidate) => candidate?.meta?.model_identifier === feasibleMidTier?.meta?.model_identifier)
1764
+ ) {
1765
+ promoted[promoted.length - 1] = feasibleMidTier;
1766
+ promoted.sort((a, b) => b.score - a.score);
1767
+ }
1768
+ }
1769
+
1770
+ const practicalMaxParams = this.estimatePracticalMaxParamsForBudget(budget);
1771
+ const shouldEnforceThirtyBCoverage =
1772
+ Boolean(normalizedHardware?.gpu?.isMultiGPU) &&
1773
+ !Boolean(normalizedHardware?.gpu?.unified) &&
1774
+ practicalMaxParams >= 30;
1775
+
1776
+ if (!shouldEnforceThirtyBCoverage || objective === 'speed') {
1777
+ return promoted;
1778
+ }
1779
+
1780
+ const alreadyHasThirtyB = promoted.some((candidate) => (candidate?.meta?.paramsB || 0) >= 30);
1781
+ if (alreadyHasThirtyB) {
1782
+ return promoted;
1783
+ }
1784
+
1785
+ const largeModelSpeedFloor = Math.max(
1786
+ 8,
1787
+ Math.round((this.targetSpeeds[category] || this.targetSpeeds.general) * 0.2)
1788
+ );
1789
+ const feasibleThirtyB = candidatePool.find((candidate) => {
1790
+ const params = candidate?.meta?.paramsB || 0;
1791
+ const estTPS = candidate?.estTPS ?? candidate?.speed?.estimatedTPS ?? 0;
1792
+ return params >= 30 && estTPS >= largeModelSpeedFloor;
1793
+ });
1794
+
1795
+ if (!feasibleThirtyB) {
1796
+ return promoted;
1797
+ }
1798
+
1799
+ if (promoted.some((candidate) => candidate?.meta?.model_identifier === feasibleThirtyB?.meta?.model_identifier)) {
1800
+ return promoted;
1801
+ }
1802
+
1803
+ const highCapacityPromoted = [...promoted];
1804
+ highCapacityPromoted[highCapacityPromoted.length - 1] = feasibleThirtyB;
1805
+ highCapacityPromoted.sort((a, b) => b.score - a.score);
1806
+ return highCapacityPromoted;
1807
+ }
1808
+
1809
+ buildRationale(hardware, model, quant, requiredGB, budget, category, Q, S, memoryEstimate = null, speedEstimate = null) {
1196
1810
  const parts = [];
1197
1811
 
1198
1812
  // Memory fit
@@ -1204,6 +1818,27 @@ class DeterministicModelSelector {
1204
1818
  // Special attributes
1205
1819
  if (model.tags.includes('coder')) parts.push('coder-tuned');
1206
1820
  if (model.modalities.includes('vision')) parts.push('vision-capable');
1821
+ if (model.isDeprecated) parts.push('deprecated penalized');
1822
+ else if (model.isStale) parts.push('stale penalized');
1823
+ else if (model.freshnessScore >= 90) parts.push('fresh release');
1824
+
1825
+ const memoryProfile = memoryEstimate?.parameterProfile;
1826
+ if (memoryProfile?.isMoE) {
1827
+ const assumptionLabels = {
1828
+ moe_active_metadata: 'MoE active params',
1829
+ moe_derived_expert_ratio: 'MoE derived active ratio',
1830
+ moe_fallback_total_params: 'MoE fallback total params',
1831
+ moe_fallback_model_params: 'MoE fallback model params',
1832
+ moe_fallback_default: 'MoE fallback default'
1833
+ };
1834
+ parts.push(assumptionLabels[memoryProfile.assumptionSource] || memoryProfile.assumptionSource);
1835
+ }
1836
+
1837
+ if (speedEstimate?.moe?.applied) {
1838
+ const runtimeLabel = speedEstimate.runtime || 'ollama';
1839
+ const multiplier = Number(speedEstimate.moe.multiplier || 1).toFixed(2);
1840
+ parts.push(`MoE speed x${multiplier} (${runtimeLabel})`);
1841
+ }
1207
1842
 
1208
1843
  // Size sweet spot
1209
1844
  if (model.paramsB >= 7 && model.paramsB <= 13) {
@@ -1380,6 +2015,16 @@ class DeterministicModelSelector {
1380
2015
  quantization: candidate.quant,
1381
2016
  estimatedRAM: candidate.requiredGB,
1382
2017
  reasoning: candidate.rationale,
2018
+ runtime: candidate.runtime || candidate.speed?.runtime || 'ollama',
2019
+ memoryAssumptionSource: candidate.memory?.assumptionSource || 'dense_params',
2020
+ speedAssumptions: candidate.speed?.moe ? {
2021
+ applied: Boolean(candidate.speed.moe.applied),
2022
+ runtime: candidate.speed.runtime || candidate.runtime || 'ollama',
2023
+ multiplier: Number.isFinite(candidate.speed.moe.multiplier) ? candidate.speed.moe.multiplier : 1,
2024
+ theoreticalSpeedup: Number.isFinite(candidate.speed.moe.theoreticalSpeedup) ? candidate.speed.moe.theoreticalSpeedup : 1,
2025
+ overheadMultiplier: Number.isFinite(candidate.speed.moe.overheadMultiplier) ? candidate.speed.moe.overheadMultiplier : 1,
2026
+ assumptionSource: candidate.speed.moe.assumptionSource || candidate.memory?.assumptionSource || 'dense_params'
2027
+ } : null,
1383
2028
  source: provenance.source,
1384
2029
  registry: provenance.registry,
1385
2030
  version: provenance.version,
@@ -1410,9 +2055,23 @@ class DeterministicModelSelector {
1410
2055
  cores = 4;
1411
2056
  }
1412
2057
 
1413
- if (ram >= 64 && cores >= 16) return 'extreme';
1414
- if (ram >= 32 && cores >= 12) return 'very_high';
1415
- if (ram >= 16 && cores >= 8) return 'high';
2058
+ const gpu = hardware?.gpu || {};
2059
+ const gpuCount =
2060
+ (Number.isFinite(Number(gpu.gpuCount)) ? Number(gpu.gpuCount) : null) ??
2061
+ (Number.isFinite(Number(hardware?.gpuCount)) ? Number(hardware.gpuCount) : null) ??
2062
+ 1;
2063
+ const totalVRAM =
2064
+ (Number.isFinite(Number(gpu.vramGB)) ? Number(gpu.vramGB) : null) ??
2065
+ (Number.isFinite(Number(gpu.vram)) ? Number(gpu.vram) : null) ??
2066
+ (Number.isFinite(Number(gpu.totalVRAM)) ? Number(gpu.totalVRAM) : null) ??
2067
+ 0;
2068
+ const unifiedGPU = Boolean(gpu.unified) || gpu.type === 'apple_silicon';
2069
+ const effectiveAcceleratorMem = unifiedGPU ? Math.max(totalVRAM, ram) : totalVRAM;
2070
+
2071
+ if (effectiveAcceleratorMem >= 80 || (ram >= 64 && cores >= 16)) return 'extreme';
2072
+ if (effectiveAcceleratorMem >= 48 || (ram >= 32 && cores >= 12)) return 'very_high';
2073
+ if (effectiveAcceleratorMem >= 24 || (ram >= 16 && cores >= 8)) return 'high';
2074
+ if (gpuCount >= 2 && effectiveAcceleratorMem >= 20) return 'high';
1416
2075
  if (ram >= 8 && cores >= 4) return 'medium';
1417
2076
  return 'low';
1418
2077
  }
@@ -1451,12 +2110,16 @@ class DeterministicModelSelector {
1451
2110
  /**
1452
2111
  * Generate recommendations by category (main API, replaces EnhancedModelSelector)
1453
2112
  */
1454
- async getBestModelsForHardware(hardware, allModels) {
2113
+ async getBestModelsForHardware(hardware, allModels, options = {}) {
1455
2114
  const categories = ['coding', 'reasoning', 'multimodal', 'creative', 'talking', 'reading', 'general'];
1456
2115
  const recommendations = {};
1457
2116
  const normalizedPool = this.normalizeExternalModels(Array.isArray(allModels) ? allModels : []);
1458
2117
  const installedModels = await this.getInstalledModels();
1459
2118
  const normalizedHardware = this.normalizeHardwareProfile(hardware || await this.getHardware());
2119
+ const runtime = normalizeMoERuntime(options.runtime || 'ollama');
2120
+ const optimizationObjective = this.normalizeOptimizationObjective(
2121
+ options.optimizeFor || options.optimize || options.objective
2122
+ );
1460
2123
 
1461
2124
  for (const category of categories) {
1462
2125
  try {
@@ -1464,6 +2127,8 @@ class DeterministicModelSelector {
1464
2127
  topN: 3,
1465
2128
  enableProbe: false,
1466
2129
  silent: true,
2130
+ optimizeFor: optimizationObjective,
2131
+ runtime,
1467
2132
  hardware: normalizedHardware,
1468
2133
  installedModels,
1469
2134
  modelPool: normalizedPool
@@ -1471,6 +2136,8 @@ class DeterministicModelSelector {
1471
2136
 
1472
2137
  recommendations[category] = {
1473
2138
  tier: this.mapHardwareTier(normalizedHardware),
2139
+ optimizeFor: optimizationObjective,
2140
+ runtime,
1474
2141
  bestModels: result.candidates.map(candidate => this.mapCandidateToLegacyFormat(candidate)),
1475
2142
  totalEvaluated: result.total_evaluated,
1476
2143
  category: this.getCategoryInfo(category)
@@ -1478,6 +2145,8 @@ class DeterministicModelSelector {
1478
2145
  } catch (error) {
1479
2146
  recommendations[category] = {
1480
2147
  tier: this.mapHardwareTier(normalizedHardware),
2148
+ optimizeFor: optimizationObjective,
2149
+ runtime,
1481
2150
  bestModels: [],
1482
2151
  totalEvaluated: 0,
1483
2152
  category: this.getCategoryInfo(category)
@@ -1491,9 +2160,12 @@ class DeterministicModelSelector {
1491
2160
  /**
1492
2161
  * Generate recommendation summary
1493
2162
  */
1494
- generateRecommendationSummary(recommendations, hardware) {
2163
+ generateRecommendationSummary(recommendations, hardware, options = {}) {
1495
2164
  const summary = {
1496
2165
  hardware_tier: this.mapHardwareTier(hardware),
2166
+ optimize_for: this.normalizeOptimizationObjective(
2167
+ options.optimizeFor || options.optimize || options.objective
2168
+ ),
1497
2169
  total_categories: Object.keys(recommendations).length,
1498
2170
  best_overall: null,
1499
2171
  by_category: {},
@@ -1513,6 +2185,7 @@ class DeterministicModelSelector {
1513
2185
  score: Math.round(bestModel.categoryScore || bestModel.score),
1514
2186
  command: `ollama pull ${bestModel.model_identifier}`,
1515
2187
  size: this.formatModelSize(bestModel),
2188
+ quantization: bestModel.quantization || bestModel.quant || 'Q4_K_M',
1516
2189
  pulls: bestModel.pulls || 0,
1517
2190
  source: bestModel.source || bestModel.provenance?.source || 'unknown',
1518
2191
  registry: bestModel.registry || bestModel.provenance?.registry || 'unknown',
@@ -1548,6 +2221,7 @@ class DeterministicModelSelector {
1548
2221
  category: bestOverallCategory,
1549
2222
  score: Math.round(bestOverallScore),
1550
2223
  command: `ollama pull ${bestOverallModel.model_identifier}`,
2224
+ quantization: bestOverallModel.quantization || bestOverallModel.quant || 'Q4_K_M',
1551
2225
  source: bestOverallModel.source || bestOverallModel.provenance?.source || 'unknown',
1552
2226
  registry: bestOverallModel.registry || bestOverallModel.provenance?.registry || 'unknown',
1553
2227
  version: bestOverallModel.version || bestOverallModel.provenance?.version || 'unknown',