mlx-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx_stack/__init__.py +5 -0
- mlx_stack/_version.py +24 -0
- mlx_stack/cli/__init__.py +5 -0
- mlx_stack/cli/bench.py +221 -0
- mlx_stack/cli/config.py +166 -0
- mlx_stack/cli/down.py +109 -0
- mlx_stack/cli/init.py +180 -0
- mlx_stack/cli/install.py +165 -0
- mlx_stack/cli/logs.py +234 -0
- mlx_stack/cli/main.py +187 -0
- mlx_stack/cli/models.py +304 -0
- mlx_stack/cli/profile.py +65 -0
- mlx_stack/cli/pull.py +134 -0
- mlx_stack/cli/recommend.py +397 -0
- mlx_stack/cli/status.py +111 -0
- mlx_stack/cli/up.py +163 -0
- mlx_stack/cli/watch.py +252 -0
- mlx_stack/core/__init__.py +1 -0
- mlx_stack/core/benchmark.py +1182 -0
- mlx_stack/core/catalog.py +560 -0
- mlx_stack/core/config.py +471 -0
- mlx_stack/core/deps.py +323 -0
- mlx_stack/core/hardware.py +304 -0
- mlx_stack/core/launchd.py +531 -0
- mlx_stack/core/litellm_gen.py +188 -0
- mlx_stack/core/log_rotation.py +231 -0
- mlx_stack/core/log_viewer.py +386 -0
- mlx_stack/core/models.py +639 -0
- mlx_stack/core/paths.py +79 -0
- mlx_stack/core/process.py +887 -0
- mlx_stack/core/pull.py +815 -0
- mlx_stack/core/scoring.py +611 -0
- mlx_stack/core/stack_down.py +317 -0
- mlx_stack/core/stack_init.py +524 -0
- mlx_stack/core/stack_status.py +229 -0
- mlx_stack/core/stack_up.py +856 -0
- mlx_stack/core/watchdog.py +744 -0
- mlx_stack/data/__init__.py +1 -0
- mlx_stack/data/catalog/__init__.py +1 -0
- mlx_stack/data/catalog/deepseek-r1-32b.yaml +46 -0
- mlx_stack/data/catalog/deepseek-r1-8b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-12b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-27b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-4b.yaml +45 -0
- mlx_stack/data/catalog/llama3.3-8b.yaml +44 -0
- mlx_stack/data/catalog/nemotron-49b.yaml +41 -0
- mlx_stack/data/catalog/nemotron-8b.yaml +44 -0
- mlx_stack/data/catalog/qwen3-8b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-0.8b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-14b.yaml +46 -0
- mlx_stack/data/catalog/qwen3.5-32b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-3b.yaml +44 -0
- mlx_stack/data/catalog/qwen3.5-72b.yaml +42 -0
- mlx_stack/data/catalog/qwen3.5-8b.yaml +45 -0
- mlx_stack/py.typed +1 -0
- mlx_stack/utils/__init__.py +1 -0
- mlx_stack-0.1.0.dist-info/METADATA +397 -0
- mlx_stack-0.1.0.dist-info/RECORD +61 -0
- mlx_stack-0.1.0.dist-info/WHEEL +4 -0
- mlx_stack-0.1.0.dist-info/entry_points.txt +2 -0
- mlx_stack-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
"""Recommendation scoring engine for mlx-stack.
|
|
2
|
+
|
|
3
|
+
Scores and ranks catalog models for a given hardware profile and intent.
|
|
4
|
+
Supports 2 intents (balanced, agent-fleet) with weighted composite scoring
|
|
5
|
+
across speed, quality, tool_calling, and memory_efficiency dimensions.
|
|
6
|
+
|
|
7
|
+
Uses log-scaled gen_tps normalization to prevent extreme speed models from
|
|
8
|
+
dominating composite scores. Filters models by memory budget (default 40%
|
|
9
|
+
of unified memory). Assigns models to tiers (fast, standard, longctx).
|
|
10
|
+
|
|
11
|
+
Handles bandwidth-ratio estimation for unknown hardware profiles where
|
|
12
|
+
catalog benchmark data is not available.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import math
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from mlx_stack.core.catalog import BenchmarkResult, CatalogEntry
|
|
23
|
+
from mlx_stack.core.hardware import HardwareProfile
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger("mlx_stack")
|
|
26
|
+
|
|
27
|
+
# --------------------------------------------------------------------------- #
|
|
28
|
+
# Constants
|
|
29
|
+
# --------------------------------------------------------------------------- #
|
|
30
|
+
|
|
31
|
+
# Default memory budget percentage of unified memory. Used as the default
# `budget_pct` by compute_memory_budget() and recommend().
DEFAULT_MEMORY_BUDGET_PCT = 40

# Valid intents. Scoring entry points raise ScoringError for any other value.
VALID_INTENTS: set[str] = {"balanced", "agent-fleet"}

# Tier names attached to TierAssignment.tier by assign_tiers().
TIER_FAST = "fast"
TIER_STANDARD = "standard"
TIER_LONGCTX = "longctx"

# Architectures that qualify a model for the longctx tier
# (matched against CatalogEntry.architecture).
_LONGCTX_ARCHITECTURES: set[str] = {"mamba2-hybrid"}

# Default quantization for memory lookup; also the quant recorded on every
# tier assignment.
_DEFAULT_QUANT = "int4"

# Reference hardware profiles used in catalog benchmarks and their bandwidths.
# Used for bandwidth-ratio estimation when a model has no benchmark data
# for the user's hardware.
# NOTE(review): values are divided into HardwareProfile.bandwidth_gbps, so they
# are presumably in GB/s as well — confirm against the hardware module.
_REFERENCE_PROFILES: dict[str, float] = {
    "m4-pro-48": 273.0,
    "m4-max-128": 546.0,
    "m5-max-128": 546.0,
}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# --------------------------------------------------------------------------- #
|
|
59
|
+
# Intent weight configurations
|
|
60
|
+
# --------------------------------------------------------------------------- #
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass(frozen=True)
class IntentWeights:
    """Per-dimension weights used to blend a model's scores for one intent.

    The four weights are checked when the instance is created and must add
    up to 1.0 (within an absolute tolerance of 0.001).
    """

    speed: float
    quality: float
    tool_calling: float
    memory_efficiency: float

    def __post_init__(self) -> None:
        """Validate that the weights form a normalized combination.

        Raises:
            ValueError: If the four weights do not sum to 1.0.
        """
        weight_sum = self.speed + self.quality + self.tool_calling + self.memory_efficiency
        if math.isclose(weight_sum, 1.0, abs_tol=0.001):
            return
        raise ValueError(f"Intent weights must sum to 1.0, got {weight_sum}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Weight configurations for each intent, keyed by intent name.
# Every key here must also appear in VALID_INTENTS, because score_and_filter()
# indexes this table directly after validating the intent.
INTENT_WEIGHTS: dict[str, IntentWeights] = {
    # Quality carries the largest weight.
    "balanced": IntentWeights(
        speed=0.25,
        quality=0.40,
        tool_calling=0.15,
        memory_efficiency=0.20,
    ),
    # Tool calling carries the largest weight, followed by speed.
    "agent-fleet": IntentWeights(
        speed=0.30,
        quality=0.20,
        tool_calling=0.35,
        memory_efficiency=0.15,
    ),
}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# --------------------------------------------------------------------------- #
|
|
100
|
+
# Exceptions
|
|
101
|
+
# --------------------------------------------------------------------------- #
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class ScoringError(Exception):
    """Raised when scoring fails.

    Within this module it is raised for an unknown intent and for a model
    that has no benchmark data to estimate from.
    """
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# --------------------------------------------------------------------------- #
|
|
109
|
+
# Data classes
|
|
110
|
+
# --------------------------------------------------------------------------- #
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass(frozen=True)
class ScoredModel:
    """A catalog model with computed scores for a given hardware/intent.

    Instances are produced by score_model(). The four ``*_score`` fields are
    the normalized per-dimension scores and ``composite_score`` is their
    intent-weighted sum.
    """

    # The catalog entry that was scored.
    entry: CatalogEntry
    # Weighted sum of the four per-dimension scores below.
    composite_score: float
    # Log-scaled normalization of gen_tps.
    speed_score: float
    # entry.quality.overall mapped from 0-100 onto [0, 1].
    quality_score: float
    # 1.0 if the entry advertises tool calling, otherwise 0.0.
    tool_calling_score: float
    # Share of the memory budget left unused by the model.
    memory_efficiency_score: float
    # Generation tokens per second resolved for this hardware.
    gen_tps: float
    # Memory usage in GB resolved for this model.
    memory_gb: float
    is_estimated: bool  # True if gen_tps was bandwidth-ratio estimated
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass(frozen=True)
class TierAssignment:
    """A model assigned to a specific tier."""

    # Tier name: one of the TIER_* constants ("standard", "fast", "longctx").
    tier: str
    # The scored model chosen for this tier.
    model: ScoredModel
    # Quantization recorded for the assignment.
    quant: str
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@dataclass(frozen=True)
class RecommendationResult:
    """The full recommendation result with tier assignments and scored models."""

    # Tier assignments as returned by assign_tiers().
    tiers: list[TierAssignment]
    # Scored models that fit the memory budget, best composite score first.
    all_scored: list[ScoredModel]
    # Memory budget in GB that was used for filtering.
    memory_budget_gb: float
    # Intent the recommendation was computed for.
    intent: str
    # Hardware profile the recommendation was computed for.
    hardware_profile: HardwareProfile
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# --------------------------------------------------------------------------- #
|
|
149
|
+
# Memory budget
|
|
150
|
+
# --------------------------------------------------------------------------- #
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def compute_memory_budget(memory_gb: int, budget_pct: int = DEFAULT_MEMORY_BUDGET_PCT) -> float:
    """Convert a percentage of total memory into an absolute budget in GB.

    Args:
        memory_gb: Total unified memory in GB.
        budget_pct: Budget percentage (1-100). The value is not validated here.

    Returns:
        Memory budget in GB.
    """
    budget_fraction = budget_pct / 100.0
    return memory_gb * budget_fraction
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# --------------------------------------------------------------------------- #
|
|
167
|
+
# Benchmark resolution
|
|
168
|
+
# --------------------------------------------------------------------------- #
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _resolve_benchmark(
    entry: CatalogEntry,
    profile: HardwareProfile,
    quant: str = _DEFAULT_QUANT,
    saved_benchmarks: dict[str, Any] | None = None,
) -> tuple[float, float, bool]:
    """Resolve gen_tps and memory_gb for a model on the given hardware.

    Tries in order:
    1. Saved benchmark data (from bench --save)
    2. Direct catalog benchmark match for profile_id
    3. Bandwidth-ratio estimation from a reference profile

    A saved benchmark is ignored (with a warning) when it is malformed: the
    entry is not a mapping, a value cannot be converted to float, or a value
    is NaN/infinite. Missing keys still default to 0.0.

    Args:
        entry: The catalog entry.
        profile: The hardware profile.
        quant: Quantization level. Accepted for interface symmetry with the
            other scoring functions; the lookup does not currently use it.
        saved_benchmarks: Optional dict from bench --save JSON files,
            keyed by model_id with gen_tps, prompt_tps, memory_gb.

    Returns:
        Tuple of (gen_tps, memory_gb, is_estimated).

    Raises:
        ScoringError: If no saved or catalog benchmark applies and the model
            has no benchmark data to estimate from.
    """
    # 1. Check saved benchmarks
    if saved_benchmarks and entry.id in saved_benchmarks:
        saved = saved_benchmarks[entry.id]
        try:
            gen_tps = float(saved.get("gen_tps", 0.0))
            memory_gb = float(saved.get("memory_gb", 0.0))
            # float() happily parses "nan"/"inf"; such values would leak into
            # composite scores and make the ranking order undefined.
            if not (math.isfinite(gen_tps) and math.isfinite(memory_gb)):
                raise ValueError("non-finite saved benchmark value")
        except (AttributeError, OverflowError, TypeError, ValueError):
            # Malformed saved benchmark data — fall through to catalog lookup.
            # AttributeError covers a saved entry that is not a mapping.
            logger.warning(
                "Ignoring malformed saved benchmark for model '%s': "
                "invalid numeric values",
                entry.id,
            )
        else:
            return gen_tps, memory_gb, False

    profile_id = profile.profile_id

    # 2. Direct match in catalog benchmarks
    if profile_id in entry.benchmarks:
        bench = entry.benchmarks[profile_id]
        return bench.gen_tps, bench.memory_gb, False

    # 3. Bandwidth-ratio estimation from a reference profile
    return _estimate_from_bandwidth_ratio(entry, profile)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _estimate_from_bandwidth_ratio(
    entry: CatalogEntry,
    profile: HardwareProfile,
) -> tuple[float, float, bool]:
    """Estimate gen_tps by scaling a reference benchmark by bandwidth.

    The reference is the first catalog benchmark whose profile id appears in
    ``_REFERENCE_PROFILES``; if there is none, the first benchmark of the
    entry is used with an assumed reference bandwidth of 400.0.

    Args:
        entry: The catalog entry with benchmark data for reference profiles.
        profile: The user's hardware profile.

    Returns:
        Tuple of (estimated_gen_tps, memory_gb, True). memory_gb is copied
        from the reference benchmark unchanged (memory usage is treated as
        hardware-independent for the same quant).

    Raises:
        ScoringError: If no reference benchmark data is available for this model.
    """
    benchmarks = entry.benchmarks
    if not benchmarks:
        raise ScoringError(f"Model '{entry.id}' has no benchmark data for estimation")

    # Prefer a benchmark taken on hardware whose bandwidth we know.
    chosen_id = next((pid for pid in benchmarks if pid in _REFERENCE_PROFILES), None)
    chosen_bench = benchmarks[chosen_id] if chosen_id is not None else None

    if chosen_bench is None:
        # Nothing usable among the known profiles: take the first benchmark.
        chosen_id = next(iter(benchmarks))
        chosen_bench = benchmarks[chosen_id]

    known_bandwidth = _REFERENCE_PROFILES.get(chosen_id)
    if known_bandwidth is None:
        # Unknown reference hardware: assume a middle-ground bandwidth.
        known_bandwidth = 400.0

    # Scale throughput linearly with the bandwidth ratio; guard the division.
    if known_bandwidth > 0:
        scale = profile.bandwidth_gbps / known_bandwidth
    else:
        scale = 1.0

    return chosen_bench.gen_tps * scale, chosen_bench.memory_gb, True
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# --------------------------------------------------------------------------- #
|
|
281
|
+
# Scoring
|
|
282
|
+
# --------------------------------------------------------------------------- #
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _normalize_gen_tps_log(gen_tps: float) -> float:
|
|
286
|
+
"""Normalize gen_tps using log scaling.
|
|
287
|
+
|
|
288
|
+
Log scaling prevents very fast models (e.g., 0.8B at 195 tps) from
|
|
289
|
+
having disproportionately high speed scores compared to larger models
|
|
290
|
+
(e.g., 72B at 12 tps).
|
|
291
|
+
|
|
292
|
+
Maps gen_tps to a 0-1 range using log(1 + gen_tps) / log(1 + max_reference).
|
|
293
|
+
The max reference is set to 200 tps (reasonable upper bound for current hardware).
|
|
294
|
+
|
|
295
|
+
Args:
|
|
296
|
+
gen_tps: Generation tokens per second.
|
|
297
|
+
|
|
298
|
+
Returns:
|
|
299
|
+
Normalized score in [0, 1].
|
|
300
|
+
"""
|
|
301
|
+
max_ref = 200.0 # Upper bound reference for normalization
|
|
302
|
+
if gen_tps <= 0:
|
|
303
|
+
return 0.0
|
|
304
|
+
score = math.log(1.0 + gen_tps) / math.log(1.0 + max_ref)
|
|
305
|
+
return min(score, 1.0)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _normalize_quality(quality_overall: int) -> float:
|
|
309
|
+
"""Normalize quality score to [0, 1].
|
|
310
|
+
|
|
311
|
+
Quality scores are on a 0-100 scale.
|
|
312
|
+
|
|
313
|
+
Args:
|
|
314
|
+
quality_overall: Overall quality score (0-100).
|
|
315
|
+
|
|
316
|
+
Returns:
|
|
317
|
+
Normalized score in [0, 1].
|
|
318
|
+
"""
|
|
319
|
+
return min(max(quality_overall / 100.0, 0.0), 1.0)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _normalize_tool_calling(has_tool_calling: bool) -> float:
|
|
323
|
+
"""Normalize tool calling capability to a score.
|
|
324
|
+
|
|
325
|
+
Args:
|
|
326
|
+
has_tool_calling: Whether the model supports tool calling.
|
|
327
|
+
|
|
328
|
+
Returns:
|
|
329
|
+
1.0 if tool calling is supported, 0.0 otherwise.
|
|
330
|
+
"""
|
|
331
|
+
return 1.0 if has_tool_calling else 0.0
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def _normalize_memory_efficiency(memory_gb: float, budget_gb: float) -> float:
|
|
335
|
+
"""Normalize memory efficiency to [0, 1].
|
|
336
|
+
|
|
337
|
+
Models using less of the budget score higher, encouraging efficient use
|
|
338
|
+
of available memory. Uses (budget - memory) / budget ratio.
|
|
339
|
+
|
|
340
|
+
Args:
|
|
341
|
+
memory_gb: Model memory usage in GB.
|
|
342
|
+
budget_gb: Available memory budget in GB.
|
|
343
|
+
|
|
344
|
+
Returns:
|
|
345
|
+
Normalized score in [0, 1]. Higher means more memory-efficient.
|
|
346
|
+
"""
|
|
347
|
+
if budget_gb <= 0 or memory_gb <= 0:
|
|
348
|
+
return 0.0
|
|
349
|
+
efficiency = (budget_gb - memory_gb) / budget_gb
|
|
350
|
+
return min(max(efficiency, 0.0), 1.0)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def score_model(
    entry: CatalogEntry,
    profile: HardwareProfile,
    weights: IntentWeights,
    budget_gb: float,
    quant: str = _DEFAULT_QUANT,
    saved_benchmarks: dict[str, Any] | None = None,
) -> ScoredModel:
    """Compute per-dimension and composite scores for one catalog model.

    Args:
        entry: The catalog entry to score.
        profile: The hardware profile.
        weights: The intent weight configuration.
        budget_gb: Memory budget in GB.
        quant: Quantization level.
        saved_benchmarks: Optional saved benchmark data.

    Returns:
        A ScoredModel carrying the four normalized scores, their weighted
        sum, and the resolved gen_tps / memory_gb figures.

    Raises:
        ScoringError: If benchmark data cannot be resolved.
    """
    gen_tps, memory_gb, is_estimated = _resolve_benchmark(
        entry,
        profile,
        quant=quant,
        saved_benchmarks=saved_benchmarks,
    )

    # Normalize every dimension onto [0, 1] before weighting.
    speed = _normalize_gen_tps_log(gen_tps)
    quality = _normalize_quality(entry.quality.overall)
    tool_calling = _normalize_tool_calling(entry.capabilities.tool_calling)
    memory_efficiency = _normalize_memory_efficiency(memory_gb, budget_gb)

    # Keep the left-to-right addition order so the float result is stable.
    weighted_total = (
        weights.speed * speed
        + weights.quality * quality
        + weights.tool_calling * tool_calling
        + weights.memory_efficiency * memory_efficiency
    )

    return ScoredModel(
        entry=entry,
        composite_score=weighted_total,
        speed_score=speed,
        quality_score=quality,
        tool_calling_score=tool_calling,
        memory_efficiency_score=memory_efficiency,
        gen_tps=gen_tps,
        memory_gb=memory_gb,
        is_estimated=is_estimated,
    )
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def score_and_filter(
    catalog: list[CatalogEntry],
    profile: HardwareProfile,
    intent: str,
    budget_gb: float,
    quant: str = _DEFAULT_QUANT,
    saved_benchmarks: dict[str, Any] | None = None,
) -> list[ScoredModel]:
    """Score every catalog model and keep the ones that fit the budget.

    Models that cannot be scored (ScoringError from score_model) are skipped,
    as are models whose memory_gb exceeds the budget. The survivors are
    ordered by composite_score descending, ties broken by model id.

    Args:
        catalog: List of catalog entries to score.
        profile: The hardware profile.
        intent: The recommendation intent (balanced or agent-fleet).
        budget_gb: Memory budget in GB.
        quant: Quantization level.
        saved_benchmarks: Optional saved benchmark data.

    Returns:
        List of ScoredModel instances within budget, sorted by score descending.

    Raises:
        ScoringError: If the intent is invalid.
    """
    if intent not in VALID_INTENTS:
        known_intents = ", ".join(sorted(VALID_INTENTS))
        raise ScoringError(f"Invalid intent '{intent}'. Valid intents: {known_intents}")

    intent_weights = INTENT_WEIGHTS[intent]
    within_budget: list[ScoredModel] = []

    for candidate in catalog:
        try:
            result = score_model(
                candidate, profile, intent_weights, budget_gb, quant, saved_benchmarks
            )
        except ScoringError:
            # No benchmark data at all for this model: leave it out.
            continue
        else:
            if result.memory_gb <= budget_gb:
                within_budget.append(result)

    # Negated score gives descending order; the id makes the order deterministic.
    return sorted(within_budget, key=lambda m: (-m.composite_score, m.entry.id))
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
# --------------------------------------------------------------------------- #
|
|
459
|
+
# Tier assignment
|
|
460
|
+
# --------------------------------------------------------------------------- #
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def assign_tiers(
    scored_models: list[ScoredModel],
    memory_budget_gb: float,
) -> list[TierAssignment]:
    """Pick one model each for the standard, fast and longctx tiers.

    Selection rules:
    - standard: the model with the highest composite score. Because the
      composite score is already intent-weighted, the pick varies by intent.
    - fast: the highest-gen_tps model among those not yet assigned.
    - longctx: the best-scoring unassigned model whose architecture is in
      ``_LONGCTX_ARCHITECTURES``; only considered when the budget is at
      least 16 GB, so smaller systems end up with 1-2 tiers.

    Ties are always broken by model id. Every assignment records
    ``_DEFAULT_QUANT`` as its quantization.

    Args:
        scored_models: Pre-filtered, scored models within budget.
            Must already be scored with intent-specific weights.
        memory_budget_gb: The memory budget in GB (used for tier count decisions).

    Returns:
        List of TierAssignment instances, ordered: standard, fast, longctx.
    """
    if not scored_models:
        return []

    def by_composite(m: ScoredModel) -> tuple[float, str]:
        return (-m.composite_score, m.entry.id)

    def by_throughput(m: ScoredModel) -> tuple[float, str]:
        return (-m.gen_tps, m.entry.id)

    picked: list[TierAssignment] = []
    taken_ids: set[str] = set()

    def claim(tier_name: str, chosen: ScoredModel) -> None:
        # Record the assignment and make the model unavailable to later tiers.
        picked.append(TierAssignment(tier=tier_name, model=chosen, quant=_DEFAULT_QUANT))
        taken_ids.add(chosen.entry.id)

    # Standard tier: best intent-weighted composite score overall.
    claim(TIER_STANDARD, sorted(scored_models, key=by_composite)[0])

    # Fast tier: quickest generator among the remaining models.
    speed_pool = [m for m in scored_models if m.entry.id not in taken_ids]
    speed_ranked = sorted(speed_pool, key=by_throughput)
    if speed_ranked:
        claim(TIER_FAST, speed_ranked[0])

    # Longctx tier: architecturally diverse model, larger budgets only.
    if memory_budget_gb >= 16.0:
        diverse_pool = [
            m
            for m in scored_models
            if m.entry.id not in taken_ids and m.entry.architecture in _LONGCTX_ARCHITECTURES
        ]
        diverse_ranked = sorted(diverse_pool, key=by_composite)
        if diverse_ranked:
            claim(TIER_LONGCTX, diverse_ranked[0])

    return picked
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
# --------------------------------------------------------------------------- #
|
|
549
|
+
# Main recommendation entry point
|
|
550
|
+
# --------------------------------------------------------------------------- #
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def recommend(
    catalog: list[CatalogEntry],
    profile: HardwareProfile,
    intent: str = "balanced",
    budget_pct: int = DEFAULT_MEMORY_BUDGET_PCT,
    budget_gb_override: float | None = None,
    quant: str = _DEFAULT_QUANT,
    saved_benchmarks: dict[str, Any] | None = None,
) -> RecommendationResult:
    """Produce tier recommendations for a hardware profile and intent.

    Main entry point of the scoring engine. The steps are:
    1. Validate the intent
    2. Determine the memory budget (explicit GB override, else percentage)
    3. Score the catalog and drop models that exceed the budget
    4. Assign the remaining models to tiers

    Args:
        catalog: Full catalog of model entries.
        profile: The hardware profile.
        intent: Recommendation intent (balanced or agent-fleet).
        budget_pct: Memory budget percentage (1-100). Ignored if
            budget_gb_override is set.
        budget_gb_override: Explicit memory budget in GB, overriding
            percentage-based calculation.
        quant: Default quantization level.
        saved_benchmarks: Optional saved benchmark data from bench --save.

    Returns:
        A RecommendationResult with tier assignments and all scored models.

    Raises:
        ScoringError: If the intent is invalid or scoring fails.
    """
    # Fail fast on a bad intent before doing any budget arithmetic.
    if intent not in VALID_INTENTS:
        known_intents = ", ".join(sorted(VALID_INTENTS))
        raise ScoringError(f"Invalid intent '{intent}'. Valid intents: {known_intents}")

    # An explicit GB figure wins over the percentage-based budget.
    budget_gb = (
        compute_memory_budget(profile.memory_gb, budget_pct)
        if budget_gb_override is None
        else budget_gb_override
    )

    ranked = score_and_filter(catalog, profile, intent, budget_gb, quant, saved_benchmarks)

    return RecommendationResult(
        tiers=assign_tiers(ranked, budget_gb),
        all_scored=ranked,
        memory_budget_gb=budget_gb,
        intent=intent,
        hardware_profile=profile,
    )
|