mlx-stack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. mlx_stack/__init__.py +5 -0
  2. mlx_stack/_version.py +24 -0
  3. mlx_stack/cli/__init__.py +5 -0
  4. mlx_stack/cli/bench.py +221 -0
  5. mlx_stack/cli/config.py +166 -0
  6. mlx_stack/cli/down.py +109 -0
  7. mlx_stack/cli/init.py +180 -0
  8. mlx_stack/cli/install.py +165 -0
  9. mlx_stack/cli/logs.py +234 -0
  10. mlx_stack/cli/main.py +187 -0
  11. mlx_stack/cli/models.py +304 -0
  12. mlx_stack/cli/profile.py +65 -0
  13. mlx_stack/cli/pull.py +134 -0
  14. mlx_stack/cli/recommend.py +397 -0
  15. mlx_stack/cli/status.py +111 -0
  16. mlx_stack/cli/up.py +163 -0
  17. mlx_stack/cli/watch.py +252 -0
  18. mlx_stack/core/__init__.py +1 -0
  19. mlx_stack/core/benchmark.py +1182 -0
  20. mlx_stack/core/catalog.py +560 -0
  21. mlx_stack/core/config.py +471 -0
  22. mlx_stack/core/deps.py +323 -0
  23. mlx_stack/core/hardware.py +304 -0
  24. mlx_stack/core/launchd.py +531 -0
  25. mlx_stack/core/litellm_gen.py +188 -0
  26. mlx_stack/core/log_rotation.py +231 -0
  27. mlx_stack/core/log_viewer.py +386 -0
  28. mlx_stack/core/models.py +639 -0
  29. mlx_stack/core/paths.py +79 -0
  30. mlx_stack/core/process.py +887 -0
  31. mlx_stack/core/pull.py +815 -0
  32. mlx_stack/core/scoring.py +611 -0
  33. mlx_stack/core/stack_down.py +317 -0
  34. mlx_stack/core/stack_init.py +524 -0
  35. mlx_stack/core/stack_status.py +229 -0
  36. mlx_stack/core/stack_up.py +856 -0
  37. mlx_stack/core/watchdog.py +744 -0
  38. mlx_stack/data/__init__.py +1 -0
  39. mlx_stack/data/catalog/__init__.py +1 -0
  40. mlx_stack/data/catalog/deepseek-r1-32b.yaml +46 -0
  41. mlx_stack/data/catalog/deepseek-r1-8b.yaml +45 -0
  42. mlx_stack/data/catalog/gemma3-12b.yaml +45 -0
  43. mlx_stack/data/catalog/gemma3-27b.yaml +45 -0
  44. mlx_stack/data/catalog/gemma3-4b.yaml +45 -0
  45. mlx_stack/data/catalog/llama3.3-8b.yaml +44 -0
  46. mlx_stack/data/catalog/nemotron-49b.yaml +41 -0
  47. mlx_stack/data/catalog/nemotron-8b.yaml +44 -0
  48. mlx_stack/data/catalog/qwen3-8b.yaml +45 -0
  49. mlx_stack/data/catalog/qwen3.5-0.8b.yaml +45 -0
  50. mlx_stack/data/catalog/qwen3.5-14b.yaml +46 -0
  51. mlx_stack/data/catalog/qwen3.5-32b.yaml +45 -0
  52. mlx_stack/data/catalog/qwen3.5-3b.yaml +44 -0
  53. mlx_stack/data/catalog/qwen3.5-72b.yaml +42 -0
  54. mlx_stack/data/catalog/qwen3.5-8b.yaml +45 -0
  55. mlx_stack/py.typed +1 -0
  56. mlx_stack/utils/__init__.py +1 -0
  57. mlx_stack-0.1.0.dist-info/METADATA +397 -0
  58. mlx_stack-0.1.0.dist-info/RECORD +61 -0
  59. mlx_stack-0.1.0.dist-info/WHEEL +4 -0
  60. mlx_stack-0.1.0.dist-info/entry_points.txt +2 -0
  61. mlx_stack-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,611 @@
1
+ """Recommendation scoring engine for mlx-stack.
2
+
3
+ Scores and ranks catalog models for a given hardware profile and intent.
4
+ Supports 2 intents (balanced, agent-fleet) with weighted composite scoring
5
+ across speed, quality, tool_calling, and memory_efficiency dimensions.
6
+
7
+ Uses log-scaled gen_tps normalization to prevent extreme speed models from
8
+ dominating composite scores. Filters models by memory budget (default 40%
9
+ of unified memory). Assigns models to tiers (fast, standard, longctx).
10
+
11
+ Handles bandwidth-ratio estimation for unknown hardware profiles where
12
+ catalog benchmark data is not available.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import math
19
+ from dataclasses import dataclass
20
+ from typing import Any
21
+
22
+ from mlx_stack.core.catalog import BenchmarkResult, CatalogEntry
23
+ from mlx_stack.core.hardware import HardwareProfile
24
+
25
# Logger registered under the package name "mlx_stack".
logger = logging.getLogger("mlx_stack")

# --------------------------------------------------------------------------- #
# Constants
# --------------------------------------------------------------------------- #

# Percentage of unified memory granted to models when no budget is given.
DEFAULT_MEMORY_BUDGET_PCT = 40

# Intents accepted by the scoring entry points.
VALID_INTENTS: set[str] = {"agent-fleet", "balanced"}

# Tier names, in the order: fast, standard, longctx.
TIER_FAST, TIER_STANDARD, TIER_LONGCTX = "fast", "standard", "longctx"

# Architectures that qualify a model for the longctx tier.
_LONGCTX_ARCHITECTURES: set[str] = {"mamba2-hybrid"}

# Quantization assumed when none is specified (used for memory lookup).
_DEFAULT_QUANT = "int4"

# Bandwidths of the reference hardware profiles that appear in catalog
# benchmarks. They drive bandwidth-ratio estimation when a model has no
# benchmark for the user's hardware. Values are presumably GB/s, matching
# HardwareProfile.bandwidth_gbps — TODO confirm.
_REFERENCE_PROFILES: dict[str, float] = {
    "m4-pro-48": 273.0,
    "m4-max-128": 546.0,
    "m5-max-128": 546.0,
}
56
+
57
+
58
+ # --------------------------------------------------------------------------- #
59
+ # Intent weight configurations
60
+ # --------------------------------------------------------------------------- #
61
+
62
+
63
@dataclass(frozen=True)
class IntentWeights:
    """Relative importance of each scoring dimension for one intent.

    The four weights must add up to 1.0 (within an absolute tolerance of
    0.001) so that composite scores stay normalized; construction raises
    ValueError otherwise.
    """

    speed: float
    quality: float
    tool_calling: float
    memory_efficiency: float

    def __post_init__(self) -> None:
        """Validate that the weights sum to approximately 1.0.

        Raises:
            ValueError: If the sum of the four weights is not close to 1.0.
        """
        # Plain left-to-right addition: the sum is echoed in the error message.
        weight_sum = self.speed + self.quality + self.tool_calling + self.memory_efficiency
        if math.isclose(weight_sum, 1.0, abs_tol=0.001):
            return
        raise ValueError(f"Intent weights must sum to 1.0, got {weight_sum}")
80
+
81
+
82
# Weight configuration per intent. Each row lists, in order:
# speed, quality, tool_calling, memory_efficiency (a row must sum to 1.0).
INTENT_WEIGHTS: dict[str, IntentWeights] = {
    intent: IntentWeights(
        speed=speed,
        quality=quality,
        tool_calling=tool_calling,
        memory_efficiency=memory_efficiency,
    )
    for intent, (speed, quality, tool_calling, memory_efficiency) in (
        ("balanced", (0.25, 0.40, 0.15, 0.20)),
        ("agent-fleet", (0.30, 0.20, 0.35, 0.15)),
    )
}
97
+
98
+
99
+ # --------------------------------------------------------------------------- #
100
+ # Exceptions
101
+ # --------------------------------------------------------------------------- #
102
+
103
+
104
class ScoringError(Exception):
    """Signal that a scoring request is invalid or a model cannot be scored.

    Within this module it is raised for an unknown intent and for a model
    that has no benchmark data to estimate from.
    """
106
+
107
+
108
+ # --------------------------------------------------------------------------- #
109
+ # Data classes
110
+ # --------------------------------------------------------------------------- #
111
+
112
+
113
@dataclass(frozen=True)
class ScoredModel:
    """Immutable scoring record for one catalog model on one hardware/intent pair."""

    # Catalog entry the scores were computed for.
    entry: CatalogEntry
    # Weighted sum of the four per-dimension scores below.
    composite_score: float
    # Per-dimension scores.
    speed_score: float
    quality_score: float
    tool_calling_score: float
    memory_efficiency_score: float
    # Generation throughput (tokens per second) that speed_score is based on.
    gen_tps: float
    # Model memory usage in GB; compared against the memory budget.
    memory_gb: float
    # True when gen_tps came from bandwidth-ratio estimation.
    is_estimated: bool
126
+
127
+
128
+ @dataclass(frozen=True)
129
+ class TierAssignment:
130
+ """A model assigned to a specific tier."""
131
+
132
+ tier: str
133
+ model: ScoredModel
134
+ quant: str
135
+
136
+
137
+ @dataclass(frozen=True)
138
+ class RecommendationResult:
139
+ """The full recommendation result with tier assignments and scored models."""
140
+
141
+ tiers: list[TierAssignment]
142
+ all_scored: list[ScoredModel]
143
+ memory_budget_gb: float
144
+ intent: str
145
+ hardware_profile: HardwareProfile
146
+
147
+
148
+ # --------------------------------------------------------------------------- #
149
+ # Memory budget
150
+ # --------------------------------------------------------------------------- #
151
+
152
+
153
def compute_memory_budget(memory_gb: int, budget_pct: int = DEFAULT_MEMORY_BUDGET_PCT) -> float:
    """Convert a percentage of total unified memory into a budget in GB.

    Args:
        memory_gb: Total unified memory in GB.
        budget_pct: Share of memory to allocate, as a percentage (1-100).
            The value is not range-checked here.

    Returns:
        The memory budget in GB.
    """
    # Turn the percentage into a fraction first, then scale the total.
    budget_fraction = budget_pct / 100.0
    return memory_gb * budget_fraction
164
+
165
+
166
+ # --------------------------------------------------------------------------- #
167
+ # Benchmark resolution
168
+ # --------------------------------------------------------------------------- #
169
+
170
+
171
def _resolve_benchmark(
    entry: CatalogEntry,
    profile: HardwareProfile,
    quant: str = _DEFAULT_QUANT,
    saved_benchmarks: dict[str, Any] | None = None,
) -> tuple[float, float, bool]:
    """Resolve gen_tps and memory_gb for a model on the given hardware.

    Sources are tried in order:
        1. Saved benchmark data (from bench --save)
        2. Direct catalog benchmark match for the profile's profile_id
        3. Bandwidth-ratio estimation from a reference profile

    A saved benchmark is treated as malformed — logged as a warning and
    skipped in favour of the catalog — when it is not a mapping, when its
    values cannot be converted to float, or when they are not finite
    (NaN/Infinity would otherwise propagate into composite scores and make
    the ranking order undefined).

    Args:
        entry: The catalog entry.
        profile: The hardware profile.
        quant: Quantization level. Accepted for interface compatibility; the
            lookup below does not currently use it.
        saved_benchmarks: Optional dict from bench --save JSON files,
            keyed by model_id with gen_tps, prompt_tps, memory_gb.

    Returns:
        Tuple of (gen_tps, memory_gb, is_estimated).

    Raises:
        ScoringError: Propagated from the estimation step when the model has
            no benchmark data at all.
    """
    # 1. Check saved benchmarks
    if saved_benchmarks and entry.id in saved_benchmarks:
        saved = saved_benchmarks[entry.id]
        try:
            gen_tps = float(saved.get("gen_tps", 0.0))
            memory_gb = float(saved.get("memory_gb", 0.0))
        except (AttributeError, ValueError, TypeError, OverflowError):
            # Not a mapping, or values that are not usable numbers.
            usable = False
        else:
            usable = math.isfinite(gen_tps) and math.isfinite(memory_gb)

        if usable:
            return gen_tps, memory_gb, False

        # Malformed saved benchmark data — fall through to catalog lookup
        logger.warning(
            "Ignoring malformed saved benchmark for model '%s': "
            "invalid numeric values",
            entry.id,
        )

    profile_id = profile.profile_id

    # 2. Direct match in catalog benchmarks
    if profile_id in entry.benchmarks:
        bench = entry.benchmarks[profile_id]
        return bench.gen_tps, bench.memory_gb, False

    # 3. Bandwidth-ratio estimation from a reference profile
    return _estimate_from_bandwidth_ratio(entry, profile)
220
+
221
+
222
def _estimate_from_bandwidth_ratio(
    entry: CatalogEntry,
    profile: HardwareProfile,
) -> tuple[float, float, bool]:
    """Estimate gen_tps by scaling a reference benchmark by bandwidth ratio.

    A reference benchmark is chosen from the entry's catalog benchmarks and
    its gen_tps is multiplied by (profile bandwidth / reference bandwidth).

    Args:
        entry: The catalog entry with benchmark data for reference profiles.
        profile: The user's hardware profile.

    Returns:
        Tuple of (estimated_gen_tps, memory_gb, True). memory_gb is copied
        unchanged from the reference benchmark (memory usage is treated as
        hardware-independent for the same quant).

    Raises:
        ScoringError: If the entry has no benchmark data at all.
    """
    if not entry.benchmarks:
        raise ScoringError(f"Model '{entry.id}' has no benchmark data for estimation")

    # Prefer the first benchmark whose profile has a bandwidth in our table.
    known = next(
        (
            (pid, bench)
            for pid, bench in entry.benchmarks.items()
            if pid in _REFERENCE_PROFILES
        ),
        None,
    )
    if known is not None and known[1] is not None:
        reference_id, reference = known
    else:
        # Otherwise fall back to whichever benchmark is listed first.
        reference_id = next(iter(entry.benchmarks))
        reference = entry.benchmarks[reference_id]

    reference_bandwidth = _REFERENCE_PROFILES.get(reference_id)
    if reference_bandwidth is None:
        # Reference profile missing from the table: assume a middle-ground
        # bandwidth (conservative middle estimate).
        reference_bandwidth = 400.0

    device_bandwidth = profile.bandwidth_gbps
    if reference_bandwidth > 0:
        scale = device_bandwidth / reference_bandwidth
    else:
        scale = 1.0

    # Only throughput is scaled; memory usage is the same on any hardware.
    return reference.gen_tps * scale, reference.memory_gb, True
278
+
279
+
280
+ # --------------------------------------------------------------------------- #
281
+ # Scoring
282
+ # --------------------------------------------------------------------------- #
283
+
284
+
285
def _normalize_gen_tps_log(gen_tps: float) -> float:
    """Map generation throughput onto [0, 1] using a logarithmic scale.

    The score is log(1 + gen_tps) / log(1 + 200), capped at 1.0. Log scaling
    keeps very fast small models (e.g., 0.8B at 195 tps) from receiving
    disproportionately high speed scores relative to large ones (e.g., 72B
    at 12 tps). 200 tps is the reference upper bound.

    Args:
        gen_tps: Generation tokens per second.

    Returns:
        Normalized score in [0, 1]; 0.0 for non-positive throughput.
    """
    if gen_tps <= 0:
        return 0.0

    reference_ceiling = 200.0
    scaled = math.log(1.0 + gen_tps) / math.log(1.0 + reference_ceiling)
    # Cap throughput above the reference ceiling at a perfect score.
    return 1.0 if scaled > 1.0 else scaled
306
+
307
+
308
def _normalize_quality(quality_overall: int) -> float:
    """Scale a 0-100 quality score down to [0, 1], clamping out-of-range input.

    Args:
        quality_overall: Overall quality score (0-100).

    Returns:
        Normalized score in [0, 1].
    """
    fraction = quality_overall / 100.0
    if fraction < 0.0:
        return 0.0
    if fraction > 1.0:
        return 1.0
    return fraction
320
+
321
+
322
def _normalize_tool_calling(has_tool_calling: bool) -> float:
    """Turn the tool-calling capability flag into a numeric score.

    Args:
        has_tool_calling: Whether the model supports tool calling.

    Returns:
        1.0 if tool calling is supported, 0.0 otherwise.
    """
    return float(bool(has_tool_calling))
332
+
333
+
334
def _normalize_memory_efficiency(memory_gb: float, budget_gb: float) -> float:
    """Score how much of the memory budget a model leaves unused.

    The score is (budget - memory) / budget clamped to [0, 1], so models
    that consume less of the budget score higher.

    Args:
        memory_gb: Model memory usage in GB.
        budget_gb: Available memory budget in GB.

    Returns:
        Normalized score in [0, 1]; higher means more memory-efficient.
        Returns 0.0 when either argument is zero or negative.
    """
    if budget_gb <= 0:
        return 0.0
    if memory_gb <= 0:
        return 0.0

    headroom = (budget_gb - memory_gb) / budget_gb
    if headroom < 0.0:
        return 0.0
    if headroom > 1.0:
        return 1.0
    return headroom
351
+
352
+
353
def score_model(
    entry: CatalogEntry,
    profile: HardwareProfile,
    weights: IntentWeights,
    budget_gb: float,
    quant: str = _DEFAULT_QUANT,
    saved_benchmarks: dict[str, Any] | None = None,
) -> ScoredModel:
    """Compute per-dimension and composite scores for one catalog model.

    Args:
        entry: The catalog entry to score.
        profile: The hardware profile.
        weights: The intent weight configuration.
        budget_gb: Memory budget in GB.
        quant: Quantization level.
        saved_benchmarks: Optional saved benchmark data.

    Returns:
        A ScoredModel holding the four dimension scores, their weighted
        composite, and the resolved gen_tps / memory_gb figures.

    Raises:
        ScoringError: If benchmark data cannot be resolved.
    """
    resolved = _resolve_benchmark(entry, profile, quant, saved_benchmarks)
    gen_tps, memory_gb, is_estimated = resolved

    speed = _normalize_gen_tps_log(gen_tps)
    quality = _normalize_quality(entry.quality.overall)
    tool_calling = _normalize_tool_calling(entry.capabilities.tool_calling)
    memory_efficiency = _normalize_memory_efficiency(memory_gb, budget_gb)

    # Accumulate the weighted dimensions in a fixed order (speed, quality,
    # tool calling, memory efficiency).
    composite = weights.speed * speed
    composite += weights.quality * quality
    composite += weights.tool_calling * tool_calling
    composite += weights.memory_efficiency * memory_efficiency

    return ScoredModel(
        entry=entry,
        composite_score=composite,
        speed_score=speed,
        quality_score=quality,
        tool_calling_score=tool_calling,
        memory_efficiency_score=memory_efficiency,
        gen_tps=gen_tps,
        memory_gb=memory_gb,
        is_estimated=is_estimated,
    )
404
+
405
+
406
def score_and_filter(
    catalog: list[CatalogEntry],
    profile: HardwareProfile,
    intent: str,
    budget_gb: float,
    quant: str = _DEFAULT_QUANT,
    saved_benchmarks: dict[str, Any] | None = None,
) -> list[ScoredModel]:
    """Score all catalog models and filter by memory budget.

    Models whose memory_gb exceeds the budget are excluded. Models that
    cannot be scored (no benchmark data at all) are skipped; each skip is
    recorded at debug level so a missing model can be diagnosed.
    Results are sorted by composite_score descending, ties broken by model id.

    Args:
        catalog: List of catalog entries to score.
        profile: The hardware profile.
        intent: The recommendation intent (balanced or agent-fleet).
        budget_gb: Memory budget in GB.
        quant: Quantization level.
        saved_benchmarks: Optional saved benchmark data.

    Returns:
        List of ScoredModel instances within budget, sorted by score descending.

    Raises:
        ScoringError: If the intent is invalid.
    """
    if intent not in VALID_INTENTS:
        valid = ", ".join(sorted(VALID_INTENTS))
        msg = f"Invalid intent '{intent}'. Valid intents: {valid}"
        raise ScoringError(msg)

    weights = INTENT_WEIGHTS[intent]
    scored: list[ScoredModel] = []

    for entry in catalog:
        try:
            model = score_model(entry, profile, weights, budget_gb, quant, saved_benchmarks)
        except ScoringError as exc:
            # Deliberate best-effort: an unscorable model is dropped rather
            # than failing the whole run, but leave a trace of why.
            logger.debug("Skipping model '%s' during scoring: %s", entry.id, exc)
            continue

        # Filter by memory budget
        if model.memory_gb <= budget_gb:
            scored.append(model)

    # Sort by composite score descending (deterministic: break ties by model id)
    scored.sort(key=lambda m: (-m.composite_score, m.entry.id))

    return scored
456
+
457
+
458
+ # --------------------------------------------------------------------------- #
459
+ # Tier assignment
460
+ # --------------------------------------------------------------------------- #
461
+
462
+
463
def assign_tiers(
    scored_models: list[ScoredModel],
    memory_budget_gb: float,
) -> list[TierAssignment]:
    """Pick one model each for the standard, fast and longctx tiers.

    Selection rules:
        - standard: the model with the highest composite score. Because the
          composite already reflects the intent weights, the winner varies
          by intent (agent-fleet favours tool_calling, balanced favours
          quality).
        - fast: the highest-gen_tps model not already used for standard.
        - longctx: the best-scoring unused model whose architecture is in
          the long-context set (e.g., mamba2-hybrid), considered only when
          the budget is at least 16 GB.

    Small-memory systems (budget < 16 GB) therefore get 1-2 tiers and larger
    systems get up to 3. Ties are broken by model id for determinism.

    Args:
        scored_models: Pre-filtered, scored models within budget.
            Must already be scored with intent-specific weights.
        memory_budget_gb: The memory budget in GB (used for tier count decisions).

    Returns:
        List of TierAssignment instances, ordered: standard, fast, longctx.
    """
    if not scored_models:
        return []

    chosen: list[TierAssignment] = []
    taken_ids: set[str] = set()

    def by_composite(m: ScoredModel) -> tuple[float, str]:
        return (-m.composite_score, m.entry.id)

    def by_throughput(m: ScoredModel) -> tuple[float, str]:
        return (-m.gen_tps, m.entry.id)

    def claim(tier: str, pool: list[ScoredModel], ranking: Any) -> None:
        """Assign the top-ranked model of *pool* to *tier*, if the pool is non-empty."""
        ranked = sorted(pool, key=ranking)
        if not ranked:
            return
        winner = ranked[0]
        chosen.append(TierAssignment(tier=tier, model=winner, quant=_DEFAULT_QUANT))
        taken_ids.add(winner.entry.id)

    # Standard: best composite score overall.
    claim(TIER_STANDARD, scored_models, by_composite)

    # Fast: quickest model that is not the standard pick.
    claim(
        TIER_FAST,
        [m for m in scored_models if m.entry.id not in taken_ids],
        by_throughput,
    )

    # Longctx: architecturally diverse model, larger budgets only.
    if memory_budget_gb >= 16.0:
        claim(
            TIER_LONGCTX,
            [
                m
                for m in scored_models
                if m.entry.id not in taken_ids
                and m.entry.architecture in _LONGCTX_ARCHITECTURES
            ],
            by_composite,
        )

    return chosen
546
+
547
+
548
+ # --------------------------------------------------------------------------- #
549
+ # Main recommendation entry point
550
+ # --------------------------------------------------------------------------- #
551
+
552
+
553
def recommend(
    catalog: list[CatalogEntry],
    profile: HardwareProfile,
    intent: str = "balanced",
    budget_pct: int = DEFAULT_MEMORY_BUDGET_PCT,
    budget_gb_override: float | None = None,
    quant: str = _DEFAULT_QUANT,
    saved_benchmarks: dict[str, Any] | None = None,
) -> RecommendationResult:
    """Produce tiered model recommendations for a hardware profile and intent.

    Main entry point of the scoring engine. Steps:
        1. Determine the memory budget (explicit override, else percentage).
        2. Score the catalog and keep the models that fit the budget.
        3. Assign the surviving models to tiers.

    Args:
        catalog: Full catalog of model entries.
        profile: The hardware profile.
        intent: Recommendation intent (balanced or agent-fleet).
        budget_pct: Memory budget percentage (1-100). Ignored if
            budget_gb_override is set.
        budget_gb_override: Explicit memory budget in GB, overriding
            percentage-based calculation.
        quant: Default quantization level.
        saved_benchmarks: Optional saved benchmark data from bench --save.

    Returns:
        A RecommendationResult with tier assignments and all scored models.

    Raises:
        ScoringError: If the intent is invalid or scoring fails.
    """
    if intent not in VALID_INTENTS:
        allowed = ", ".join(sorted(VALID_INTENTS))
        raise ScoringError(f"Invalid intent '{intent}'. Valid intents: {allowed}")

    # An explicit GB override wins over the percentage-based budget.
    budget_gb = (
        compute_memory_budget(profile.memory_gb, budget_pct)
        if budget_gb_override is None
        else budget_gb_override
    )

    ranked = score_and_filter(catalog, profile, intent, budget_gb, quant, saved_benchmarks)
    tier_assignments = assign_tiers(ranked, budget_gb)

    return RecommendationResult(
        tiers=tier_assignments,
        all_scored=ranked,
        memory_budget_gb=budget_gb,
        intent=intent,
        hardware_profile=profile,
    )