mlx-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx_stack/__init__.py +5 -0
- mlx_stack/_version.py +24 -0
- mlx_stack/cli/__init__.py +5 -0
- mlx_stack/cli/bench.py +221 -0
- mlx_stack/cli/config.py +166 -0
- mlx_stack/cli/down.py +109 -0
- mlx_stack/cli/init.py +180 -0
- mlx_stack/cli/install.py +165 -0
- mlx_stack/cli/logs.py +234 -0
- mlx_stack/cli/main.py +187 -0
- mlx_stack/cli/models.py +304 -0
- mlx_stack/cli/profile.py +65 -0
- mlx_stack/cli/pull.py +134 -0
- mlx_stack/cli/recommend.py +397 -0
- mlx_stack/cli/status.py +111 -0
- mlx_stack/cli/up.py +163 -0
- mlx_stack/cli/watch.py +252 -0
- mlx_stack/core/__init__.py +1 -0
- mlx_stack/core/benchmark.py +1182 -0
- mlx_stack/core/catalog.py +560 -0
- mlx_stack/core/config.py +471 -0
- mlx_stack/core/deps.py +323 -0
- mlx_stack/core/hardware.py +304 -0
- mlx_stack/core/launchd.py +531 -0
- mlx_stack/core/litellm_gen.py +188 -0
- mlx_stack/core/log_rotation.py +231 -0
- mlx_stack/core/log_viewer.py +386 -0
- mlx_stack/core/models.py +639 -0
- mlx_stack/core/paths.py +79 -0
- mlx_stack/core/process.py +887 -0
- mlx_stack/core/pull.py +815 -0
- mlx_stack/core/scoring.py +611 -0
- mlx_stack/core/stack_down.py +317 -0
- mlx_stack/core/stack_init.py +524 -0
- mlx_stack/core/stack_status.py +229 -0
- mlx_stack/core/stack_up.py +856 -0
- mlx_stack/core/watchdog.py +744 -0
- mlx_stack/data/__init__.py +1 -0
- mlx_stack/data/catalog/__init__.py +1 -0
- mlx_stack/data/catalog/deepseek-r1-32b.yaml +46 -0
- mlx_stack/data/catalog/deepseek-r1-8b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-12b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-27b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-4b.yaml +45 -0
- mlx_stack/data/catalog/llama3.3-8b.yaml +44 -0
- mlx_stack/data/catalog/nemotron-49b.yaml +41 -0
- mlx_stack/data/catalog/nemotron-8b.yaml +44 -0
- mlx_stack/data/catalog/qwen3-8b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-0.8b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-14b.yaml +46 -0
- mlx_stack/data/catalog/qwen3.5-32b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-3b.yaml +44 -0
- mlx_stack/data/catalog/qwen3.5-72b.yaml +42 -0
- mlx_stack/data/catalog/qwen3.5-8b.yaml +45 -0
- mlx_stack/py.typed +1 -0
- mlx_stack/utils/__init__.py +1 -0
- mlx_stack-0.1.0.dist-info/METADATA +397 -0
- mlx_stack-0.1.0.dist-info/RECORD +61 -0
- mlx_stack-0.1.0.dist-info/WHEEL +4 -0
- mlx_stack-0.1.0.dist-info/entry_points.txt +2 -0
- mlx_stack-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,524 @@
|
|
|
1
|
+
"""Stack initialization logic for mlx-stack.
|
|
2
|
+
|
|
3
|
+
Generates stack definition YAML and LiteLLM config files from a
|
|
4
|
+
recommendation result. Handles port allocation, vllm_flags generation,
|
|
5
|
+
cloud fallback, missing model detection, and overwrite protection.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import socket
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import yaml
|
|
16
|
+
|
|
17
|
+
from mlx_stack.core.catalog import CatalogEntry, get_entry_by_id, load_catalog
|
|
18
|
+
from mlx_stack.core.config import ConfigCorruptError, get_value
|
|
19
|
+
from mlx_stack.core.hardware import detect_hardware, load_profile, save_profile
|
|
20
|
+
from mlx_stack.core.litellm_gen import generate_litellm_config, render_litellm_yaml
|
|
21
|
+
from mlx_stack.core.paths import ensure_data_home, get_data_home, get_stacks_dir
|
|
22
|
+
from mlx_stack.core.scoring import (
|
|
23
|
+
VALID_INTENTS,
|
|
24
|
+
RecommendationResult,
|
|
25
|
+
ScoringError,
|
|
26
|
+
TierAssignment,
|
|
27
|
+
)
|
|
28
|
+
from mlx_stack.core.scoring import recommend as run_recommend
|
|
29
|
+
|
|
30
|
+
# --------------------------------------------------------------------------- #
|
|
31
|
+
# Constants
|
|
32
|
+
# --------------------------------------------------------------------------- #
|
|
33
|
+
|
|
34
|
+
# First port number tried when assigning ports to vllm-mlx instances.
_VLLM_BASE_PORT: int = 8000

# Value stored under "schema_version" in generated stack definitions.
STACK_SCHEMA_VERSION: int = 1

# Stack name used when the caller does not provide one.
DEFAULT_STACK_NAME: str = "default"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# --------------------------------------------------------------------------- #
|
|
45
|
+
# Exceptions
|
|
46
|
+
# --------------------------------------------------------------------------- #
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class InitError(Exception):
    """Error signalling that a stack could not be initialized.

    Carries a human-readable message describing what went wrong.
    """
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# --------------------------------------------------------------------------- #
|
|
54
|
+
# Port allocation
|
|
55
|
+
# --------------------------------------------------------------------------- #
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _is_port_available(port: int) -> bool:
|
|
59
|
+
"""Check if a TCP port is available for binding.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
port: The port number to check.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
True if the port is available, False otherwise.
|
|
66
|
+
"""
|
|
67
|
+
try:
|
|
68
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
|
69
|
+
sock.settimeout(1)
|
|
70
|
+
sock.bind(("127.0.0.1", port))
|
|
71
|
+
return True
|
|
72
|
+
except OSError:
|
|
73
|
+
return False
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def allocate_ports(
    num_tiers: int,
    litellm_port: int = 4000,
    base_port: int = _VLLM_BASE_PORT,
) -> list[int]:
    """Allocate unique ports for vllm-mlx instances.

    Candidates are tried in increasing order starting at ``base_port``.
    A candidate is skipped when it equals ``litellm_port`` or when it
    cannot be bound (see ``_is_port_available``). Every returned port lies
    in the inclusive window ``base_port .. base_port + 100``.

    Args:
        num_tiers: Number of tiers needing ports. Zero or a negative
            value yields an empty list.
        litellm_port: The LiteLLM proxy port to avoid.
        base_port: Starting port for allocation.

    Returns:
        List of unique port numbers in ascending order, one per tier.

    Raises:
        InitError: If fewer than ``num_tiers`` usable ports exist in the
            window ``base_port .. base_port + 100``.
    """
    max_port = base_port + 100  # Inclusive upper bound of the search window
    ports: list[int] = []
    candidate = base_port

    while len(ports) < num_tiers:
        # Check the bound for every candidate, not only while skipping busy
        # ports; otherwise a free port past the window would be accepted.
        if candidate > max_port:
            msg = (
                f"Could not allocate {num_tiers} free ports starting "
                f"from {base_port}. All ports in range "
                f"{base_port}–{max_port} are in use or reserved."
            )
            raise InitError(msg)
        if candidate != litellm_port and _is_port_available(candidate):
            ports.append(candidate)
        candidate += 1

    return ports
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# --------------------------------------------------------------------------- #
|
|
121
|
+
# vllm flags generation
|
|
122
|
+
# --------------------------------------------------------------------------- #
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def build_vllm_flags(entry: CatalogEntry) -> dict[str, Any]:
    """Derive the vllm_flags mapping for a model from its capabilities.

    Always present:
        - continuous_batching: True
        - use_paged_cache: True

    Added when ``entry.capabilities.tool_calling`` is truthy:
        - enable_auto_tool_choice: True
        - tool_call_parser: the catalog's parser, only if one is set

    Added when both ``thinking`` and ``reasoning_parser`` are truthy:
        - reasoning_parser: the catalog's parser

    Args:
        entry: The catalog entry for the model.

    Returns:
        A dict of vllm flags, with keys in the order listed above.
    """
    caps = entry.capabilities

    tool_flags: dict[str, Any] = {}
    if caps.tool_calling:
        tool_flags["enable_auto_tool_choice"] = True
        if caps.tool_call_parser:
            tool_flags["tool_call_parser"] = caps.tool_call_parser

    reasoning_flags: dict[str, Any] = {}
    if caps.thinking and caps.reasoning_parser:
        reasoning_flags["reasoning_parser"] = caps.reasoning_parser

    return {
        "continuous_batching": True,
        "use_paged_cache": True,
        **tool_flags,
        **reasoning_flags,
    }
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# --------------------------------------------------------------------------- #
|
|
162
|
+
# Stack definition generation
|
|
163
|
+
# --------------------------------------------------------------------------- #
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _build_tier_entry(
    assignment: TierAssignment,
    port: int,
    catalog: list[CatalogEntry],
) -> dict[str, Any]:
    """Turn one tier assignment into its stack-definition mapping.

    Args:
        assignment: The tier assignment from the scoring engine.
        port: The allocated port for this tier.
        catalog: The full catalog. Not read by this function.

    Returns:
        A dict with the keys ``name``, ``model``, ``quant``, ``source``,
        ``port`` and ``vllm_flags`` describing the tier.
    """
    model_entry = assignment.model.entry
    quant_name = assignment.quant

    # The source is the HF repo registered for this quant, or an empty
    # string when the entry has no source under that quant name.
    hf_source = (
        model_entry.sources[quant_name].hf_repo
        if quant_name in model_entry.sources
        else ""
    )

    tier_entry: dict[str, Any] = {
        "name": assignment.tier,
        "model": model_entry.id,
        "quant": quant_name,
        "source": hf_source,
        "port": port,
        "vllm_flags": build_vllm_flags(model_entry),
    }
    return tier_entry
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def generate_stack_definition(
    recommendation: RecommendationResult,
    ports: list[int],
    catalog: list[CatalogEntry],
    stack_name: str = DEFAULT_STACK_NAME,
    cloud_fallback: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Assemble the mapping that is serialized as the stack definition.

    Args:
        recommendation: The recommendation result with tier assignments.
        ports: Allocated ports, paired with the tiers by position.
        catalog: The full catalog, passed through to the tier builder.
        stack_name: Name of the stack (default: 'default').
        cloud_fallback: Optional cloud fallback configuration; included
            under ``cloud_fallback`` only when truthy.

    Returns:
        A dict representing the full stack definition.

    Raises:
        InitError: If the number of ports differs from the number of tiers.
    """
    port_count = len(ports)
    tier_count = len(recommendation.tiers)
    if port_count != tier_count:
        raise InitError(
            f"Port count ({port_count}) doesn't match tier count ({tier_count})"
        )

    tier_entries: list[dict[str, Any]] = [
        _build_tier_entry(tier_assignment, tier_port, catalog)
        for tier_assignment, tier_port in zip(recommendation.tiers, ports)
    ]

    definition: dict[str, Any] = {
        "schema_version": STACK_SCHEMA_VERSION,
        "name": stack_name,
        "hardware_profile": recommendation.hardware_profile.profile_id,
        "intent": recommendation.intent,
        # Timezone-aware UTC timestamp in ISO 8601 form.
        "created": datetime.now(timezone.utc).isoformat(),
        "tiers": tier_entries,
    }

    if cloud_fallback:
        definition["cloud_fallback"] = cloud_fallback

    return definition
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# --------------------------------------------------------------------------- #
|
|
245
|
+
# Missing model detection
|
|
246
|
+
# --------------------------------------------------------------------------- #
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def detect_missing_models(
    tiers: list[dict[str, Any]],
    models_dir: Path | None = None,
) -> list[str]:
    """Detect models referenced in the stack that are not locally available.

    A model counts as present when ``models_dir`` contains an entry named
    after the tier's ``model`` ID, or one named after the last path
    component of the tier's ``source`` (the HF repo name). This is a simple
    existence heuristic; the contents of the directory are not inspected.

    Args:
        tiers: List of tier entries from the stack definition. Each must
            have a ``model`` key; ``source`` may be absent, empty or None.
        models_dir: The models directory to check. If None, the configured
            ``model-dir`` is used, falling back to ``<data home>/models``
            when the config value cannot be read.

    Returns:
        List of model IDs that are not found locally, in tier order.
    """
    if models_dir is None:
        try:
            models_dir = Path(str(get_value("model-dir"))).expanduser()
        except (ConfigCorruptError, Exception):
            # Best effort: any failure reading the configured directory
            # falls back to the default location under the data home.
            models_dir = get_data_home() / "models"

    missing: list[str] = []
    for tier in tiers:
        model_id = tier["model"]

        # ``source`` can be present but None (e.g. an empty YAML value);
        # treat that the same as a missing or empty source.
        source = tier.get("source") or ""
        # Last path component of "org/repo"; a source without "/" is used
        # as-is, and a trailing "/" yields an empty name (no candidate).
        repo_dir_name = source.rsplit("/", 1)[-1]

        candidates = [models_dir / model_id]
        if repo_dir_name:
            candidates.append(models_dir / repo_dir_name)

        if not any(path.exists() for path in candidates):
            missing.append(model_id)

    return missing
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# --------------------------------------------------------------------------- #
|
|
287
|
+
# Main init entry point
|
|
288
|
+
# --------------------------------------------------------------------------- #
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def run_init(
    intent: str = "balanced",
    budget_pct: int | None = None,
    add_models: list[str] | None = None,
    remove_tiers: list[str] | None = None,
    force: bool = False,
    stack_name: str = DEFAULT_STACK_NAME,
) -> dict[str, Any]:
    """Run the whole init flow and write the resulting config files.

    Steps, in order: validate the intent, load (or detect and save) the
    hardware profile, read config values, refuse to overwrite an existing
    stack unless ``force`` is set, load the catalog, run the
    recommendation, apply tier removals and model additions, allocate
    ports, build the stack and LiteLLM configs, write both files, and
    check which models are missing locally.

    Args:
        intent: Recommendation intent; must be one of ``VALID_INTENTS``.
        budget_pct: Memory budget percentage override. When None the
            ``memory-budget-pct`` config value is used, or 40 if that
            cannot be read as an integer.
        add_models: Additional model IDs to add as ``added-<id>`` tiers.
            IDs already present in the stack are skipped silently.
        remove_tiers: Tier names to remove from the recommendation.
        force: Whether to overwrite an existing stack file.
        stack_name: Name for the stack definition; also the file stem.

    Returns:
        A dict with keys:
            - stack_path: Path to the generated stack YAML
            - litellm_path: Path to the generated LiteLLM config
            - stack: The stack definition dict
            - litellm_config: The LiteLLM config dict
            - missing_models: List of models not found locally
            - warnings: Messages for added models that exceed the budget
            - profile: The hardware profile that was used
            - memory_budget_gb: The recommendation's memory budget
            - total_memory_gb: Summed memory of the selected tiers

    Raises:
        InitError: If the intent is invalid, hardware detection fails, the
            stack already exists without ``force``, the catalog cannot be
            loaded, the recommendation fails or is empty, a tier to remove
            or a model to add is unknown, an added model cannot be scored,
            no tiers remain, or ports cannot be allocated.
    """
    # --- Validate intent ---
    if intent not in VALID_INTENTS:
        known_intents = ", ".join(sorted(VALID_INTENTS))
        raise InitError(f"Invalid intent '{intent}'. Valid intents: {known_intents}")

    # --- Resolve hardware profile ---
    profile = load_profile()
    if profile is None:
        # No saved profile yet: detect the hardware and persist the result.
        try:
            profile = detect_hardware()
            save_profile(profile)
        except Exception as exc:
            raise InitError(f"Hardware detection failed: {exc}") from None

    # --- Read config values (each falls back to a built-in default) ---
    try:
        litellm_port = int(get_value("litellm-port"))
    except (ConfigCorruptError, ValueError):
        litellm_port = 4000

    if budget_pct is None:
        try:
            budget_pct = int(get_value("memory-budget-pct"))
        except (ConfigCorruptError, ValueError):
            budget_pct = 40

    try:
        openrouter_key = str(get_value("openrouter-key"))
    except (ConfigCorruptError, Exception):
        openrouter_key = ""

    # --- Check for existing stack ---
    stacks_dir = get_stacks_dir()
    stack_path = stacks_dir / f"{stack_name}.yaml"
    litellm_path = get_data_home() / "litellm.yaml"

    if stack_path.exists() and not force:
        raise InitError(
            f"Stack '{stack_name}' already exists at {stack_path}. "
            f"Use --force to overwrite."
        )

    # --- Load catalog ---
    try:
        catalog = load_catalog()
    except Exception as exc:
        raise InitError(f"Could not load model catalog: {exc}") from None

    # --- Run recommendation ---
    try:
        recommendation = run_recommend(
            catalog=catalog,
            profile=profile,
            intent=intent,
            budget_pct=budget_pct,
        )
    except ScoringError as exc:
        raise InitError(f"Recommendation failed: {exc}") from None

    budget_gb = recommendation.memory_budget_gb
    if not recommendation.tiers:
        raise InitError(
            f"No models fit within the {budget_gb:.1f} GB budget. "
            f"Try increasing memory-budget-pct in config."
        )

    # --- Apply --add/--remove customizations ---
    selected = list(recommendation.tiers)

    if remove_tiers:
        # Validate every requested name before dropping anything.
        present_names = {assignment.tier for assignment in selected}
        for requested in remove_tiers:
            if requested not in present_names:
                known_tiers = ", ".join(sorted(present_names))
                raise InitError(
                    f"Cannot remove tier '{requested}': not in the current stack. "
                    f"Valid tiers: {known_tiers}"
                )
        to_drop = set(remove_tiers)
        selected = [a for a in selected if a.tier not in to_drop]

    budget_warnings: list[str] = []

    if add_models:
        # Imported here rather than at module level, as in the original code.
        from mlx_stack.core.scoring import INTENT_WEIGHTS, TierAssignment, score_model

        for model_id in add_models:
            entry = get_entry_by_id(catalog, model_id)
            if entry is None:
                raise InitError(
                    f"Unknown model '{model_id}'. "
                    f"Run 'mlx-stack models --catalog' to see available models."
                )

            # A model that is already part of the stack is skipped silently.
            if any(a.model.entry.id == model_id for a in selected):
                continue

            weights = INTENT_WEIGHTS.get(intent, INTENT_WEIGHTS["balanced"])
            try:
                scored = score_model(entry, profile, weights, budget_gb)
            except ScoringError as exc:
                raise InitError(f"Cannot add model '{model_id}': {exc}") from None

            # Going over budget only produces a warning; it does not block.
            projected_gb = sum(a.model.memory_gb for a in selected) + scored.memory_gb
            if projected_gb > budget_gb:
                budget_warnings.append(
                    f"Adding '{model_id}' exceeds memory budget "
                    f"({projected_gb:.1f} GB > {budget_gb:.1f} GB)."
                )

            # Added models get a tier named 'added-<model_id>' and the
            # fixed quant "int4".
            selected.append(
                TierAssignment(
                    tier=f"added-{model_id}",
                    model=scored,
                    quant="int4",
                )
            )

    if not selected:
        raise InitError("No tiers remaining after customization. Cannot generate stack.")

    # --- Allocate ports ---
    ports = allocate_ports(len(selected), litellm_port=litellm_port)

    # --- Generate stack definition ---
    # A cloud fallback section is only emitted when an OpenRouter key is set.
    cloud_fallback: dict[str, Any] | None = (
        {
            "provider": "openrouter",
            "models": ["openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"],
        }
        if openrouter_key
        else None
    )

    stack = generate_stack_definition(
        recommendation=_with_tiers(recommendation, selected),
        ports=ports,
        catalog=catalog,
        stack_name=stack_name,
        cloud_fallback=cloud_fallback,
    )

    # --- Generate LiteLLM config ---
    # LiteLLM only needs the name, model and port of each tier.
    litellm_tiers = [
        {field: tier[field] for field in ("name", "model", "port")}
        for tier in stack["tiers"]
    ]
    litellm_config = generate_litellm_config(
        tiers=litellm_tiers,
        litellm_port=litellm_port,
        openrouter_key=openrouter_key,
    )

    # --- Write files ---
    ensure_data_home()
    stacks_dir.mkdir(parents=True, exist_ok=True)

    stack_path.write_text(
        yaml.dump(stack, default_flow_style=False, sort_keys=False),
        encoding="utf-8",
    )
    litellm_path.write_text(render_litellm_yaml(litellm_config), encoding="utf-8")

    # --- Detect missing models ---
    missing_models = detect_missing_models(stack["tiers"])

    # --- Compute total estimated memory for selected tiers ---
    total_memory_gb = sum(a.model.memory_gb for a in selected)

    return {
        "stack_path": stack_path,
        "litellm_path": litellm_path,
        "stack": stack,
        "litellm_config": litellm_config,
        "missing_models": missing_models,
        "warnings": budget_warnings,
        "profile": profile,
        "memory_budget_gb": recommendation.memory_budget_gb,
        "total_memory_gb": total_memory_gb,
    }
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def _with_tiers(result: RecommendationResult, tiers: list[TierAssignment]) -> RecommendationResult:
    """Return a copy of ``result`` whose tiers are replaced by ``tiers``.

    RecommendationResult is a frozen dataclass, so a new instance is built
    instead of assigning to the existing one. All other fields named here
    are carried over unchanged.
    """
    carried_over = {
        "all_scored": result.all_scored,
        "memory_budget_gb": result.memory_budget_gb,
        "intent": result.intent,
        "hardware_profile": result.hardware_profile,
    }
    return RecommendationResult(tiers=tiers, **carried_over)
|