split-stack 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split_stack/__init__.py +106 -0
- split_stack/__main__.py +4 -0
- split_stack/advice.py +12 -0
- split_stack/benchmark.py +97 -0
- split_stack/cli.py +690 -0
- split_stack/community_picks.py +247 -0
- split_stack/compare.py +194 -0
- split_stack/complexity.py +77 -0
- split_stack/discovery.py +288 -0
- split_stack/hints.py +102 -0
- split_stack/local_models.py +63 -0
- split_stack/model_guide.py +273 -0
- split_stack/model_registry.py +314 -0
- split_stack/models.py +77 -0
- split_stack/ollama_errors.py +30 -0
- split_stack/ollama_generate.py +135 -0
- split_stack/poc_models.py +131 -0
- split_stack/presets.py +75 -0
- split_stack/quantization.py +137 -0
- split_stack/requirements.py +287 -0
- split_stack/routing.py +96 -0
- split_stack/session.py +259 -0
- split_stack/setup_wizard.py +259 -0
- split_stack/startup_tips.py +169 -0
- split_stack/tiering.py +66 -0
- split_stack/validation.py +85 -0
- split_stack-0.2.0.dist-info/METADATA +364 -0
- split_stack-0.2.0.dist-info/RECORD +32 -0
- split_stack-0.2.0.dist-info/WHEEL +5 -0
- split_stack-0.2.0.dist-info/entry_points.txt +2 -0
- split_stack-0.2.0.dist-info/licenses/LICENSE +21 -0
- split_stack-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
"""Model guide: map agent hints and installed models to routing tiers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from split_stack.community_picks import (
|
|
8
|
+
community_index_for_model,
|
|
9
|
+
community_note_for_model,
|
|
10
|
+
recommended_models_for_tier,
|
|
11
|
+
vram_tier_for_profile,
|
|
12
|
+
)
|
|
13
|
+
from split_stack.hints import list_hints
|
|
14
|
+
from split_stack.model_registry import load_registry, resolve_discovered_models
|
|
15
|
+
from split_stack.routing import route_prompt
|
|
16
|
+
from split_stack.tiering import assign_tiers, describe_tiers
|
|
17
|
+
|
|
18
|
+
# Example prompts per hint — same spirit as compare POC steps.
|
|
19
|
+
# One representative prompt per hint id, keyed by the hint's identifier.
HINT_EXAMPLES: dict[str, str] = dict(
    lookup="what is JWT in one sentence?",
    explain="compare session cookies vs JWT for a small SaaS API",
    design="design a webhook retry strategy with idempotency keys",
    code="refactor this auth module for unit tests",
    reason="prove this token expiry policy step by step",
)
|
|
26
|
+
|
|
27
|
+
# Human-readable label for each routing tier key.
TIER_LABELS: dict[str, str] = dict(
    simple="Simple — fast, cheap steps",
    medium="Medium — summarise and compare",
    complex="Complex — design and heavy generation",
    reasoning="Reasoning — proofs and step-by-step logic",
    code="Code — implement, refactor, debug",
)
|
|
34
|
+
|
|
35
|
+
# Default "best for" blurb per model family, used when nothing more specific applies.
FAMILY_BEST_FOR: dict[str, str] = dict(
    gemma="Lookup and quick answers; smallest Gemma tag in your stack",
    qwen="General spine — explain on 8B, design on 14B+",
    llama="Lightweight lookup on 1B/3B; mid tiers on 8B+",
    phi="Reasoning and careful step-by-step (especially phi4-reasoning)",
    deepseek="Reasoning (R1) or code (coder) specialists",
    mistral="Solid medium-tier general work",
    starcoder="Code-only slot when present",
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True)
class HintRoute:
    """Immutable record of where one agent hint is routed in the active stack."""

    hint_id: str  # identifier of the hint
    label: str  # display label of the hint
    summary: str  # short description of the hint
    tier: str  # value of the tier the example prompt was routed to
    model: str  # model chosen for that tier
    example_prompt: str  # prompt that was used to compute the route
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass(frozen=True)
class ModelCard:
    """Immutable description of one model shown in the model guide."""

    name: str  # model tag
    family: str | None  # model family, when known
    weight: int  # relative size weight used for ordering
    vram_gb: int | None  # estimated VRAM need, when known
    tier_slots: tuple[str, ...]  # tier keys this model is assigned to
    hints: tuple[str, ...]  # hint ids routed to this model
    best_for: str  # human-readable "best for" blurb
    in_stack: bool  # True when the model is part of the active stack
    vram_ok: bool  # True when the model passes the VRAM check
    community_note: str | None = None  # community note for the model, if any
    community_hints: tuple[str, ...] = ()  # hints the community recommends it for
    installed: bool = True  # False for recommended-but-missing models
    duplicate_locations: tuple[str, ...] = ()  # locations, set when found in more than one
    status: str = "installed"  # "installed", "duplicate" or "recommended"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(frozen=True)
class ModelGuide:
    """Immutable bundle of everything the model guide reports for a stack."""

    stack: tuple[str, ...]  # models in the active stack
    tiers: dict[str, str | None]  # tier key -> assigned model (or None)
    tier_labels: dict[str, str]  # tier key -> display label
    hint_routes: tuple[HintRoute, ...]  # one route per known hint
    models: tuple[ModelCard, ...]  # cards for installed and recommended models
    vram_tier: str | None = None  # VRAM tier derived from the deployment profile
    audit: dict[str, object] | None = None  # model-folder audit result, if any
    missing_recommended: tuple[str, ...] = ()  # recommended models that are not installed
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _tier_slots_for_model(name: str, tiers: dict[str, str | None]) -> list[str]:
|
|
87
|
+
slots: list[str] = []
|
|
88
|
+
for key, value in tiers.items():
|
|
89
|
+
if value == name:
|
|
90
|
+
slots.append(key)
|
|
91
|
+
return slots
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _hints_for_model(name: str, hint_routes: tuple[HintRoute, ...]) -> list[str]:
|
|
95
|
+
return [item.hint_id for item in hint_routes if item.model == name]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _best_for_text(
|
|
99
|
+
*,
|
|
100
|
+
name: str,
|
|
101
|
+
family: str | None,
|
|
102
|
+
tier_slots: list[str],
|
|
103
|
+
in_stack: bool,
|
|
104
|
+
community_note: str | None,
|
|
105
|
+
community_hints: tuple[str, ...],
|
|
106
|
+
installed: bool,
|
|
107
|
+
) -> str:
|
|
108
|
+
parts: list[str] = []
|
|
109
|
+
if community_note:
|
|
110
|
+
parts.append(community_note)
|
|
111
|
+
elif community_hints:
|
|
112
|
+
parts.append(f"Community pick for: {', '.join(community_hints)}")
|
|
113
|
+
if not installed:
|
|
114
|
+
return parts[0] if parts else "Recommended by community — not installed yet"
|
|
115
|
+
|
|
116
|
+
lowered = name.lower()
|
|
117
|
+
if "reasoning" in lowered or "deepseek-r1" in lowered:
|
|
118
|
+
parts.append("Reasoning steps — proofs and step-by-step logic")
|
|
119
|
+
elif any(token in lowered for token in ("coder", "codellama", "starcoder")):
|
|
120
|
+
parts.append("Code steps — refactor, debug, implement")
|
|
121
|
+
elif family and family in FAMILY_BEST_FOR and not parts:
|
|
122
|
+
parts.append(FAMILY_BEST_FOR[family])
|
|
123
|
+
|
|
124
|
+
if not in_stack:
|
|
125
|
+
suffix = "Installed but not in your active stack"
|
|
126
|
+
return f"{parts[0]} — {suffix}" if parts else suffix
|
|
127
|
+
if "simple" in tier_slots:
|
|
128
|
+
parts.append("Routed for lookup hints")
|
|
129
|
+
elif "medium" in tier_slots and "complex" not in tier_slots:
|
|
130
|
+
parts.append("Routed for explain hints")
|
|
131
|
+
elif "complex" in tier_slots:
|
|
132
|
+
parts.append("Routed for design/code hints")
|
|
133
|
+
elif "reasoning" in tier_slots:
|
|
134
|
+
parts.append("Routed for reason hints")
|
|
135
|
+
return " · ".join(dict.fromkeys(p for p in parts if p))
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def build_model_guide(
    stack: list[str],
    *,
    pool: list[str] | None = None,
    config_path: str | None = None,
    profile: str = "workstation_12gb",
) -> ModelGuide:
    """Build hint routes for the active stack and cards for all models in pool.

    Args:
        stack: Models in the active stack; must not be empty.
        pool: All models to describe. Defaults to ``stack`` when ``None``.
        config_path: Optional path forwarded to the registry and community
            lookups.
        profile: Deployment profile name used to derive the VRAM tier.

    Returns:
        A ``ModelGuide`` with tier assignments, one route per hint, a card
        per pool model, and a card per recommended model that is missing.

    Raises:
        ValueError: If ``stack`` is empty.
    """
    if not stack:
        raise ValueError("stack must contain at least one model")

    registry = load_registry(config_path)
    vram_tier = vram_tier_for_profile(profile, config_path=config_path)
    # Function-scope import, kept exactly where the original performs it.
    from split_stack.discovery import audit_model_folders, model_locations_by_tag

    locations = model_locations_by_tag()
    audit = audit_model_folders()
    recommended = recommended_models_for_tier(vram_tier=vram_tier, config_path=config_path)
    tiers_map = assign_tiers(stack, registry=registry)
    tiers = describe_tiers(tiers_map)

    # Route one example prompt per hint through the active stack.
    routes: list[HintRoute] = []
    for hint in list_hints():
        hint_id = hint["id"]
        example = HINT_EXAMPLES.get(hint_id, hint["summary"])
        tier, model = route_prompt(example, tiers_map, hint=hint_id)
        routes.append(
            HintRoute(
                hint_id=hint_id,
                label=hint["label"],
                summary=hint["summary"],
                tier=tier.value,
                model=model,
                example_prompt=example,
            )
        )
    hint_routes_tuple = tuple(routes)

    catalog = stack if pool is None else pool
    resolved = resolve_discovered_models(sorted(set(catalog)), registry=registry)
    active = set(stack)

    # Models outside the stack are tiered against the whole catalog instead.
    if len(catalog) >= 2:
        full_tiers = describe_tiers(assign_tiers(list(catalog), registry=registry))
    else:
        full_tiers = tiers

    community_kwargs = {"vram_tier": vram_tier, "config_path": config_path}

    cards: list[ModelCard] = []
    for entry in resolved:
        in_stack = entry.name in active
        tier_slots = _tier_slots_for_model(entry.name, tiers if in_stack else full_tiers)
        route_hints = _hints_for_model(entry.name, hint_routes_tuple) if in_stack else []
        comm_hints = community_index_for_model(entry.name, **community_kwargs)
        comm_note = community_note_for_model(entry.name, **community_kwargs)
        locs = locations.get(entry.name, ())
        duplicated = len(locs) > 1
        blurb = _best_for_text(
            name=entry.name,
            family=entry.family,
            tier_slots=tier_slots,
            in_stack=in_stack,
            community_note=comm_note,
            community_hints=comm_hints,
            installed=True,
        )
        cards.append(
            ModelCard(
                name=entry.name,
                family=entry.family,
                weight=entry.weight,
                vram_gb=entry.vram_gb,
                tier_slots=tuple(tier_slots),
                hints=tuple(route_hints),
                best_for=blurb,
                in_stack=in_stack,
                vram_ok=entry.vram_ok,
                community_note=comm_note,
                community_hints=comm_hints,
                installed=True,
                duplicate_locations=tuple(locs) if duplicated else (),
                status="duplicate" if duplicated else "installed",
            )
        )

    # Every card so far is an installed model.
    installed_lower = {card.name.lower() for card in cards}
    missing: list[str] = []
    for model_name, note in recommended.items():
        wanted = model_name.lower()
        # A substring test also covers exact and prefix matches.
        if any(wanted in have for have in installed_lower):
            continue
        missing.append(model_name)
        comm_hints = community_index_for_model(model_name, **community_kwargs)
        family = model_name.split(":")[0]
        blurb = _best_for_text(
            name=model_name,
            family=family,
            tier_slots=[],
            in_stack=False,
            community_note=note,
            community_hints=comm_hints,
            installed=False,
        )
        cards.append(
            ModelCard(
                name=model_name,
                family=family,
                weight=0,
                vram_gb=None,
                tier_slots=(),
                hints=(),
                best_for=blurb,
                in_stack=False,
                vram_ok=True,
                community_note=note,
                community_hints=comm_hints,
                installed=False,
                status="recommended",
            )
        )

    def _card_order(card: ModelCard) -> tuple[bool, bool, bool, int, str]:
        # installed first, then duplicates, then recommended; within a group
        # in-stack models first, then ascending weight, then name.
        return (
            card.status != "installed",
            card.status == "recommended",
            not card.in_stack,
            card.weight,
            card.name,
        )

    cards.sort(key=_card_order)

    return ModelGuide(
        stack=tuple(stack),
        tiers=tiers,
        tier_labels=dict(TIER_LABELS),
        hint_routes=hint_routes_tuple,
        models=tuple(cards),
        vram_tier=vram_tier,
        audit=audit,
        missing_recommended=tuple(missing),
    )
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# VRAM budget, in GB, matching the default deployment profile.
DEFAULT_ASSUMED_VRAM_GB = 12
# Name of the profile used when the caller does not specify one.
DEFAULT_DEPLOYMENT_PROFILE = "workstation_12gb"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class DeploymentProfileSpec:
    """Immutable description of one deployment profile."""

    name: str  # canonical profile name
    assumed_vram_gb: int | None  # VRAM budget in GB; None when no budget applies
    apply_vram_filter: bool  # whether models are checked against the VRAM budget
    description: str  # one-line human-readable description
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Built-in deployment profiles, keyed by each spec's own name.
# Positional fields: name, assumed_vram_gb, apply_vram_filter, description.
DEPLOYMENT_PROFILES: dict[str, DeploymentProfileSpec] = {
    spec.name: spec
    for spec in (
        DeploymentProfileSpec("workstation_8gb", 8, True, "8 GB GPU workstation preset"),
        DeploymentProfileSpec("workstation_12gb", 12, True, "12 GB GPU workstation preset (default)"),
        DeploymentProfileSpec("workstation_16gb", 16, True, "16 GB GPU workstation preset"),
        DeploymentProfileSpec("workstation_24gb", 24, True, "24 GB GPU workstation preset (4090, 3090 class)"),
        DeploymentProfileSpec(
            "workstation_32gb",
            32,
            True,
            "32 GB GPU workstation preset (5090 class; top single-GPU tier)",
        ),
        DeploymentProfileSpec(
            "datacenter",
            None,
            False,
            "Private inference fleet; custom model catalog, no VRAM filter",
        ),
    )
}
|
|
59
|
+
|
|
60
|
+
# Short aliases accepted in place of a full profile name.
_PROFILE_ALIASES: dict[str, str] = {
    "workstation": DEFAULT_DEPLOYMENT_PROFILE,
    # "8gb" -> "workstation_8gb", and so on for each workstation size.
    **{f"{size}gb": f"workstation_{size}gb" for size in (8, 12, 16, 24, 32)},
}
|
|
68
|
+
|
|
69
|
+
_BUILTIN_RAW: list[dict[str, object]] = [
|
|
70
|
+
{"match": "gemma4:e4b", "weight": 4000, "vram_gb": 4, "family": "gemma"},
|
|
71
|
+
{"match": "gemma4:12b", "weight": 12000, "vram_gb": 10, "family": "gemma"},
|
|
72
|
+
{"match": "gemma4:26b-a4b", "weight": 26000, "vram_gb": 20, "family": "gemma"},
|
|
73
|
+
{"match": "gemma4:26b", "weight": 26000, "vram_gb": 22, "family": "gemma"},
|
|
74
|
+
{"match": "gemma4:31b", "weight": 31000, "vram_gb": 28, "family": "gemma"},
|
|
75
|
+
{"match": "gemma3:4b", "weight": 4000, "vram_gb": 4, "family": "gemma"},
|
|
76
|
+
{"match": "gemma3:12b", "weight": 12000, "vram_gb": 10, "family": "gemma"},
|
|
77
|
+
{"match": "qwen3:4b", "weight": 4000, "vram_gb": 4, "family": "qwen"},
|
|
78
|
+
{"match": "qwen3:8b", "weight": 8000, "vram_gb": 6, "family": "qwen"},
|
|
79
|
+
{"match": "qwen3:14b", "weight": 14000, "vram_gb": 10, "family": "qwen"},
|
|
80
|
+
{"match": "qwen3:30b", "weight": 30000, "vram_gb": 20, "family": "qwen"},
|
|
81
|
+
{"match": "qwen3:30b-a3b", "weight": 30000, "vram_gb": 20, "family": "qwen"},
|
|
82
|
+
{"match": "llama3.2:1b", "weight": 1000, "vram_gb": 2, "family": "llama"},
|
|
83
|
+
{"match": "llama3.2:3b", "weight": 3000, "vram_gb": 3, "family": "llama"},
|
|
84
|
+
{"match": "llama3.1:8b", "weight": 8000, "vram_gb": 6, "family": "llama"},
|
|
85
|
+
{"match": "llama3.1:70b", "weight": 70000, "vram_gb": 48, "family": "llama"},
|
|
86
|
+
{"match": "mistral:7b", "weight": 7000, "vram_gb": 5, "family": "mistral"},
|
|
87
|
+
{"match": "mistral-nemo", "weight": 12000, "vram_gb": 8, "family": "mistral"},
|
|
88
|
+
{"match": "phi3:mini", "weight": 3800, "vram_gb": 4, "family": "phi"},
|
|
89
|
+
{"match": "phi4", "weight": 14000, "vram_gb": 10, "family": "phi"},
|
|
90
|
+
{"match": "phi4-reasoning", "weight": 14000, "vram_gb": 10, "family": "phi"},
|
|
91
|
+
{"match": "deepseek-coder:6.7b", "weight": 7000, "vram_gb": 6, "family": "deepseek"},
|
|
92
|
+
{"match": "deepseek-r1", "weight": 14000, "vram_gb": 10, "family": "deepseek"},
|
|
93
|
+
{"match": "deepseek-coder", "weight": 7000, "vram_gb": 6, "family": "deepseek"},
|
|
94
|
+
{"match": "codellama", "weight": 7000, "vram_gb": 6, "family": "llama"},
|
|
95
|
+
{"match": "starcoder2", "weight": 7000, "vram_gb": 6, "family": "starcoder"},
|
|
96
|
+
{"match": ":e4b", "weight": 4000, "vram_gb": 4, "family": "gemma"},
|
|
97
|
+
{"match": ":e2b", "weight": 2000, "vram_gb": 3, "family": "gemma"},
|
|
98
|
+
]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass(frozen=True)
class ModelEntry:
    """Immutable catalog entry describing models whose name contains ``match``."""

    match: str  # substring looked for in a model name (compared case-insensitively)
    weight: int  # relative size weight
    vram_gb: int | None = None  # estimated VRAM need in GB, when known
    family: str | None = None  # model family, when known
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass(frozen=True)
class ModelRegistry:
    """Immutable model catalog together with the deployment profile it applies to."""

    profile: str  # canonical deployment profile name
    assumed_vram_gb: int | None  # VRAM budget in GB; None when no budget applies
    apply_vram_filter: bool  # whether models are checked against the budget
    entries: tuple[ModelEntry, ...]  # catalog entries consulted when resolving a model
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass(frozen=True)
class ResolvedModel:
    """Immutable result of resolving a model name against a registry."""

    name: str  # model name exactly as supplied by the caller
    weight: int  # relative size weight
    vram_gb: int | None  # VRAM estimate after the quantization adjustment
    family: str | None  # model family, when known
    vram_ok: bool  # True when the model fits the budget or no filter applies
    source: str  # "registry" for a catalog match, "heuristic" otherwise
    quant_mode: str | None = None  # quantization mode; None for the default mode
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _entries_from_raw(raw: list[dict[str, object]]) -> tuple[ModelEntry, ...]:
|
|
129
|
+
return tuple(
|
|
130
|
+
ModelEntry(
|
|
131
|
+
match=str(item["match"]),
|
|
132
|
+
weight=int(item["weight"]), # type: ignore[arg-type]
|
|
133
|
+
vram_gb=int(item["vram_gb"]) if item.get("vram_gb") is not None else None,
|
|
134
|
+
family=str(item["family"]) if item.get("family") else None,
|
|
135
|
+
)
|
|
136
|
+
for item in raw
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def normalize_deployment_profile(name: str | None) -> str:
    """Map a profile name or alias to its canonical profile name.

    Matching ignores surrounding whitespace and letter case. A missing or
    empty name yields the default profile.

    Raises:
        ValueError: If the name is neither a known profile nor a known alias.
    """
    if not name:
        return DEFAULT_DEPLOYMENT_PROFILE

    key = name.strip().lower()
    if key in DEPLOYMENT_PROFILES:
        return key

    canonical = _PROFILE_ALIASES.get(key)
    if canonical is not None:
        return canonical

    valid = ", ".join(sorted(DEPLOYMENT_PROFILES))
    raise ValueError(f"Unknown deployment profile '{name}'. Valid profiles: {valid}")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def list_deployment_profiles() -> tuple[DeploymentProfileSpec, ...]:
    """Return every built-in deployment profile, ordered by profile name."""
    ordered: list[DeploymentProfileSpec] = []
    for profile_name in sorted(DEPLOYMENT_PROFILES):
        ordered.append(DEPLOYMENT_PROFILES[profile_name])
    return tuple(ordered)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _default_registry() -> ModelRegistry:
    """Build the registry for the default profile using the built-in catalog."""
    default_spec = DEPLOYMENT_PROFILES[DEFAULT_DEPLOYMENT_PROFILE]
    builtin_entries = _entries_from_raw(_BUILTIN_RAW)
    return ModelRegistry(
        profile=default_spec.name,
        assumed_vram_gb=default_spec.assumed_vram_gb,
        apply_vram_filter=default_spec.apply_vram_filter,
        entries=builtin_entries,
    )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _registry_from_payload(
    payload: dict[str, object],
    *,
    profile_override: str | None = None,
) -> ModelRegistry:
    """Build a registry from a parsed configuration payload.

    The profile comes from ``profile_override`` when given, otherwise from the
    payload's ``deployment_profile`` or ``profile`` key, otherwise the default.
    The catalog comes from the payload's ``models`` list, falling back to the
    built-in catalog when that key is missing or empty.

    Raises:
        ValueError: If the selected profile name is unknown.
    """
    declared = payload.get("deployment_profile") or payload.get("profile") or ""
    profile_name = normalize_deployment_profile(profile_override or str(declared) or None)
    spec = DEPLOYMENT_PROFILES[profile_name]

    raw_models = payload.get("models")
    entries = _entries_from_raw(list(raw_models) if raw_models else _BUILTIN_RAW)

    if not spec.apply_vram_filter:
        assumed_vram_gb = None
    elif profile_override:
        # An explicit override also discards the payload's own VRAM figure.
        assumed_vram_gb = spec.assumed_vram_gb
    else:
        configured = payload.get("assumed_vram_gb")
        assumed_vram_gb = spec.assumed_vram_gb if configured is None else int(configured)

    return ModelRegistry(
        profile=profile_name,
        assumed_vram_gb=assumed_vram_gb,
        apply_vram_filter=spec.apply_vram_filter,
        entries=entries,
    )
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def config_search_paths(explicit: str | None = None) -> list[Path]:
    """Return candidate config file locations in lookup priority order.

    Order: the explicit path (if given), the path in the
    ``SPLIT_STACK_MODELS_CONFIG`` environment variable (if set and non-empty),
    ``split-stack.models.json`` in the current directory, then
    ``~/.config/split-stack/models.json``. Existence is not checked here.
    """
    from_argument = [Path(explicit)] if explicit else []
    env_value = os.environ.get("SPLIT_STACK_MODELS_CONFIG")
    from_environment = [Path(env_value)] if env_value else []
    return [
        *from_argument,
        *from_environment,
        Path.cwd() / "split-stack.models.json",
        Path.home() / ".config" / "split-stack" / "models.json",
    ]
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def load_registry(
    config_path: str | None = None,
    *,
    profile: str | None = None,
) -> ModelRegistry:
    """Load the model registry from the first config file found, else built-ins.

    Args:
        config_path: Optional explicit config file, tried before the other
            search locations.
        profile: Optional deployment profile that overrides the one named in
            the config file, or selects a built-in profile when no file exists.

    Raises:
        ValueError: If the selected profile name is unknown.
    """
    config_file = next(
        (candidate for candidate in config_search_paths(config_path) if candidate.is_file()),
        None,
    )
    if config_file is not None:
        # "utf-8-sig" also accepts files that start with a byte-order mark.
        payload = json.loads(config_file.read_text(encoding="utf-8-sig"))
        return _registry_from_payload(payload, profile_override=profile)

    if not profile:
        return _default_registry()

    spec = DEPLOYMENT_PROFILES[normalize_deployment_profile(profile)]
    return ModelRegistry(
        profile=spec.name,
        assumed_vram_gb=spec.assumed_vram_gb,
        apply_vram_filter=spec.apply_vram_filter,
        entries=_default_registry().entries,
    )
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _heuristic_weight(name: str) -> int:
|
|
233
|
+
lowered = name.lower()
|
|
234
|
+
match = re.search(r":(\d+)b", lowered)
|
|
235
|
+
if match:
|
|
236
|
+
return int(match.group(1)) * 1000
|
|
237
|
+
match = re.search(r":e(\d+)b", lowered)
|
|
238
|
+
if match:
|
|
239
|
+
return int(match.group(1)) * 1000
|
|
240
|
+
if "70b" in lowered:
|
|
241
|
+
return 70000
|
|
242
|
+
if "30b" in lowered or "32b" in lowered or "34b" in lowered:
|
|
243
|
+
return 30000
|
|
244
|
+
return 1000
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def infer_model_profile(
    name: str,
    registry: ModelRegistry | None = None,
    *,
    quant_mode: str | None = None,
) -> ResolvedModel:
    """Resolve weight, family and VRAM fit for a model name.

    The registry entry with the longest ``match`` contained in the lowercased
    name wins; on equal length the earlier entry wins. Without any match, the
    weight and family are guessed from the name and the VRAM need is derived
    from the weight (at least 3 GB).

    Args:
        name: Model name to resolve.
        registry: Registry to consult; the default registry when falsy.
        quant_mode: Optional quantization mode used to adjust the VRAM estimate.
    """
    reg = registry or _default_registry()
    lowered = name.lower()

    # max() returns the first maximal element, so earlier entries win ties.
    candidates = [entry for entry in reg.entries if entry.match.lower() in lowered]
    best = max(candidates, key=lambda entry: len(entry.match.lower()), default=None)

    if best is None:
        source = "heuristic"
        weight = _heuristic_weight(name)
        family = _guess_family(name)
        vram_gb = max(3, weight // 1000)
    else:
        source = "registry"
        weight = best.weight
        family = best.family
        vram_gb = best.vram_gb

    # Function-scope import, kept exactly where the original performs it.
    from split_stack.quantization import adjust_vram_for_quant, normalize_quant_mode

    mode = normalize_quant_mode(quant_mode)
    effective_vram = adjust_vram_for_quant(name, vram_gb, mode)

    budget = reg.assumed_vram_gb
    if reg.apply_vram_filter and budget is not None:
        # An unknown VRAM need is treated as fitting.
        vram_ok = effective_vram is None or effective_vram <= budget
    else:
        vram_ok = True

    return ResolvedModel(
        name=name,
        weight=weight,
        vram_gb=effective_vram,
        family=family,
        vram_ok=vram_ok,
        source=source,
        quant_mode=None if mode == "default" else mode,
    )
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _guess_family(name: str) -> str | None:
|
|
292
|
+
lowered = name.lower()
|
|
293
|
+
for family in ("qwen", "gemma", "llama", "mistral", "phi", "deepseek"):
|
|
294
|
+
if family in lowered:
|
|
295
|
+
return family
|
|
296
|
+
return None
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def resolve_discovered_models(
    model_names: list[str],
    *,
    registry: ModelRegistry | None = None,
    only_vram_ok: bool = False,
    quant_mode: str | None = None,
) -> list[ResolvedModel]:
    """Resolve each model name and return the results sorted by ascending weight.

    Args:
        model_names: Names to resolve.
        registry: Registry to consult; the default registry when falsy.
        only_vram_ok: When true, drop models that fail the VRAM check.
        quant_mode: Optional quantization mode forwarded to the resolver.
    """
    reg = registry or _default_registry()
    profiles: list[ResolvedModel] = []
    for model_name in model_names:
        profiles.append(infer_model_profile(model_name, reg, quant_mode=quant_mode))
    if only_vram_ok:
        profiles = [profile for profile in profiles if profile.vram_ok]
    # The sort is stable, so equal weights keep their input order.
    profiles.sort(key=lambda profile: profile.weight)
    return profiles
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def model_weight(name: str, registry: ModelRegistry | None = None) -> int:
    """Return only the relative weight that resolving *name* produces."""
    resolved = infer_model_profile(name, registry)
    return resolved.weight
|
split_stack/models.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ComplexityTier(str, Enum):
    """Routing tiers; each member is also a plain ``str`` equal to its value."""

    SIMPLE = "simple"
    MEDIUM = "medium"
    COMPLEX = "complex"
    REASONING = "reasoning"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class StepKind(str, Enum):
    """Kinds of agent step; each member is also a plain ``str`` equal to its value."""

    LOOKUP = "lookup"
    EXPLAIN = "explain"
    DESIGN = "design"
    CODE = "code"
    REASON = "reason"
    # Deprecated aliases (still parse)
    WORK = "work"
    BUILD = "build"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class TierMap:
    """Immutable assignment of one model to each routing tier."""

    simple: str  # model for the simple tier
    medium: str  # model for the medium tier
    complex: str  # model for the complex tier
    reasoning: str  # model for the reasoning tier
    code: str | None = None  # optional dedicated code model

    def for_tier(self, tier: ComplexityTier) -> str:
        """Return the model assigned to *tier*.

        Raises:
            KeyError: If *tier* is not one of the four complexity tiers.
        """
        tier_order = (
            ComplexityTier.SIMPLE,
            ComplexityTier.MEDIUM,
            ComplexityTier.COMPLEX,
            ComplexityTier.REASONING,
        )
        assigned = (self.simple, self.medium, self.complex, self.reasoning)
        return dict(zip(tier_order, assigned))[tier]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True)
class RouteDecision:
    """Full routing outcome for logging, CLI explain, and agent-loop telemetry."""

    tier: ComplexityTier  # tier the prompt was routed to
    model: str  # model selected for the prompt
    hint: str | None  # hint supplied with the prompt, if any
    step_kind: str | None  # step kind associated with the decision, if any
    tier_source: str  # label for how the tier was chosen
    model_source: str  # label for how the model was chosen
    reasons: tuple[str, ...]  # explanations recorded for the decision
    tiers: dict[str, str | None]  # tier key -> model mapping in effect

    def to_dict(self) -> dict[str, object]:
        """Return the decision as a plain dict.

        ``tier`` is reduced to its value and ``reasons`` is copied into a
        list; ``tiers`` is the same dict object, not a copy.
        """
        payload: dict[str, object] = {"tier": self.tier.value}
        for field_name in ("model", "hint", "step_kind", "tier_source", "model_source"):
            payload[field_name] = getattr(self, field_name)
        payload["reasons"] = list(self.reasons)
        payload["tiers"] = self.tiers
        return payload

    def as_tuple(self) -> tuple[ComplexityTier, str]:
        """Return just the ``(tier, model)`` pair."""
        return (self.tier, self.model)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass(frozen=True)
class StackAdvice:
    """Immutable advice record for a stack.

    NOTE(review): the producer and consumers of this record are not visible
    here, so the field comments are inferred from the names only — confirm.
    """

    cursor_model: str  # presumably the model recommended for Cursor — confirm
    prose_path: str  # presumably the suggested route for prose work — confirm
    local_path: str  # presumably the suggested route for local models — confirm
    warn_cursor_override: bool  # presumably flags a Cursor override warning — confirm
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Actionable error messages for Ollama HTTP failures."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def format_ollama_error(
    exc: BaseException,
    *,
    model: str,
    base_url: str = "http://127.0.0.1:11434",
) -> str:
    """Turn requests/Ollama failures into short fix hints.

    Args:
        exc: The exception raised while talking to Ollama.
        model: Model name the failed request was for.
        base_url: Ollama base URL quoted in the message.

    Returns:
        A specific hint for timeouts, connection failures and HTTP errors
        that carry a response; otherwise ``str(exc)``. ``str(exc)`` is also
        returned when ``requests`` cannot be imported.
    """
    try:
        import requests
    except ImportError:
        return str(exc)

    # Timeouts are checked before connection errors.
    if isinstance(exc, requests.Timeout):
        return f"Ollama request timed out for model '{model}' at {base_url}."
    if isinstance(exc, requests.ConnectionError):
        return f"Ollama not reachable at {base_url}. Start Ollama first."

    response = exc.response if isinstance(exc, requests.HTTPError) else None
    if response is None:
        return str(exc)

    status = response.status_code
    if status == 404:
        return f"Model '{model}' not found. Run: ollama pull {model}"
    return f"Ollama HTTP {status} for model '{model}': {response.reason}"
|