split-stack 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split_stack/__init__.py +106 -0
- split_stack/__main__.py +4 -0
- split_stack/advice.py +12 -0
- split_stack/benchmark.py +97 -0
- split_stack/cli.py +690 -0
- split_stack/community_picks.py +247 -0
- split_stack/compare.py +194 -0
- split_stack/complexity.py +77 -0
- split_stack/discovery.py +288 -0
- split_stack/hints.py +102 -0
- split_stack/local_models.py +63 -0
- split_stack/model_guide.py +273 -0
- split_stack/model_registry.py +314 -0
- split_stack/models.py +77 -0
- split_stack/ollama_errors.py +30 -0
- split_stack/ollama_generate.py +135 -0
- split_stack/poc_models.py +131 -0
- split_stack/presets.py +75 -0
- split_stack/quantization.py +137 -0
- split_stack/requirements.py +287 -0
- split_stack/routing.py +96 -0
- split_stack/session.py +259 -0
- split_stack/setup_wizard.py +259 -0
- split_stack/startup_tips.py +169 -0
- split_stack/tiering.py +66 -0
- split_stack/validation.py +85 -0
- split_stack-0.2.0.dist-info/METADATA +364 -0
- split_stack-0.2.0.dist-info/RECORD +32 -0
- split_stack-0.2.0.dist-info/WHEEL +5 -0
- split_stack-0.2.0.dist-info/entry_points.txt +2 -0
- split_stack-0.2.0.dist-info/licenses/LICENSE +21 -0
- split_stack-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Callable
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class UsageProfile(str, Enum):
    """Usage profiles that split-stack can list prerequisites for.

    Members mix in ``str``, so each one compares equal to its plain string
    value (for example ``UsageProfile.CORE == "core"``).
    """

    # Library-only use with a caller-supplied model list.
    CORE = "core"
    # Reading model tags from a local Ollama instance.
    OLLAMA_DISCOVERY = "ollama_discovery"
    # The local work assistant example application.
    LOCAL_ASSISTANT = "local_assistant"
    # The ``stack doctor`` command-line check.
    CLI_DOCTOR = "cli_doctor"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class Prerequisite:
    """A single requirement of a usage profile, with an optional probe result.

    Instances are immutable. ``satisfied`` stays ``None`` until a probe has
    actually been run for this prerequisite.
    """

    # Short identifier; also the key used to look up a probe in ``_CHECKERS``.
    id: str
    # Human-readable explanation of what is needed.
    description: str
    # Free-form category label such as "runtime", "package" or "service".
    kind: str
    # Whether the profile counts this item towards readiness.
    required: bool
    # Command or URL that installs the prerequisite, when one exists.
    install_command: str | None = None
    # Command or tip for checking the prerequisite by hand.
    verify_hint: str | None = None
    # Probe outcome: True/False after a check, None when not checked.
    satisfied: bool | None = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class ProfileRequirements:
    """The prerequisites of one usage profile plus its display metadata."""

    # Profile these requirements belong to.
    profile: UsageProfile
    # Short display name of the profile.
    title: str
    # One-line description of what the profile is for.
    summary: str
    # Requirements in catalog order.
    prerequisites: tuple[Prerequisite, ...]

    @property
    def ready(self) -> bool:
        """Return False only when a required prerequisite was probed and failed.

        Items that were never probed (``satisfied is None``) do not block
        readiness, and optional items are ignored entirely.
        """
        blocked = any(
            item.required and item.satisfied is False
            for item in self.prerequisites
        )
        return not blocked
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _python_ok() -> bool:
|
|
42
|
+
return sys.version_info >= (3, 10)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _requests_ok() -> bool:
|
|
46
|
+
try:
|
|
47
|
+
import requests # noqa: F401
|
|
48
|
+
except ImportError:
|
|
49
|
+
return False
|
|
50
|
+
return True
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _ollama_ok(base_url: str = "http://127.0.0.1:11434") -> bool:
    """Probe an Ollama server and report whether it lists at least one model.

    Sends ``GET <base_url>/api/tags`` with a 2 second timeout. Returns True
    only when the request succeeds and the JSON payload has a non-empty
    ``"models"`` entry. A missing ``requests`` package, any network or HTTP
    error, and an unexpected payload all yield False.
    """
    if not _requests_ok():
        return False
    try:
        import requests

        tags_url = f"{base_url.rstrip('/')}/api/tags"
        reply = requests.get(tags_url, timeout=2)
        reply.raise_for_status()
        listing = reply.json() or {}
        # Kept inside the try so a non-dict payload also counts as "not ok".
        return bool(listing.get("models"))
    except Exception:
        # Best-effort probe: any failure means "not available".
        return False
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _catalog() -> dict[UsageProfile, tuple[Prerequisite, ...]]:
    """Build the static prerequisite catalog, keyed by usage profile.

    Every entry is created with ``satisfied`` left at its default (``None``).
    A fresh dictionary is built on each call.
    """
    # Strings shared by several profiles.
    pip_split_stack = "python -m pip install split-stack"
    pip_ollama_extra = "python -m pip install split-stack[ollama]"
    import_split_stack = 'python -c "import split_stack"'
    import_requests = 'python -c "import requests"'
    ollama_download = "https://ollama.com/download"
    ollama_on_localhost = "Ollama installed and running on localhost:11434"

    # The interpreter requirement is identical in every profile.
    python_runtime = Prerequisite(
        id="python",
        description="Python 3.10 or newer",
        kind="runtime",
        required=True,
        install_command="https://www.python.org/downloads/",
        verify_hint="python --version",
    )
    # Plain package install, shared by the core and discovery profiles.
    installed_package = Prerequisite(
        id="split_stack",
        description="split-stack package installed",
        kind="package",
        required=True,
        install_command=pip_split_stack,
        verify_hint=import_split_stack,
    )

    def requests_package(description: str, *, required: bool = True) -> Prerequisite:
        # The ``requests`` entry differs per profile only in wording and optionality.
        return Prerequisite(
            id="requests",
            description=description,
            kind="package",
            required=required,
            install_command=pip_ollama_extra,
            verify_hint=import_requests,
        )

    def ollama_service(
        description: str, verify_hint: str, *, required: bool = True
    ) -> Prerequisite:
        # The Ollama service entry always points at the same download page.
        return Prerequisite(
            id="ollama",
            description=description,
            kind="service",
            required=required,
            install_command=ollama_download,
            verify_hint=verify_hint,
        )

    core = (
        python_runtime,
        installed_package,
        Prerequisite(
            id="model_names",
            description="A list of model names for assign_tiers() (from config or your provider)",
            kind="input",
            required=True,
            verify_hint='assign_tiers(["qwen3:4b", "qwen3:8b"])',
        ),
    )

    ollama_discovery = (
        python_runtime,
        installed_package,
        requests_package("requests library (optional extra)"),
        ollama_service(ollama_on_localhost, "ollama list"),
        Prerequisite(
            id="ollama_models",
            description="At least one model pulled in Ollama (e.g. ollama pull qwen3:8b)",
            kind="input",
            required=True,
            verify_hint="ollama list",
        ),
    )

    local_assistant = (
        python_runtime,
        Prerequisite(
            id="split_stack",
            description="split-stack installed in editable mode from repo checkout",
            kind="package",
            required=True,
            install_command="python -m pip install -e .",
            verify_hint=import_split_stack,
        ),
        requests_package("requests library for Ollama HTTP calls"),
        ollama_service(ollama_on_localhost, "ollama serve"),
        Prerequisite(
            id="ollama_models",
            description="Two or more models recommended so tiers differ (small + large)",
            kind="input",
            required=True,
            verify_hint="ollama pull qwen3:4b && ollama pull qwen3:14b",
        ),
        Prerequisite(
            id="gpu_ram",
            description="Enough VRAM/RAM to run your largest pulled model",
            kind="hardware",
            required=True,
            verify_hint="Start with smaller models if inference is slow or fails",
        ),
    )

    cli_doctor = (
        python_runtime,
        Prerequisite(
            id="split_stack",
            description="split-stack package installed (includes stack CLI)",
            kind="package",
            required=True,
            install_command=pip_split_stack,
            verify_hint="stack doctor",
        ),
        requests_package("requests library for Ollama tier detection", required=False),
        ollama_service(
            "Ollama running locally (optional; doctor skips if unavailable)",
            "ollama list",
            required=False,
        ),
    )

    return {
        UsageProfile.CORE: core,
        UsageProfile.OLLAMA_DISCOVERY: ollama_discovery,
        UsageProfile.LOCAL_ASSISTANT: local_assistant,
        UsageProfile.CLI_DOCTOR: cli_doctor,
    }
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# Display metadata per usage profile: profile -> (title, one-line summary).
_PROFILE_META: dict[UsageProfile, tuple[str, str]] = {
    profile: (title, summary)
    for profile, title, summary in (
        (
            UsageProfile.CORE,
            "Core library",
            "Use score_prompt(), assign_tiers(), and route_prompt() with your own model list.",
        ),
        (
            UsageProfile.OLLAMA_DISCOVERY,
            "Ollama discovery",
            "Call discover_models() to read model tags from a local Ollama instance.",
        ),
        (
            UsageProfile.LOCAL_ASSISTANT,
            "Local work assistant example",
            "Run examples/local_work_assistant/app.py for auto-tiered local Q&A.",
        ),
        (
            UsageProfile.CLI_DOCTOR,
            "CLI doctor",
            "Run stack doctor for split-stack guidance and optional Ollama tier output.",
        ),
    )
}
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def list_usage_profiles() -> list[UsageProfile]:
    """Return every ``UsageProfile`` member, in definition order."""
    return [*UsageProfile]
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _ollama_reachable(base_url: str = "http://127.0.0.1:11434") -> bool:
    """Report whether an Ollama server answers ``GET <base_url>/api/tags``.

    Unlike ``_ollama_ok`` this does not require any model to be pulled, so a
    freshly installed, running server counts as reachable. A missing
    ``requests`` package and any network or HTTP error yield False.
    """
    try:
        import requests

        response = requests.get(f"{base_url.rstrip('/')}/api/tags", timeout=2)
        response.raise_for_status()
    except Exception:
        # Best-effort probe: any failure means "not reachable".
        return False
    return True


# Probe functions keyed by ``Prerequisite.id``. Ids without an entry here
# (for example "model_names" or "gpu_ram") are never probed automatically.
_CHECKERS: dict[str, Callable[[], bool]] = {
    "python": _python_ok,
    # Importing this module already proves the package is installed.
    "split_stack": lambda: True,
    "requests": _requests_ok,
    # "Service is running" must not depend on whether models were pulled;
    # that is what the separate "ollama_models" entry reports.
    "ollama": _ollama_reachable,
    "ollama_models": _ollama_ok,
}
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def usage_requirements(
    profile: UsageProfile = UsageProfile.CORE,
    *,
    check: bool = False,
) -> ProfileRequirements:
    """Return prerequisites for a usage profile.

    Set check=True to probe the local machine (Python version, requests, Ollama).

    Args:
        profile: Usage profile whose prerequisites are wanted.
        check: When True, run the probe registered in ``_CHECKERS`` for each
            prerequisite id that has one and record the outcome in
            ``satisfied``. Prerequisites without a probe, and every
            prerequisite when ``check`` is False, keep ``satisfied=None``.

    Returns:
        A ``ProfileRequirements`` with the profile's title, summary and
        prerequisites in catalog order.

    Raises:
        KeyError: If ``profile`` is not a known usage profile.
    """
    from dataclasses import replace

    title, summary = _PROFILE_META[profile]
    catalog = _catalog()[profile]

    # Several prerequisite ids can share one probe function. Remember each
    # probe's result for the duration of this call so a slow network probe
    # runs once and all entries that share it report a consistent answer.
    probe_results: dict[Callable[[], bool], bool] = {}

    def probe(prerequisite_id: str) -> bool | None:
        if not check or prerequisite_id not in _CHECKERS:
            return None
        checker = _CHECKERS[prerequisite_id]
        if checker not in probe_results:
            probe_results[checker] = checker()
        return probe_results[checker]

    # ``replace`` copies every field, so fields added to Prerequisite later
    # are carried over without touching this function.
    prerequisites = tuple(
        replace(item, satisfied=probe(item.id)) for item in catalog
    )

    return ProfileRequirements(
        profile=profile,
        title=title,
        summary=summary,
        prerequisites=prerequisites,
    )
|
split_stack/routing.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from split_stack.complexity import (
|
|
4
|
+
looks_like_code,
|
|
5
|
+
resolve_tier,
|
|
6
|
+
score_prompt,
|
|
7
|
+
)
|
|
8
|
+
from split_stack.hints import normalize_step_kind, prefer_code_model
|
|
9
|
+
from split_stack.models import ComplexityTier, RouteDecision, StepKind, TierMap
|
|
10
|
+
from split_stack.tiering import describe_tiers
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def route_prompt(
    prompt: str,
    tiers: TierMap,
    *,
    hint: str | StepKind | None = None,
) -> tuple[ComplexityTier, str]:
    """Return complexity tier and selected model for a prompt.

    Thin wrapper over ``explain_route`` that returns the decision's
    ``as_tuple()`` form and drops the rest of the trace.
    """
    decision = explain_route(prompt, tiers, hint=hint)
    return decision.as_tuple()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def explain_route(
    prompt: str,
    tiers: TierMap,
    *,
    hint: str | StepKind | None = None,
) -> RouteDecision:
    """Return tier, model, and a trace of why routing chose them.

    With a hint, the tier comes from ``resolve_tier`` using the normalized
    step kind; without one, from ``score_prompt``. The model is the code slot
    when ``_should_use_code_model`` says so and a code model is configured,
    otherwise the model assigned to the tier.
    """
    trace: list[str] = []
    step_kind: StepKind | None

    # Step 1: pick the tier.
    if hint is None:
        step_kind = None
        tier_source = "heuristic"
        tier = score_prompt(prompt)
        trace.append(f"no hint — keyword/heuristic scoring → tier {tier.value}")
        word_count = len((prompt or "").split())
        if word_count > 80:
            trace.append("prompt length > 80 tokens influenced complex tier")
    else:
        step_kind = normalize_step_kind(hint)
        tier = resolve_tier(prompt, hint=step_kind)
        tier_source = "hint"
        trace.append(f"hint={step_kind.value} maps to tier {tier.value}")

    # Step 2: pick the model slot.
    wants_code_model = _should_use_code_model(prompt, tier, hint, step_kind, tiers)
    if wants_code_model and tiers.code:
        model = tiers.code
        model_source = "code_slot"
        hinted_code = prefer_code_model(hint) or step_kind == StepKind.CODE
        if hinted_code:
            trace.append(
                f"code specialist {model} (hint={step_kind.value if step_kind else hint})"
            )
        else:
            trace.append(f"code specialist {model} (prompt looks like code)")
    else:
        # Both remaining cases use the tier's own model; only the trace differs.
        model = tiers.for_tier(tier)
        model_source = "tier_slot"
        if wants_code_model:
            trace.append(
                f"code-like prompt but no code slot — using {tier.value} model {model}"
            )
        elif tier == ComplexityTier.REASONING and tiers.reasoning == tiers.complex:
            trace.append(
                f"reasoning tier → {model} (no separate reasoning model; complex fallback)"
            )
        else:
            trace.append(f"{tier.value} slot → {model}")

    return RouteDecision(
        tier=tier,
        model=model,
        # Report an enum hint by its value; pass strings and None through.
        hint=hint.value if isinstance(hint, StepKind) else hint,
        step_kind=step_kind.value if step_kind else None,
        tier_source=tier_source,
        model_source=model_source,
        reasons=tuple(trace),
        tiers=describe_tiers(tiers),
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _should_use_code_model(
    prompt: str,
    tier: ComplexityTier,
    hint: str | StepKind | None,
    step_kind: StepKind | None,
    tiers: TierMap,
) -> bool:
    """Decide whether routing should prefer the code-specialist model.

    A code-oriented hint or step kind always says yes, even when no code
    model is configured. Otherwise the answer is yes only when a code model
    exists, the tier is complex or medium, and the prompt looks like code.
    """
    explicitly_code = prefer_code_model(hint) or step_kind == StepKind.CODE
    if explicitly_code:
        return True

    code_model_usable = tiers.code is not None and tier in (
        ComplexityTier.COMPLEX,
        ComplexityTier.MEDIUM,
    )
    if not code_model_usable:
        return False
    return looks_like_code(prompt)
|
split_stack/session.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""Minimal session: set VRAM once, route every call."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
from split_stack.model_registry import normalize_deployment_profile
|
|
9
|
+
from split_stack.models import ComplexityTier, RouteDecision, TierMap
|
|
10
|
+
from split_stack.presets import recommended_models
|
|
11
|
+
from split_stack.routing import explain_route, route_prompt
|
|
12
|
+
from split_stack.tiering import assign_tiers, describe_tiers
|
|
13
|
+
from split_stack.validation import validate_tier_map
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class Session:
    """Immutable snapshot of the routing configuration built by ``configure()``."""

    # Deployment profile name the session was configured with.
    profile: str
    # VRAM in GB when it was supplied or found in the environment, else None.
    vram_gb: int | None
    # Normalized quantization mode.
    quant: str
    # Model names the session works with.
    models: tuple[str, ...]
    # Tier ladder used for routing.
    tiers: TierMap
    # Validation warnings collected while configuring.
    warnings: tuple[str, ...] = ()
    # Optional note explaining how the model list was chosen.
    note: str | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Active module-level session: None until configure() stores one, and None
# again after reset_session_for_tests().
_session: Session | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def profile_for_vram_gb(vram_gb: int) -> str:
    """Map discrete GPU VRAM to a workstation deployment profile.

    Picks the smallest workstation bucket whose ceiling is at least
    ``vram_gb``; anything above 32 GB maps to ``"datacenter"``.
    """
    # (inclusive ceiling in GB, profile name), in ascending order.
    buckets = (
        (8, "workstation_8gb"),
        (12, "workstation_12gb"),
        (16, "workstation_16gb"),
        (24, "workstation_24gb"),
        (32, "workstation_32gb"),
    )
    for ceiling_gb, profile_name in buckets:
        if vram_gb <= ceiling_gb:
            return profile_name
    return "datacenter"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _vram_from_env() -> int | None:
|
|
46
|
+
raw = os.environ.get("SPLIT_STACK_VRAM_GB", "").strip()
|
|
47
|
+
if not raw:
|
|
48
|
+
return None
|
|
49
|
+
try:
|
|
50
|
+
value = int(raw)
|
|
51
|
+
except ValueError:
|
|
52
|
+
return None
|
|
53
|
+
return value if value > 0 else None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _profile_from_env() -> str | None:
|
|
57
|
+
raw = os.environ.get("SPLIT_STACK_PROFILE", "").strip()
|
|
58
|
+
return raw or None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def default_profile_from_env(*, fallback: str = "workstation_12gb") -> str:
    """Profile from SPLIT_STACK_PROFILE or SPLIT_STACK_VRAM_GB, else fallback.

    An explicit profile variable wins and is normalized; otherwise a valid
    VRAM variable is mapped to a profile; otherwise ``fallback`` is returned
    unchanged.
    """
    env_profile = _profile_from_env()
    if env_profile:
        return normalize_deployment_profile(env_profile)

    env_vram = _vram_from_env()
    return fallback if env_vram is None else profile_for_vram_gb(env_vram)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _quant_from_env() -> str | None:
    """Return the result of ``split_stack.quantization.quant_from_env()``."""
    # Imported on first use rather than at module load.
    from split_stack.quantization import quant_from_env as read_quant_setting

    return read_quant_setting()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _resolve_models(
    profile: str,
    models: list[str] | None,
    *,
    quant: str | None = None,
) -> tuple[list[str], str | None]:
    """Choose the model list for a session and explain the choice.

    Resolution order:

    1. A non-empty explicit ``models`` list is returned as-is, with no note.
    2. Otherwise the recommended stack for ``profile``/``quant`` is compared
       with the models found on disk:
       - two or more recommended models installed: use those;
       - else two or more models of any kind installed: use the three
         lowest-ranked by ``model_weight``;
       - else exactly one model installed: use it.
    3. If nothing is found on disk (or discovery fails), return the
       recommended stack unverified.

    Returns:
        ``(model_names, note)`` where ``note`` is a human-readable
        explanation or None.
    """
    if models:
        return models, None

    from split_stack.quantization import normalize_quant_mode

    desired = recommended_models(profile, quant=quant)

    # The QAT note used to be assigned and then dropped or overwritten on
    # every return path. It describes the full recommended stack, so it is
    # surfaced only when that whole stack is what gets returned.
    qat_note: str | None = None
    mode = normalize_quant_mode(quant)
    if mode == "qat" and len(desired) > len(recommended_models(profile)):
        qat_note = "QAT stack: added Gemma 4 models that fit at int4 runtime sizes."

    try:
        from split_stack.discovery import discover_models_from_disk

        disk = discover_models_from_disk()
    except Exception:
        # Discovery is best-effort; fall back to the unverified stack.
        disk = []

    if disk:
        matched = [name for name in desired if name in disk]
        if len(matched) >= 2:
            fully_installed = len(matched) == len(desired)
            return matched, (qat_note if fully_installed else None)
        if len(disk) >= 2:
            from split_stack.model_registry import load_registry, model_weight

            registry = load_registry(profile=profile)
            ranked = sorted(disk, key=lambda name: model_weight(name, registry))
            note = (
                f"Recommended stack not fully installed ({', '.join(desired)}). "
                f"Using: {', '.join(ranked[:3])}"
            )
            return ranked[:3], note
        note = f"Using installed models only: {', '.join(disk)}"
        return disk, note

    unverified = "Using recommended stack (not verified against disk)."
    return desired, (f"{qat_note} {unverified}" if qat_note else unverified)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def configure(
    *,
    vram_gb: int | None = None,
    profile: str | None = None,
    models: list[str] | None = None,
    tiers: TierMap | None = None,
    quant: str | None = None,
) -> Session:
    """Set the default profile and tier map for ``route()`` and ``explain()``.

    Progressive control:
    - ``configure(vram_gb=16)`` — preset profile and recommended models
    - ``configure(..., models=[...])`` — explicit model list, auto tier ladder
    - ``configure(..., models=[...], tiers=...)`` — explicit ladder (power users)

    Explicit arguments take precedence over the environment.
    ``SPLIT_STACK_PROFILE`` is consulted only when neither ``profile`` nor
    ``vram_gb`` is passed; ``SPLIT_STACK_VRAM_GB`` fills in a missing
    ``vram_gb``.

    Args:
        vram_gb: GPU memory in GB, used to pick a profile when none is given.
        profile: Deployment profile name; normalized before use.
        models: Explicit model names. When omitted they are resolved from the
            profile's recommended stack and the models found on disk.
        tiers: Explicit tier ladder. When ``models`` is also given, every
            model the ladder names must appear in ``models``.
        quant: Quantization mode; defaults to the environment setting.

    Returns:
        The new session, which is also stored as the module-level default.

    Raises:
        ValueError: If no profile can be determined, no models are
            available, or ``tiers`` names a model missing from ``models``.
    """
    global _session
    from split_stack.quantization import normalize_quant_mode

    if quant is None:
        quant = _quant_from_env()
    quant_mode = normalize_quant_mode(quant)

    # An explicit vram_gb must not be overridden by SPLIT_STACK_PROFILE;
    # otherwise the session would pair the caller's vram_gb with an
    # unrelated profile taken from the environment.
    if profile is None and vram_gb is None:
        profile = _profile_from_env()

    if profile is None:
        if vram_gb is None:
            vram_gb = _vram_from_env()
        if vram_gb is None:
            raise ValueError(
                "Pass vram_gb=16 (or profile='workstation_16gb'), "
                "or set SPLIT_STACK_VRAM_GB / SPLIT_STACK_PROFILE."
            )
        profile = profile_for_vram_gb(vram_gb)
    else:
        profile = normalize_deployment_profile(profile)
        if vram_gb is None:
            # Informational only here: the profile is already decided.
            vram_gb = _vram_from_env()

    resolved_models, note = _resolve_models(profile, models, quant=quant_mode)
    if not resolved_models:
        raise ValueError(f"No models for profile {profile}")

    if tiers is not None:
        tier_map = tiers
        # Cross-check the ladder only against a caller-supplied model list.
        if models is not None:
            ladder = (
                tier_map.simple,
                tier_map.medium,
                tier_map.complex,
                tier_map.reasoning,
                tier_map.code,
            )
            unknown = [
                name for name in ladder if name and name not in resolved_models
            ]
            if unknown:
                raise ValueError(
                    f"tiers= references models not in models=: {', '.join(sorted(set(unknown)))}"
                )
    else:
        tier_map = assign_tiers(resolved_models)

    warnings = tuple(validate_tier_map(tier_map, resolved_models, profile=profile))
    _session = Session(
        profile=profile,
        vram_gb=vram_gb,
        quant=quant_mode,
        models=tuple(resolved_models),
        tiers=tier_map,
        warnings=warnings,
        note=note,
    )
    return _session
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def get_session() -> Session | None:
    """Return the active session, or None when ``configure()`` has not stored one."""
    active = _session
    return active
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def session_warnings() -> tuple[str, ...]:
    """Warnings from the last ``configure()`` (empty if none)."""
    active = _session
    if not active:
        return ()
    return active.warnings
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _ensure_session() -> Session:
    """Return the active session, auto-configuring from the environment if needed.

    With no session stored, ``configure()`` is called without arguments when
    ``SPLIT_STACK_VRAM_GB`` or ``SPLIT_STACK_PROFILE`` gives a usable value.

    Raises:
        RuntimeError: If no session exists and neither variable is usable.
    """
    active = _session
    if active is not None:
        return active

    if _vram_from_env() is None and _profile_from_env() is None:
        raise RuntimeError(
            "split_stack.configure(vram_gb=16) first, or set SPLIT_STACK_VRAM_GB."
        )

    configure()
    # Re-read the module global that configure() has just assigned.
    active = _session
    assert active is not None
    return active
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def route(
    prompt: str,
    *,
    hint: str | None = None,
) -> tuple[ComplexityTier, str]:
    """Route one prompt using the configured session. Call ``configure()`` first.

    Returns the ``(tier, model)`` pair from ``route_prompt`` for the active
    session's tier map.
    """
    active_tiers = _ensure_session().tiers
    return route_prompt(prompt, active_tiers, hint=hint)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def explain(
    prompt: str,
    *,
    hint: str | None = None,
) -> RouteDecision:
    """Route with a full decision trace (logging, CLI, tests).

    Same routing as ``route()``, but returns the whole ``RouteDecision``.
    """
    active_tiers = _ensure_session().tiers
    return explain_route(prompt, active_tiers, hint=hint)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def describe_session() -> dict[str, object]:
    """Snapshot of the active session for logs and ``stack explain`` without a prompt.

    Returns ``{"configured": False}`` when no session exists. Otherwise the
    session's fields are returned with tuples converted to lists and the
    tier map rendered by ``describe_tiers``.
    """
    active = get_session()
    if active is None:
        return {"configured": False}

    snapshot: dict[str, object] = {"configured": True}
    snapshot["profile"] = active.profile
    snapshot["vram_gb"] = active.vram_gb
    snapshot["quant"] = active.quant
    snapshot["models"] = list(active.models)
    snapshot["tiers"] = describe_tiers(active.tiers)
    snapshot["warnings"] = list(active.warnings)
    snapshot["note"] = active.note
    return snapshot
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def reset_session_for_tests() -> None:
    """Clear the module-level session so the next use starts unconfigured."""
    global _session
    _session = None
|