split-stack 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split_stack/__init__.py +106 -0
- split_stack/__main__.py +4 -0
- split_stack/advice.py +12 -0
- split_stack/benchmark.py +97 -0
- split_stack/cli.py +690 -0
- split_stack/community_picks.py +247 -0
- split_stack/compare.py +194 -0
- split_stack/complexity.py +77 -0
- split_stack/discovery.py +288 -0
- split_stack/hints.py +102 -0
- split_stack/local_models.py +63 -0
- split_stack/model_guide.py +273 -0
- split_stack/model_registry.py +314 -0
- split_stack/models.py +77 -0
- split_stack/ollama_errors.py +30 -0
- split_stack/ollama_generate.py +135 -0
- split_stack/poc_models.py +131 -0
- split_stack/presets.py +75 -0
- split_stack/quantization.py +137 -0
- split_stack/requirements.py +287 -0
- split_stack/routing.py +96 -0
- split_stack/session.py +259 -0
- split_stack/setup_wizard.py +259 -0
- split_stack/startup_tips.py +169 -0
- split_stack/tiering.py +66 -0
- split_stack/validation.py +85 -0
- split_stack-0.2.0.dist-info/METADATA +364 -0
- split_stack-0.2.0.dist-info/RECORD +32 -0
- split_stack-0.2.0.dist-info/WHEEL +5 -0
- split_stack-0.2.0.dist-info/entry_points.txt +2 -0
- split_stack-0.2.0.dist-info/licenses/LICENSE +21 -0
- split_stack-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from split_stack.discovery import discover_models
|
|
6
|
+
from split_stack.local_models import list_local_models
|
|
7
|
+
from split_stack.ollama_errors import format_ollama_error
|
|
8
|
+
from split_stack.routing import route_prompt
|
|
9
|
+
from split_stack.tiering import assign_tiers
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class RouteResult:
    """Outcome of choosing a tier and model for a prompt, without generating text.

    Produced by ``route_prompt_json``. On success ``ready`` is True and
    ``error`` keeps its default of None. On failure ``tier`` and ``model`` are
    empty strings, ``ready`` is False and ``error`` holds the message.
    """

    # Value of the selected tier; "" when routing failed.
    tier: str
    # Name of the selected model; "" when routing failed.
    model: str
    # True only when a tier and model were selected.
    ready: bool
    # Failure message; None when routing succeeded.
    error: str | None = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class AskResult:
    """Outcome of routing a prompt and then generating a reply for it.

    Produced by ``ask_prompt_json``. When routing fails, ``tier``, ``model``
    and ``text`` are all empty strings. When routing succeeds but generation
    fails, ``tier`` and ``model`` are kept and ``text`` is "". In both failure
    cases ``ready`` is False and ``error`` carries the message.
    """

    # Tier value chosen by routing; "" when routing itself failed.
    tier: str
    # Model name chosen by routing; "" when routing itself failed.
    model: str
    # Generated reply; "" on any failure.
    text: str
    # True only when both routing and generation succeeded.
    ready: bool
    # Failure message from routing or generation; None on success.
    error: str | None = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _resolve_model_names(
|
|
30
|
+
*,
|
|
31
|
+
base_url: str,
|
|
32
|
+
model_names: list[str] | None,
|
|
33
|
+
config_path: str | None = None,
|
|
34
|
+
only_vram_ok: bool = True,
|
|
35
|
+
) -> tuple[list[str], str | None]:
|
|
36
|
+
if model_names:
|
|
37
|
+
return model_names, None
|
|
38
|
+
resolved, warning = list_local_models(
|
|
39
|
+
base_url=base_url,
|
|
40
|
+
config_path=config_path,
|
|
41
|
+
only_vram_ok=only_vram_ok,
|
|
42
|
+
)
|
|
43
|
+
if resolved:
|
|
44
|
+
return [item.name for item in resolved], warning
|
|
45
|
+
return discover_models(base_url=base_url), warning
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def route_prompt_json(
    prompt: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    model_names: list[str] | None = None,
    config_path: str | None = None,
    only_vram_ok: bool = True,
    hint: str | None = None,
) -> RouteResult:
    """Pick a tier and model for ``prompt`` and report it as a ``RouteResult``.

    This function never raises: any exception from model resolution, tier
    assignment or routing is converted into a ``RouteResult`` with
    ``ready=False`` and the exception text in ``error``. The warning returned
    by model resolution is not surfaced in the result.
    """
    try:
        candidates, _ = _resolve_model_names(
            base_url=base_url,
            model_names=model_names,
            config_path=config_path,
            only_vram_ok=only_vram_ok,
        )
        if candidates:
            tier_map = assign_tiers(candidates)
            chosen_tier, chosen_model = route_prompt(prompt, tier_map, hint=hint)
            return RouteResult(tier=chosen_tier.value, model=chosen_model, ready=True)
        failure = "No models available"
    except Exception as exc:
        failure = str(exc)
    return RouteResult(tier="", model="", ready=False, error=failure)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def generate_text(
    model: str,
    prompt: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    timeout_seconds: int = 60,
) -> str:
    """Request a non-streaming completion from an Ollama server.

    POSTs ``{"model", "prompt", "stream": False}`` to ``<base_url>/api/generate``
    and returns the stripped ``"response"`` field of the JSON reply.

    Args:
        model: Model name to send in the request body.
        prompt: Prompt text to send in the request body.
        base_url: Server root; a trailing slash is tolerated.
        timeout_seconds: Passed to ``requests.post`` as ``timeout``.

    Returns:
        The generated text with surrounding whitespace removed, or "" when the
        reply carries no usable text (empty/non-object JSON body, or a missing,
        null or non-string ``"response"`` field).

    Raises:
        RuntimeError: If ``requests`` is not installed, or if the HTTP request
            fails or returns an error status (message built by
            ``format_ollama_error``).

    Note:
        Decoding the body with ``response.json()`` happens outside the wrapped
        request, so a non-JSON reply propagates the decoder's own exception.
    """
    try:
        # Imported lazily so the package works without the optional extra.
        import requests
    except ImportError as exc:
        raise RuntimeError(
            "generate_text requires optional dependency: pip install split-stack[ollama]"
        ) from exc

    try:
        response = requests.post(
            f"{base_url.rstrip('/')}/api/generate",
            json={"model": model, "prompt": prompt, "stream": False},
            timeout=timeout_seconds,
        )
        response.raise_for_status()
    except Exception as exc:
        raise RuntimeError(format_ollama_error(exc, model=model, base_url=base_url)) from exc

    payload = response.json()
    if not isinstance(payload, dict):
        # A JSON null, list or scalar body has no "response" field to read.
        return ""
    text = payload.get("response")
    # Guard against an explicit null (or other non-string) value, which would
    # otherwise fail on .strip().
    return text.strip() if isinstance(text, str) else ""
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def ask_prompt_json(
    prompt: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    model_names: list[str] | None = None,
    timeout_seconds: int = 60,
    config_path: str | None = None,
    only_vram_ok: bool = True,
    hint: str | None = None,
) -> AskResult:
    """Route ``prompt`` to a model, generate a reply, and report both steps.

    This function never raises. A routing failure yields an ``AskResult`` with
    empty tier/model/text and the routing error. A generation failure keeps
    the routed tier and model but returns empty text and the exception message.
    """
    route = route_prompt_json(
        prompt,
        base_url=base_url,
        model_names=model_names,
        config_path=config_path,
        only_vram_ok=only_vram_ok,
        hint=hint,
    )
    if not route.ready:
        return AskResult(tier="", model="", text="", ready=False, error=route.error)

    try:
        answer = generate_text(
            route.model,
            prompt,
            base_url=base_url,
            timeout_seconds=timeout_seconds,
        )
    except Exception as exc:
        # Keep the routing decision visible even though generation failed.
        return AskResult(
            tier=route.tier,
            model=route.model,
            text="",
            ready=False,
            error=str(exc),
        )
    return AskResult(tier=route.tier, model=route.model, text=answer, ready=True)
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Default model stacks for POC demos and compare benchmarks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from split_stack.community_picks import focus_stack, vram_tier_for_profile
|
|
8
|
+
from split_stack.discovery import list_model_inventory
|
|
9
|
+
from split_stack.presets import recommended_models
|
|
10
|
+
|
|
11
|
+
# Model list behind the "mixed_12gb" preset; also the fallback that
# models_for_preset returns when a dynamic preset resolves to nothing.
DEFAULT_POC_STACK = ["gemma4:e4b", "qwen3:8b", "qwen3:14b"]
# Model list behind the "qwen_only" preset.
QWEN_ONLY_STACK = ["qwen3:4b", "qwen3:8b", "qwen3:14b"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class StackPreset:
    """A named, selectable model stack entry in ``STACK_PRESETS``."""

    # Identifier compared against ``preset_id`` in models_for_preset.
    id: str
    # Display label for the preset.
    label: str
    # Model names for the preset; left empty for presets that
    # models_for_preset resolves at call time.
    models: tuple[str, ...]
    # Short explanatory text for the preset.
    description: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Catalogue of selectable presets. The ids are also listed in the
# "Unknown stack preset" error raised by models_for_preset.
STACK_PRESETS: tuple[StackPreset, ...] = (
    StackPreset(
        id="mixed_12gb",
        label="Mixed 12 GB (Gemma + Qwen)",
        models=tuple(DEFAULT_POC_STACK),
        description="Gemma lookup, Qwen 8B medium, Qwen 14B complex",
    ),
    StackPreset(
        id="qwen_only",
        label="Qwen only (4B / 8B / 14B)",
        models=tuple(QWEN_ONLY_STACK),
        description="Single-family ladder",
    ),
    StackPreset(
        id="recommended_12gb",
        label="Full 12 GB specialist",
        # Evaluated once, when this module is imported.
        models=tuple(recommended_models("workstation_12gb")),
        description="Gemma + Qwen + DeepSeek R1 for reasoning",
    ),
    StackPreset(
        id="community_agentic",
        label="Reddit agentic (M tier)",
        # Empty on purpose: models_for_preset resolves this id via focus_stack.
        models=tuple(),
        description="r/LocalLLaMA Apr 2026 — Gemma lookup + Qwen spine for agent loops",
    ),
    StackPreset(
        id="from_inventory",
        label="From your Ollama (auto ladder)",
        # Empty on purpose: models_for_preset resolves this id from the
        # inventory's suggested stack.
        models=tuple(),
        description="Picks small/mid/large tags from API + disk manifests",
    ),
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def list_stack_presets() -> tuple[StackPreset, ...]:
    """Return the module-level ``STACK_PRESETS`` tuple itself (not a copy)."""
    presets = STACK_PRESETS
    return presets
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def models_for_preset(
    preset_id: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    profile: str = "workstation_12gb",
) -> list[str]:
    """Return a fresh list of model names for the preset ``preset_id``.

    Two ids are resolved dynamically and fall back to ``DEFAULT_POC_STACK``
    when they produce nothing:

    * ``"community_agentic"`` uses ``focus_stack("agentic", ...)`` for the
      VRAM tier derived from ``profile``.
    * ``"from_inventory"`` uses the suggested stack of the model inventory
      at ``base_url``.

    Every other id is looked up in ``STACK_PRESETS``.

    Raises:
        ValueError: If ``preset_id`` matches no known preset.
    """
    if preset_id == "community_agentic":
        picked = focus_stack("agentic", vram_tier=vram_tier_for_profile(profile))
        if not picked or not picked.models:
            return list(DEFAULT_POC_STACK)
        return list(picked.models)

    if preset_id == "from_inventory":
        inventory = list_model_inventory(base_url=base_url)
        if not inventory.suggested_stack:
            return list(DEFAULT_POC_STACK)
        return list(inventory.suggested_stack)

    static = next((preset for preset in STACK_PRESETS if preset.id == preset_id), None)
    if static is None:
        valid = ", ".join(preset.id for preset in STACK_PRESETS)
        raise ValueError(f"Unknown stack preset '{preset_id}'. Valid: {valid}")
    return list(static.models)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def available_model_pool(
    *,
    base_url: str = "http://127.0.0.1:11434",
    source: str = "both",
) -> tuple[list[str], str | None]:
    """List model names from the inventory at ``base_url``.

    ``source="api"`` returns the inventory's API models and ``source="disk"``
    its disk models, each in inventory order. Any other value returns the
    sorted union of the two.

    Returns:
        ``(names, note)`` where ``note`` is the inventory's own note.
    """
    inventory = list_model_inventory(base_url=base_url)
    if source in ("api", "disk"):
        chosen = inventory.api_models if source == "api" else inventory.disk_models
        return list(chosen), inventory.note
    merged = set(inventory.api_models).union(inventory.disk_models)
    return sorted(merged), inventory.note
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def resolve_installed_stack(
    installed: list[str],
    *,
    preset_id: str = "mixed_12gb",
    base_url: str = "http://127.0.0.1:11434",
) -> tuple[list[str], str | None]:
    """Choose the models to use given a preset and the installed pool.

    Behavior, in order:

    1. If at least two of the preset's models are installed, return those
       (in preset order) with no warning.
    2. Otherwise, if anything is installed, return every distinct installed
       model sorted by ``model_weight`` together with a warning. The warning
       wording differs when only one distinct model is available.
    3. If nothing is installed, return the preset list unverified, with a
       warning saying so.

    Args:
        installed: Names of models present locally. Repeated names are
            collapsed so one model is never counted (or returned) twice.
        preset_id: Preset to resolve via ``models_for_preset``.
        base_url: Forwarded to ``models_for_preset``.

    Returns:
        ``(models, warning)``; ``warning`` is None only in case 1.

    Raises:
        ValueError: Propagated from ``models_for_preset`` for an unknown id.
    """
    desired = models_for_preset(preset_id, base_url=base_url)
    # dict.fromkeys drops repeats while keeping first-seen order, so a pool
    # that lists one model twice cannot satisfy the "2+ models" checks.
    unique_installed = list(dict.fromkeys(installed))
    installed_set = set(unique_installed)
    matched = [name for name in desired if name in installed_set]
    if len(matched) >= 2:
        return matched, None

    if not unique_installed:
        return desired, f"Using preset list (not verified): {', '.join(desired)}"

    # Imported here, as in the original, rather than at module import time.
    from split_stack.model_registry import load_registry, model_weight

    registry = load_registry()
    ranked = sorted(unique_installed, key=lambda name: model_weight(name, registry))
    if len(ranked) >= 2:
        warning = (
            f"Preset '{preset_id}' not fully available ({', '.join(desired)}). "
            f"Using: {', '.join(ranked)}"
        )
    else:
        warning = (
            f"Preset '{preset_id}' not fully available. "
            f"Using only {ranked[0]} — need 2+ models for compare spread."
        )
    return ranked, warning
|
split_stack/presets.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from split_stack.model_registry import normalize_deployment_profile
|
|
6
|
+
from split_stack.models import TierMap
|
|
7
|
+
from split_stack.tiering import assign_tiers
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class RecommendedStack:
    """A recommended model list for one deployment profile."""

    # Profile name; in RECOMMENDED_STACKS it repeats the dictionary key.
    profile: str
    # Model names in the stack.
    models: tuple[str, ...]
    # Short explanatory text for the stack.
    description: str
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Recommended stacks keyed by profile name. recommended_models looks entries
# up here after passing the caller's profile through
# normalize_deployment_profile.
RECOMMENDED_STACKS: dict[str, RecommendedStack] = {
    "workstation_8gb": RecommendedStack(
        profile="workstation_8gb",
        models=("gemma4:e4b", "qwen3:8b"),
        description="Gemma min + Qwen 8b max (flat but honest on 8 GB)",
    ),
    "workstation_12gb": RecommendedStack(
        profile="workstation_12gb",
        models=("gemma4:e4b", "qwen3:8b", "qwen3:14b", "deepseek-r1:8b"),
        description="Gemma lookup, Qwen mid/complex, DeepSeek R1 reasoning",
    ),
    "workstation_16gb": RecommendedStack(
        profile="workstation_16gb",
        models=("gemma4:e4b", "qwen3:8b", "qwen3:14b"),
        description="Gemma lookup + Qwen mid/complex; add coder/reason tags via models=",
    ),
    "workstation_24gb": RecommendedStack(
        profile="workstation_24gb",
        models=(
            "gemma4:e4b",
            "qwen3:8b",
            "qwen3:14b",
            "qwen3:30b-a3b",
            "deepseek-coder:6.7b",
        ),
        description="Full mixed ladder with MoE top and code specialist",
    ),
    "workstation_32gb": RecommendedStack(
        profile="workstation_32gb",
        models=(
            "gemma4:e4b",
            "qwen3:8b",
            "qwen3:14b",
            "qwen3:30b-a3b",
            "deepseek-coder:6.7b",
            "deepseek-r1:8b",
        ),
        description="5090 class: MoE + separate reasoning and code specialists",
    ),
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def list_recommended_stacks() -> tuple[RecommendedStack, ...]:
    """Return every recommended stack, ordered by profile key.

    Keys are sorted as plain strings, so the order is lexicographic rather
    than by memory size (``workstation_8gb`` sorts after ``workstation_32gb``).
    """
    ordered_keys = sorted(RECOMMENDED_STACKS)
    return tuple(map(RECOMMENDED_STACKS.__getitem__, ordered_keys))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def recommended_models(profile: str, *, quant: str | None = None) -> list[str]:
    """Return the recommended model names for ``profile``.

    The profile is first normalized with ``normalize_deployment_profile``.
    The stack's models are then passed through ``expand_models_for_quant``
    with the given ``quant`` mode.

    Raises:
        ValueError: If the normalized profile has no recommended stack. The
            message echoes the profile as the caller supplied it.
    """
    canonical = normalize_deployment_profile(profile)
    entry = RECOMMENDED_STACKS.get(canonical)
    if entry is not None:
        # Imported at call time, as in the original, not at module import.
        from split_stack.quantization import expand_models_for_quant

        return expand_models_for_quant(list(entry.models), canonical, quant)

    known = ", ".join(sorted(RECOMMENDED_STACKS))
    raise ValueError(f"Unknown profile '{profile}'. Valid workstation stacks: {known}")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def assign_recommended_tiers(profile: str) -> TierMap:
    """Build a tier map from the recommended models for ``profile``.

    Uses ``recommended_models`` with its default quant mode, so a ValueError
    for an unknown profile propagates from there.
    """
    stack_models = recommended_models(profile)
    return assign_tiers(stack_models)
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Quantization assumptions for VRAM sizing — not per-prompt routing.
|
|
2
|
+
|
|
3
|
+
Ollama tags (``gemma4:e4b``) do not encode quant. ``quant=`` tells split-stack which
|
|
4
|
+
pull format you use so VRAM filters and QAT-aware stack suggestions stay honest.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
# Canonical mode names accepted after alias resolution in normalize_quant_mode.
QUANT_MODES: tuple[str, ...] = ("default", "qat", "qat_mobile", "bf16")

# Runtime memory (GB) from Unsloth Gemma 4 QAT docs — UD-Q4_K_XL, not naive Q4_0.
# Keys are matched as lowercase substrings of the model name by
# _lookup_table_vram (longest match wins), so the bare ":e2b" / ":e4b" keys
# also match names that do not start with "gemma4".
_GEMMA4_QAT_RUNTIME_GB: dict[str, int] = {
    "gemma4:e2b": 3,
    ":e2b": 3,
    "gemma4:e4b": 5,
    ":e4b": 5,
    "gemma4:12b": 7,
    "gemma4:26b-a4b": 15,
    "gemma4:26b": 15,
    "gemma4:31b": 18,
}

# Google mobile mixture QAT (UD-Q2_K_XL class).
# Only the E2B/E4B sizes are listed; other names fall through to the base
# VRAM value in adjust_vram_for_quant.
_GEMMA4_QAT_MOBILE_RUNTIME_GB: dict[str, int] = {
    "gemma4:e2b": 3,
    ":e2b": 3,
    "gemma4:e4b": 4,
    ":e4b": 4,
}

# BF16 original sizes (Unsloth table, rounded up for filter headroom).
_GEMMA4_BF16_RUNTIME_GB: dict[str, int] = {
    "gemma4:e2b": 10,
    ":e2b": 10,
    "gemma4:e4b": 16,
    ":e4b": 16,
    "gemma4:12b": 24,
    "gemma4:26b-a4b": 51,
    "gemma4:26b": 51,
    "gemma4:31b": 62,
}

# Extra models that fit when Gemma pulls use QAT int4 (Unsloth hardware table).
# Keyed by lowercase profile name; consumed by expand_models_for_quant.
QAT_STACK_ADDITIONS: dict[str, tuple[str, ...]] = {
    "workstation_16gb": ("gemma4:26b-a4b",),
    "workstation_24gb": ("gemma4:31b",),
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def normalize_quant_mode(name: str | None) -> str:
    """Map a user-supplied quant mode onto one of ``QUANT_MODES``.

    The input is stripped, lowercased and has ``-`` replaced by ``_`` before
    alias resolution (for example ``"Q4-QAT"`` becomes ``"qat"``).

    Args:
        name: Raw mode string. None, "" and whitespace-only values all mean
            "not set".

    Returns:
        The canonical mode name; ``"default"`` when nothing was set.

    Raises:
        ValueError: If the cleaned value is neither a canonical mode nor a
            known alias.
    """
    if not name:
        return "default"
    lowered = name.strip().lower().replace("-", "_")
    if not lowered:
        # Whitespace-only input is treated like an unset value instead of
        # being rejected as an unknown mode.
        return "default"
    aliases = {
        "off": "default",
        "none": "default",
        "q4": "qat",
        "q4_qat": "qat",
        "qat_q4": "qat",
        "mobile": "qat_mobile",
        "qat_mobile_mixture": "qat_mobile",
        "fp16": "bf16",
        "full": "bf16",
    }
    lowered = aliases.get(lowered, lowered)
    if lowered not in QUANT_MODES:
        valid = ", ".join(QUANT_MODES)
        raise ValueError(f"Unknown quant mode '{name}'. Valid modes: {valid}")
    return lowered
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def quant_from_env() -> str | None:
    """Read the ``SPLIT_STACK_QUANT`` environment variable.

    Returns:
        The stripped value, or None when the variable is unset or blank. The
        value is not validated here.
    """
    value = os.environ.get("SPLIT_STACK_QUANT", "")
    value = value.strip()
    return value if value else None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _lookup_table_vram(name: str, table: dict[str, int]) -> int | None:
|
|
80
|
+
lowered = name.lower()
|
|
81
|
+
best_gb: int | None = None
|
|
82
|
+
best_len = -1
|
|
83
|
+
for key, gb in table.items():
|
|
84
|
+
if key in lowered and len(key) > best_len:
|
|
85
|
+
best_gb = gb
|
|
86
|
+
best_len = len(key)
|
|
87
|
+
return best_gb
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def adjust_vram_for_quant(
    name: str,
    base_vram_gb: int | None,
    quant_mode: str | None,
) -> int | None:
    """Return the VRAM figure to use for ``name`` under ``quant_mode``.

    In ``"default"`` mode, or when the mode's table has no entry matching
    ``name``, ``base_vram_gb`` is returned unchanged. Otherwise the table
    value replaces it.

    Raises:
        ValueError: Propagated from ``normalize_quant_mode`` for an unknown
            mode.
    """
    mode = normalize_quant_mode(quant_mode)
    if mode != "default":
        table_for_mode = {
            "qat": _GEMMA4_QAT_RUNTIME_GB,
            "qat_mobile": _GEMMA4_QAT_MOBILE_RUNTIME_GB,
            "bf16": _GEMMA4_BF16_RUNTIME_GB,
        }[mode]
        override = _lookup_table_vram(name, table_for_mode)
        if override is not None:
            return override
    return base_vram_gb
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def expand_models_for_quant(models: list[str], profile: str, quant_mode: str | None) -> list[str]:
    """Return ``models`` plus any extras enabled by the quant mode.

    Extras come from ``QAT_STACK_ADDITIONS`` and are appended only in
    ``"qat"`` mode, only for a profile listed there, and only when not
    already present.

    Args:
        models: Base model names; never mutated.
        profile: Profile name; stripped and lowercased before lookup.
        quant_mode: Raw quant mode, normalized via ``normalize_quant_mode``.

    Returns:
        A new list in every case, so callers may modify the result without
        affecting ``models``.

    Raises:
        ValueError: Propagated from ``normalize_quant_mode`` for an unknown
            mode.
    """
    # Copy up front so both branches hand back an independent list.
    out = list(models)
    if normalize_quant_mode(quant_mode) != "qat":
        return out
    profile_name = profile.strip().lower()
    for name in QAT_STACK_ADDITIONS.get(profile_name, ()):
        if name not in out:
            out.append(name)
    return out
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def pull_guidance_lines(quant_mode: str | None) -> list[str]:
    """Return short pull hints for the given quant mode.

    ``"default"`` yields an empty list; every other mode yields one or two
    hint lines.

    Raises:
        ValueError: Propagated from ``normalize_quant_mode`` for an unknown
            mode.
    """
    mode = normalize_quant_mode(quant_mode)
    if mode == "default":
        return []
    guidance = {
        "qat": [
            "Gemma 4 QAT: prefer Unsloth UD-Q4_K_XL GGUF over naive Google Q4_0 for llama.cpp/Ollama imports.",
            "Collections: google/gemma-4-qat-q4_0, unsloth/gemma-4-qat — see docs/LOCAL_MODELS.md",
        ],
        "qat_mobile": [
            "Gemma 4 mobile mixture QAT: google/gemma-4-qat-mobile (UD-Q2_K_XL on E2B/E4B).",
        ],
    }
    # Any remaining mode gets the BF16 hint.
    fallback = ["Gemma 4 BF16 pulls need datacenter profile or custom vram_gb in config."]
    return guidance.get(mode, fallback)
|