split-stack 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from split_stack.discovery import discover_models
6
+ from split_stack.local_models import list_local_models
7
+ from split_stack.ollama_errors import format_ollama_error
8
+ from split_stack.routing import route_prompt
9
+ from split_stack.tiering import assign_tiers
10
+
11
+
12
@dataclass(frozen=True)
class RouteResult:
    """Immutable outcome of routing a prompt to a tier and model.

    ``route_prompt_json`` fills ``tier`` and ``model`` on success and leaves
    them empty, with ``ready=False`` and ``error`` set, on failure.
    """

    # Tier value chosen for the prompt; "" when routing failed.
    tier: str
    # Model name chosen for the prompt; "" when routing failed.
    model: str
    # True only when a tier/model pair was selected.
    ready: bool
    # Failure description; None when routing succeeded.
    error: str | None = None
18
+
19
+
20
@dataclass(frozen=True)
class AskResult:
    """Immutable outcome of routing a prompt and generating a completion.

    ``ask_prompt_json`` returns this with ``ready=True`` and the generated
    ``text`` on success, or ``ready=False`` with ``error`` set otherwise.
    """

    # Tier value the prompt was routed to; "" when routing itself failed.
    tier: str
    # Model name the prompt was routed to; "" when routing itself failed.
    model: str
    # Generated text; "" on any failure.
    text: str
    # True only when text was generated successfully.
    ready: bool
    # Failure description; None on success.
    error: str | None = None
27
+
28
+
29
def _resolve_model_names(
    *,
    base_url: str,
    model_names: list[str] | None,
    config_path: str | None = None,
    only_vram_ok: bool = True,
) -> tuple[list[str], str | None]:
    """Work out which model names routing should consider.

    Resolution order:
      1. A non-empty ``model_names`` from the caller is returned as-is.
      2. Otherwise the names of the entries returned by ``list_local_models``.
      3. Otherwise whatever ``discover_models`` returns for ``base_url``.

    Returns:
        ``(names, warning)``. ``warning`` is ``None`` for explicit names and
        otherwise the warning reported by ``list_local_models``.
    """
    # Explicit names win and skip every lookup.
    if model_names:
        return model_names, None

    local_entries, warning = list_local_models(
        base_url=base_url,
        config_path=config_path,
        only_vram_ok=only_vram_ok,
    )
    if not local_entries:
        # Local listing came back empty: fall back to discovery, but still
        # surface the warning from the listing step.
        return discover_models(base_url=base_url), warning

    names = [entry.name for entry in local_entries]
    return names, warning
46
+
47
+
48
def route_prompt_json(
    prompt: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    model_names: list[str] | None = None,
    config_path: str | None = None,
    only_vram_ok: bool = True,
    hint: str | None = None,
) -> RouteResult:
    """Route ``prompt`` to a tier/model pair without raising.

    Model names are resolved via ``_resolve_model_names``, tiered with
    ``assign_tiers`` and routed with ``route_prompt``. Any exception raised
    along the way is converted into a ``RouteResult`` with ``ready=False``
    and the exception text in ``error``.
    """
    try:
        # The resolver's warning is intentionally not surfaced here.
        candidates, _ = _resolve_model_names(
            base_url=base_url,
            model_names=model_names,
            config_path=config_path,
            only_vram_ok=only_vram_ok,
        )
        if candidates:
            tier, model = route_prompt(prompt, assign_tiers(candidates), hint=hint)
            return RouteResult(tier=tier.value, model=model, ready=True)
        failure = "No models available"
    except Exception as exc:
        failure = str(exc)
    return RouteResult(tier="", model="", ready=False, error=failure)
71
+
72
+
73
def generate_text(
    model: str,
    prompt: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    timeout_seconds: int = 60,
) -> str:
    """POST a non-streaming generate request and return the response text.

    The request goes to ``{base_url}/api/generate`` (trailing slashes on
    ``base_url`` are ignored) with ``stream`` set to ``False``.

    Args:
        model: Model name sent in the request body.
        prompt: Prompt text sent in the request body.
        base_url: Base URL of the server.
        timeout_seconds: Timeout passed to ``requests.post``.

    Returns:
        The stripped ``"response"`` field of the JSON payload, or ``""`` when
        the payload is not an object or the field is missing or not a string.

    Raises:
        RuntimeError: If ``requests`` is not installed, or if the request,
            the HTTP status check, or JSON decoding fails.
    """
    try:
        # Imported lazily because requests is an optional dependency.
        import requests
    except ImportError as exc:
        raise RuntimeError(
            "generate_text requires optional dependency: pip install split-stack[ollama]"
        ) from exc

    try:
        response = requests.post(
            f"{base_url.rstrip('/')}/api/generate",
            json={"model": model, "prompt": prompt, "stream": False},
            timeout=timeout_seconds,
        )
        response.raise_for_status()
        # Decode inside the guarded region so a non-JSON body is reported as
        # RuntimeError like every other failure of this call.
        payload = response.json()
    except Exception as exc:
        raise RuntimeError(format_ollama_error(exc, model=model, base_url=base_url)) from exc

    # A null/array payload or a null "response" field must not crash on
    # .get()/.strip(); treat all of those as "no text".
    if not isinstance(payload, dict):
        return ""
    text = payload.get("response")
    return text.strip() if isinstance(text, str) else ""
98
+
99
+
100
def ask_prompt_json(
    prompt: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    model_names: list[str] | None = None,
    timeout_seconds: int = 60,
    config_path: str | None = None,
    only_vram_ok: bool = True,
    hint: str | None = None,
) -> AskResult:
    """Route ``prompt`` and generate a completion without raising.

    Routing is delegated to ``route_prompt_json`` and generation to
    ``generate_text``. A routing failure yields an ``AskResult`` with empty
    tier/model; a generation failure keeps the routed tier/model and puts the
    exception text in ``error``.
    """
    route = route_prompt_json(
        prompt,
        base_url=base_url,
        model_names=model_names,
        config_path=config_path,
        only_vram_ok=only_vram_ok,
        hint=hint,
    )
    if not route.ready:
        return AskResult(tier="", model="", text="", ready=False, error=route.error)

    tier, model = route.tier, route.model
    try:
        answer = generate_text(
            model,
            prompt,
            base_url=base_url,
            timeout_seconds=timeout_seconds,
        )
    except Exception as exc:
        # Report the failure but keep the routing decision visible.
        return AskResult(tier=tier, model=model, text="", ready=False, error=str(exc))
    return AskResult(tier=tier, model=model, text=answer, ready=True)
@@ -0,0 +1,131 @@
1
+ """Default model stacks for POC demos and compare benchmarks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from split_stack.community_picks import focus_stack, vram_tier_for_profile
8
+ from split_stack.discovery import list_model_inventory
9
+ from split_stack.presets import recommended_models
10
+
11
# Default three-model stack; also the fallback returned by models_for_preset()
# when a dynamic preset yields nothing.
DEFAULT_POC_STACK = [
    "gemma4:e4b",
    "qwen3:8b",
    "qwen3:14b",
]
# Alternative stack built only from Qwen tags.
QWEN_ONLY_STACK = [
    "qwen3:4b",
    "qwen3:8b",
    "qwen3:14b",
]
13
+
14
+
15
@dataclass(frozen=True)
class StackPreset:
    """Immutable description of a selectable model stack."""

    # Identifier matched by models_for_preset().
    id: str
    # Human-readable name of the preset.
    label: str
    # Static model tags; empty for presets resolved at call time.
    models: tuple[str, ...]
    # Short explanation of what the preset contains.
    description: str
21
+
22
+
23
# Presets exposed through list_stack_presets(). The "community_agentic" and
# "from_inventory" entries carry no static models: models_for_preset()
# resolves those two ids at call time.
STACK_PRESETS: tuple[StackPreset, ...] = (
    StackPreset(
        id="mixed_12gb",
        label="Mixed 12 GB (Gemma + Qwen)",
        description="Gemma lookup, Qwen 8B medium, Qwen 14B complex",
        models=tuple(DEFAULT_POC_STACK),
    ),
    StackPreset(
        id="qwen_only",
        label="Qwen only (4B / 8B / 14B)",
        description="Single-family ladder",
        models=tuple(QWEN_ONLY_STACK),
    ),
    StackPreset(
        id="recommended_12gb",
        label="Full 12 GB specialist",
        description="Gemma + Qwen + DeepSeek R1 for reasoning",
        # Evaluated once, when this module is imported.
        models=tuple(recommended_models("workstation_12gb")),
    ),
    StackPreset(
        id="community_agentic",
        label="Reddit agentic (M tier)",
        description="r/LocalLLaMA Apr 2026 — Gemma lookup + Qwen spine for agent loops",
        models=(),
    ),
    StackPreset(
        id="from_inventory",
        label="From your Ollama (auto ladder)",
        description="Picks small/mid/large tags from API + disk manifests",
        models=(),
    ),
)
55
+
56
+
57
def list_stack_presets() -> tuple[StackPreset, ...]:
    """Return the module-level ``STACK_PRESETS`` tuple unchanged."""
    presets = STACK_PRESETS
    return presets
59
+
60
+
61
def models_for_preset(
    preset_id: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    profile: str = "workstation_12gb",
) -> list[str]:
    """Return the model names for ``preset_id`` as a new list.

    ``"from_inventory"`` uses the suggested stack from
    ``list_model_inventory`` and ``"community_agentic"`` uses the "agentic"
    ``focus_stack`` for the VRAM tier of ``profile``; both fall back to
    ``DEFAULT_POC_STACK`` when nothing is found. Any other id is looked up in
    ``STACK_PRESETS``.

    Raises:
        ValueError: If ``preset_id`` matches no known preset.
    """
    if preset_id == "from_inventory":
        suggested = list_model_inventory(base_url=base_url).suggested_stack
        return list(suggested) if suggested else list(DEFAULT_POC_STACK)

    if preset_id == "community_agentic":
        picked = focus_stack("agentic", vram_tier=vram_tier_for_profile(profile))
        if picked and picked.models:
            return list(picked.models)
        return list(DEFAULT_POC_STACK)

    # Static presets: the first entry with a matching id wins.
    found = next((preset for preset in STACK_PRESETS if preset.id == preset_id), None)
    if found is not None:
        return list(found.models)

    valid = ", ".join(preset.id for preset in STACK_PRESETS)
    raise ValueError(f"Unknown stack preset '{preset_id}'. Valid: {valid}")
83
+
84
+
85
def available_model_pool(
    *,
    base_url: str = "http://127.0.0.1:11434",
    source: str = "both",
) -> tuple[list[str], str | None]:
    """Return model names from Ollama API, disk manifests, or both.

    ``source="api"`` and ``source="disk"`` return that listing in the order
    the inventory reports it. Any other value returns the sorted,
    de-duplicated union of the two. The second tuple element is the
    inventory's ``note``.
    """
    inventory = list_model_inventory(base_url=base_url)
    note = inventory.note

    if source == "api":
        return list(inventory.api_models), note
    if source == "disk":
        return list(inventory.disk_models), note

    # Every other value of source, including the default "both".
    combined = {*inventory.api_models, *inventory.disk_models}
    return sorted(combined), note
99
+
100
+
101
def resolve_installed_stack(
    installed: list[str],
    *,
    preset_id: str = "mixed_12gb",
    base_url: str = "http://127.0.0.1:11434",
) -> tuple[list[str], str | None]:
    """Pick preset models that exist in the installed pool; warn when falling back.

    Returns ``(models, warning)``:
      * two or more preset models installed -> those models, no warning;
      * otherwise, if anything is installed -> every installed model sorted by
        ``model_weight``, with a warning describing the fallback;
      * nothing installed -> the preset list itself, flagged as unverified.
    """
    wanted = models_for_preset(preset_id, base_url=base_url)
    available = set(installed)
    hits = [model for model in wanted if model in available]
    if len(hits) >= 2:
        return hits, None

    if not installed:
        return wanted, f"Using preset list (not verified): {', '.join(wanted)}"

    # Imported here, as in the original, rather than at module import time.
    from split_stack.model_registry import load_registry, model_weight

    registry = load_registry()
    by_weight = sorted(installed, key=lambda name: model_weight(name, registry))
    if len(by_weight) < 2:
        note = (
            f"Preset '{preset_id}' not fully available. "
            f"Using only {by_weight[0]} — need 2+ models for compare spread."
        )
    else:
        note = (
            f"Preset '{preset_id}' not fully available ({', '.join(wanted)}). "
            f"Using: {', '.join(by_weight)}"
        )
    return by_weight, note
split_stack/presets.py ADDED
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from split_stack.model_registry import normalize_deployment_profile
6
+ from split_stack.models import TierMap
7
+ from split_stack.tiering import assign_tiers
8
+
9
+
10
@dataclass(frozen=True)
class RecommendedStack:
    """Immutable recommended model list for one deployment profile."""

    # Profile name; matches the entry's key in RECOMMENDED_STACKS.
    profile: str
    # Model tags recommended for the profile.
    models: tuple[str, ...]
    # Short explanation of the recommendation.
    description: str
15
+
16
+
17
# Recommended stacks keyed by profile name. Each entry is keyed by its own
# ``profile`` field, so the key and the field cannot drift apart.
RECOMMENDED_STACKS: dict[str, RecommendedStack] = {
    stack.profile: stack
    for stack in (
        RecommendedStack(
            profile="workstation_8gb",
            models=("gemma4:e4b", "qwen3:8b"),
            description="Gemma min + Qwen 8b max (flat but honest on 8 GB)",
        ),
        RecommendedStack(
            profile="workstation_12gb",
            models=("gemma4:e4b", "qwen3:8b", "qwen3:14b", "deepseek-r1:8b"),
            description="Gemma lookup, Qwen mid/complex, DeepSeek R1 reasoning",
        ),
        RecommendedStack(
            profile="workstation_16gb",
            models=("gemma4:e4b", "qwen3:8b", "qwen3:14b"),
            description="Gemma lookup + Qwen mid/complex; add coder/reason tags via models=",
        ),
        RecommendedStack(
            profile="workstation_24gb",
            models=(
                "gemma4:e4b",
                "qwen3:8b",
                "qwen3:14b",
                "qwen3:30b-a3b",
                "deepseek-coder:6.7b",
            ),
            description="Full mixed ladder with MoE top and code specialist",
        ),
        RecommendedStack(
            profile="workstation_32gb",
            models=(
                "gemma4:e4b",
                "qwen3:8b",
                "qwen3:14b",
                "qwen3:30b-a3b",
                "deepseek-coder:6.7b",
                "deepseek-r1:8b",
            ),
            description="5090 class: MoE + separate reasoning and code specialists",
        ),
    )
}
57
+
58
+
59
def list_recommended_stacks() -> tuple[RecommendedStack, ...]:
    """Return every recommended stack, ordered by profile name."""
    ordered_profiles = sorted(RECOMMENDED_STACKS)
    return tuple(map(RECOMMENDED_STACKS.__getitem__, ordered_profiles))
61
+
62
+
63
def recommended_models(profile: str, *, quant: str | None = None) -> list[str]:
    """Return the recommended model names for ``profile``.

    ``profile`` is normalised with ``normalize_deployment_profile`` before the
    lookup, and the stack's models are passed through
    ``expand_models_for_quant`` together with ``quant``.

    Raises:
        ValueError: If the normalised profile has no recommended stack.
    """
    profile_name = normalize_deployment_profile(profile)
    if RECOMMENDED_STACKS.get(profile_name) is None:
        valid = ", ".join(sorted(RECOMMENDED_STACKS))
        raise ValueError(f"Unknown profile '{profile}'. Valid workstation stacks: {valid}")

    # Imported only after the profile is validated, as in the original.
    from split_stack.quantization import expand_models_for_quant

    base_models = list(RECOMMENDED_STACKS[profile_name].models)
    return expand_models_for_quant(base_models, profile_name, quant)
72
+
73
+
74
def assign_recommended_tiers(profile: str) -> TierMap:
    """Run ``assign_tiers`` over the recommended models for ``profile``."""
    models = recommended_models(profile)
    return assign_tiers(models)
@@ -0,0 +1,137 @@
1
+ """Quantization assumptions for VRAM sizing — not per-prompt routing.
2
+
3
+ Ollama tags (``gemma4:e4b``) do not encode quant. ``quant=`` tells split-stack which
4
+ pull format you use so VRAM filters and QAT-aware stack suggestions stay honest.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+
11
# Canonical quant mode names accepted after normalisation.
QUANT_MODES: tuple[str, ...] = (
    "default",
    "qat",
    "qat_mobile",
    "bf16",
)

# Runtime memory (GB) from Unsloth Gemma 4 QAT docs — UD-Q4_K_XL, not naive Q4_0.
# Keys are matched as substrings of the lowercased model name; the longest
# matching key wins (see _lookup_table_vram).
_GEMMA4_QAT_RUNTIME_GB: dict[str, int] = {
    "gemma4:e2b": 3,
    ":e2b": 3,
    "gemma4:e4b": 5,
    ":e4b": 5,
    "gemma4:12b": 7,
    "gemma4:26b-a4b": 15,
    "gemma4:26b": 15,
    "gemma4:31b": 18,
}

# Google mobile mixture QAT (UD-Q2_K_XL class).
_GEMMA4_QAT_MOBILE_RUNTIME_GB: dict[str, int] = {
    "gemma4:e2b": 3,
    ":e2b": 3,
    "gemma4:e4b": 4,
    ":e4b": 4,
}

# BF16 original sizes (Unsloth table, rounded up for filter headroom).
_GEMMA4_BF16_RUNTIME_GB: dict[str, int] = {
    "gemma4:e2b": 10,
    ":e2b": 10,
    "gemma4:e4b": 16,
    ":e4b": 16,
    "gemma4:12b": 24,
    "gemma4:26b-a4b": 51,
    "gemma4:26b": 51,
    "gemma4:31b": 62,
}

# Extra models that fit when Gemma pulls use QAT int4 (Unsloth hardware table).
# Keyed by lowercase profile name.
QAT_STACK_ADDITIONS: dict[str, tuple[str, ...]] = {
    "workstation_16gb": ("gemma4:26b-a4b",),
    "workstation_24gb": ("gemma4:31b",),
}
50
+
51
+
52
def normalize_quant_mode(name: str | None) -> str:
    """Map a user-supplied quant name to one of ``QUANT_MODES``.

    Matching ignores case and surrounding whitespace and treats ``-`` as
    ``_``. A handful of aliases (for example ``"q4"`` -> ``"qat"`` and
    ``"fp16"`` -> ``"bf16"``) are accepted.

    Args:
        name: Raw mode name. ``None``, ``""`` and whitespace-only strings all
            mean "not specified".

    Returns:
        The canonical mode name; ``"default"`` when nothing was specified.

    Raises:
        ValueError: If the name is neither a canonical mode nor an alias.
    """
    if not name:
        return "default"
    lowered = name.strip().lower().replace("-", "_")
    if not lowered:
        # Whitespace-only input carries no more information than "" or None,
        # so it gets the same answer instead of an "unknown mode" error.
        return "default"
    aliases = {
        "off": "default",
        "none": "default",
        "q4": "qat",
        "q4_qat": "qat",
        "qat_q4": "qat",
        "mobile": "qat_mobile",
        "qat_mobile_mixture": "qat_mobile",
        "fp16": "bf16",
        "full": "bf16",
    }
    lowered = aliases.get(lowered, lowered)
    if lowered not in QUANT_MODES:
        valid = ", ".join(QUANT_MODES)
        raise ValueError(f"Unknown quant mode '{name}'. Valid modes: {valid}")
    return lowered
72
+
73
+
74
def quant_from_env() -> str | None:
    """Return the stripped ``SPLIT_STACK_QUANT`` value, or ``None`` if unset or blank."""
    configured = os.environ.get("SPLIT_STACK_QUANT", "")
    stripped = configured.strip()
    if stripped:
        return stripped
    return None
77
+
78
+
79
def _lookup_table_vram(name: str, table: dict[str, int]) -> int | None:
    """Return the value of the longest ``table`` key contained in ``name``.

    ``name`` is lowercased before matching; keys are used as given. When
    several matching keys share the greatest length, the one that appears
    first in ``table`` wins. Returns ``None`` when no key matches.
    """
    needle = name.lower()
    hits = [key for key in table if key in needle]
    if not hits:
        return None
    # max() keeps the first of equally long keys, i.e. table order.
    return table[max(hits, key=len)]
88
+
89
+
90
def adjust_vram_for_quant(
    name: str,
    base_vram_gb: int | None,
    quant_mode: str | None,
) -> int | None:
    """Return effective VRAM for feasibility checks; does not change routing weight.

    In ``"default"`` mode, or when ``name`` matches no entry of the table for
    the selected mode, ``base_vram_gb`` is returned unchanged. Otherwise the
    table value replaces it.

    Raises:
        ValueError: Propagated from ``normalize_quant_mode`` for unknown modes.
    """
    mode = normalize_quant_mode(quant_mode)
    if mode == "default":
        return base_vram_gb

    table_by_mode = {
        "qat": _GEMMA4_QAT_RUNTIME_GB,
        "qat_mobile": _GEMMA4_QAT_MOBILE_RUNTIME_GB,
        "bf16": _GEMMA4_BF16_RUNTIME_GB,
    }
    override = _lookup_table_vram(name, table_by_mode[mode])
    return base_vram_gb if override is None else override
108
+
109
+
110
def expand_models_for_quant(models: list[str], profile: str, quant_mode: str | None) -> list[str]:
    """Add QAT-feasible models to a recommended stack (Gemma 4 only today).

    Args:
        models: Base model names; never modified.
        profile: Profile name, matched case-insensitively and ignoring
            surrounding whitespace against ``QAT_STACK_ADDITIONS``.
        quant_mode: Raw quant mode; extras are added only when it normalises
            to ``"qat"``.

    Returns:
        A new list in every case, so callers may mutate the result without
        affecting ``models``. For ``"qat"`` it is ``models`` followed by any
        extras for the profile that are not already present.

    Raises:
        ValueError: Propagated from ``normalize_quant_mode`` for unknown modes.
    """
    # Copy up front so the non-QAT path no longer hands back the caller's own
    # list object while the QAT path returns a fresh one.
    expanded = list(models)
    if normalize_quant_mode(quant_mode) != "qat":
        return expanded
    extras = QAT_STACK_ADDITIONS.get(profile.strip().lower(), ())
    for name in extras:
        if name not in expanded:
            expanded.append(name)
    return expanded
121
+
122
+
123
def pull_guidance_lines(quant_mode: str | None) -> list[str]:
    """Short pull hints for docs/CLI — not import-time spam.

    Returns an empty list for ``"default"`` mode and a fresh list of hint
    strings for every other mode.

    Raises:
        ValueError: Propagated from ``normalize_quant_mode`` for unknown modes.
    """
    mode = normalize_quant_mode(quant_mode)
    # Built per call so each caller receives its own list objects.
    hints_by_mode = {
        "default": [],
        "qat": [
            "Gemma 4 QAT: prefer Unsloth UD-Q4_K_XL GGUF over naive Google Q4_0 for llama.cpp/Ollama imports.",
            "Collections: google/gemma-4-qat-q4_0, unsloth/gemma-4-qat — see docs/LOCAL_MODELS.md",
        ],
        "qat_mobile": [
            "Gemma 4 mobile mixture QAT: google/gemma-4-qat-mobile (UD-Q2_K_XL on E2B/E4B).",
        ],
    }
    # Any remaining mode (currently only "bf16") gets the BF16 hint.
    bf16_hint = ["Gemma 4 BF16 pulls need datacenter profile or custom vram_gb in config."]
    return hints_by_mode.get(mode, bf16_hint)