split-stack 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split_stack/__init__.py +4 -1
- split_stack/cli.py +62 -0
- split_stack/discovery.py +1 -1
- split_stack/poc_models.py +136 -66
- split_stack/stack_health.py +360 -0
- {split_stack-0.2.0.dist-info → split_stack-0.3.0.dist-info}/METADATA +1 -1
- {split_stack-0.2.0.dist-info → split_stack-0.3.0.dist-info}/RECORD +11 -10
- {split_stack-0.2.0.dist-info → split_stack-0.3.0.dist-info}/WHEEL +0 -0
- {split_stack-0.2.0.dist-info → split_stack-0.3.0.dist-info}/entry_points.txt +0 -0
- {split_stack-0.2.0.dist-info → split_stack-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {split_stack-0.2.0.dist-info → split_stack-0.3.0.dist-info}/top_level.txt +0 -0
split_stack/__init__.py
CHANGED
|
@@ -46,11 +46,12 @@ from split_stack.session import (
|
|
|
46
46
|
route,
|
|
47
47
|
session_warnings,
|
|
48
48
|
)
|
|
49
|
+
from split_stack.stack_health import check_stack_health, format_stack_health
|
|
49
50
|
from split_stack.startup_tips import emit_import_tips, model_recommendation_report
|
|
50
51
|
from split_stack.tiering import assign_tiers, describe_tiers
|
|
51
52
|
from split_stack.validation import validate_tier_map
|
|
52
53
|
|
|
53
|
-
__version__ = "0.
|
|
54
|
+
__version__ = "0.3.0"
|
|
54
55
|
|
|
55
56
|
__all__ = [
|
|
56
57
|
"ComplexityTier",
|
|
@@ -98,6 +99,8 @@ __all__ = [
|
|
|
98
99
|
"score_prompt",
|
|
99
100
|
"session_warnings",
|
|
100
101
|
"stack_recommendation",
|
|
102
|
+
"check_stack_health",
|
|
103
|
+
"format_stack_health",
|
|
101
104
|
"usage_requirements",
|
|
102
105
|
"validate_tier_map",
|
|
103
106
|
"__version__",
|
split_stack/cli.py
CHANGED
|
@@ -20,6 +20,7 @@ from split_stack.ollama_generate import ask_prompt_json, route_prompt_json
|
|
|
20
20
|
from split_stack.requirements import UsageProfile, list_usage_profiles, usage_requirements
|
|
21
21
|
from split_stack.presets import assign_recommended_tiers, list_recommended_stacks, recommended_models
|
|
22
22
|
from split_stack.setup_wizard import format_setup_summary, plan_setup, run_setup
|
|
23
|
+
from split_stack.stack_health import check_stack_health, format_stack_health
|
|
23
24
|
from split_stack.tiering import assign_tiers, describe_tiers
|
|
24
25
|
|
|
25
26
|
|
|
@@ -159,7 +160,47 @@ def _cmd_profiles(args: argparse.Namespace) -> int:
|
|
|
159
160
|
return 0
|
|
160
161
|
|
|
161
162
|
|
|
163
|
+
def _cmd_doctor_check_stack(args: argparse.Namespace) -> int:
    """Run ``doctor --check-stack`` and print the stack health report.

    Reads ``models``, ``profile``, ``vram_gb``, ``quant``, ``base_url`` and
    ``json`` from *args*, builds a report with ``check_stack_health`` and
    prints it either as JSON or as the text produced by
    ``format_stack_health``.

    Returns:
        ``0`` when the report says the stack is ready, ``1`` otherwise. The
        same rule now applies to ``--json`` output so scripts can rely on the
        exit status promised by the ``--check-stack`` help text; a non-zero
        status from the JSON emitter still takes precedence.
    """
    models = None
    if args.models:
        # Tolerate stray commas/whitespace: "a, ,b," -> ["a", "b"].
        models = [part.strip() for part in args.models.split(",") if part.strip()]

    report = check_stack_health(
        profile=args.profile,
        vram_gb=args.vram_gb,
        quant=args.quant,
        base_url=args.base_url,
        models=models,
    )
    readiness_code = 0 if report.ready else 1

    if args.json:
        payload = {
            "ready": report.ready,
            "profile": report.profile,
            "vram_gb": report.vram_gb,
            "quant": report.quant,
            "recommended": list(report.recommended),
            "resolved": list(report.resolved),
            "missing": list(report.missing),
            "pool_size": report.pool_size,
            "inventory_note": report.inventory_note,
            "findings": [
                {
                    "level": item.level,
                    "code": item.code,
                    "message": item.message,
                    "models": list(item.models),
                }
                for item in report.findings
            ],
        }
        emit_code = _emit_json(payload)
        # Previously the emitter's status was returned as-is, so a not-ready
        # stack still exited 0 in JSON mode.
        return emit_code or readiness_code

    print(format_stack_health(report))
    return readiness_code
|
|
198
|
+
|
|
199
|
+
|
|
162
200
|
def _cmd_doctor(args: argparse.Namespace) -> int:
|
|
201
|
+
if args.check_stack:
|
|
202
|
+
return _cmd_doctor_check_stack(args)
|
|
203
|
+
|
|
163
204
|
advice = stack_recommendation(cursor_override_enabled=False)
|
|
164
205
|
print(f"Cursor model: {advice.cursor_model}")
|
|
165
206
|
print(f"Prose path: {advice.prose_path}")
|
|
@@ -505,6 +546,27 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
505
546
|
help="Path to split-stack.models.json (or set SPLIT_STACK_MODELS_CONFIG)",
|
|
506
547
|
)
|
|
507
548
|
_add_quant_arg(doctor_parser)
|
|
549
|
+
doctor_parser.add_argument(
|
|
550
|
+
"--check-stack",
|
|
551
|
+
action="store_true",
|
|
552
|
+
help="Offline stack health: missing models, duplicates, routing spread (exit 1 if not ready)",
|
|
553
|
+
)
|
|
554
|
+
doctor_parser.add_argument(
|
|
555
|
+
"--vram-gb",
|
|
556
|
+
type=int,
|
|
557
|
+
choices=[8, 12, 16, 24, 32],
|
|
558
|
+
help="GPU VRAM for recommended stack (alternative to --profile)",
|
|
559
|
+
)
|
|
560
|
+
doctor_parser.add_argument(
|
|
561
|
+
"--base-url",
|
|
562
|
+
default="http://127.0.0.1:11434",
|
|
563
|
+
help="Ollama base URL for inventory scan",
|
|
564
|
+
)
|
|
565
|
+
doctor_parser.add_argument(
|
|
566
|
+
"--models",
|
|
567
|
+
help="Comma-separated stack override when using --check-stack",
|
|
568
|
+
)
|
|
569
|
+
doctor_parser.add_argument("--json", action="store_true", help="JSON output (with --check-stack)")
|
|
508
570
|
doctor_parser.set_defaults(handler=_cmd_doctor)
|
|
509
571
|
|
|
510
572
|
requirements_parser = subparsers.add_parser(
|
split_stack/discovery.py
CHANGED
|
@@ -240,7 +240,7 @@ def audit_model_folders(
|
|
|
240
240
|
primary = home if home.is_dir() else None
|
|
241
241
|
return {
|
|
242
242
|
"primary_root": str(primary) if primary else None,
|
|
243
|
-
"scan_roots":
|
|
243
|
+
"scan_roots": [str(path) for path in manifest_search_paths()],
|
|
244
244
|
"tag_count": len(locations),
|
|
245
245
|
"locations": {tag: list(paths) for tag, paths in locations.items()},
|
|
246
246
|
"duplicates": duplicates,
|
split_stack/poc_models.py
CHANGED
|
@@ -1,85 +1,112 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Workstation stack helpers for demos and compare POC."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
|
|
7
|
-
from split_stack.community_picks import focus_stack, vram_tier_for_profile
|
|
8
7
|
from split_stack.discovery import list_model_inventory
|
|
9
|
-
from split_stack.presets import recommended_models
|
|
8
|
+
from split_stack.presets import RECOMMENDED_STACKS, recommended_models
|
|
9
|
+
from split_stack.quantization import QAT_STACK_ADDITIONS, normalize_quant_mode
|
|
10
|
+
from split_stack.session import profile_for_vram_gb
|
|
10
11
|
|
|
11
12
|
# Fallback stack used when inventory or community lookups yield nothing.
DEFAULT_POC_STACK = ["gemma4:e4b", "qwen3:8b", "qwen3:14b"]

# (vram_gb, display label) pairs exposed through list_vram_options().
VRAM_OPTIONS: tuple[tuple[int, str], ...] = (
    (8, "8 GB"),
    (12, "12 GB"),
    (16, "16 GB"),
    (24, "24 GB"),
    (32, "32 GB"),
)

# (quant mode id, display label) pairs exposed through list_quant_options().
# NOTE(review): ids presumably match what normalize_quant_mode accepts — confirm.
QUANT_OPTIONS: tuple[tuple[str, str], ...] = (
    ("default", "Default (PTQ)"),
    ("qat", "Gemma QAT (int4)"),
    ("qat_mobile", "Gemma mobile QAT"),
    ("bf16", "BF16 (full size)"),
)

# Preset id -> VRAM size used by models_for_preset() to pick a recommended stack.
# models_for_preset() tests the looked-up value for truthiness, so the 0 entry
# never selects a VRAM stack; "from_inventory" and "qwen_only" are also handled
# by explicit branches there before this table is consulted.
_PRESET_VRAM_ALIASES: dict[str, int] = {
    "mixed_12gb": 12,
    "mixed_16gb": 16,
    "qwen_only": 12,
    "recommended_12gb": 12,
    "from_inventory": 0,
}
|
|
13
36
|
|
|
14
37
|
|
|
15
38
|
@dataclass(frozen=True)
class WorkstationStack:
    """Immutable description of the recommended model stack for one VRAM size."""

    vram_gb: int  # VRAM size the stack was requested for
    profile: str  # profile name derived from vram_gb via profile_for_vram_gb
    quant: str  # quant mode after normalize_quant_mode
    models: tuple[str, ...]  # model tags from recommended_models(profile, quant=...)
    description: str  # description taken from RECOMMENDED_STACKS[profile]
    notes: tuple[str, ...]  # advisory notes (QAT additions / BF16 sizing), may be empty
|
|
21
46
|
|
|
22
47
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
id="mixed_12gb",
|
|
26
|
-
label="Mixed 12 GB (Gemma + Qwen)",
|
|
27
|
-
models=tuple(DEFAULT_POC_STACK),
|
|
28
|
-
description="Gemma lookup, Qwen 8B medium, Qwen 14B complex",
|
|
29
|
-
),
|
|
30
|
-
StackPreset(
|
|
31
|
-
id="qwen_only",
|
|
32
|
-
label="Qwen only (4B / 8B / 14B)",
|
|
33
|
-
models=tuple(QWEN_ONLY_STACK),
|
|
34
|
-
description="Single-family ladder",
|
|
35
|
-
),
|
|
36
|
-
StackPreset(
|
|
37
|
-
id="recommended_12gb",
|
|
38
|
-
label="Full 12 GB specialist",
|
|
39
|
-
models=tuple(recommended_models("workstation_12gb")),
|
|
40
|
-
description="Gemma + Qwen + DeepSeek R1 for reasoning",
|
|
41
|
-
),
|
|
42
|
-
StackPreset(
|
|
43
|
-
id="community_agentic",
|
|
44
|
-
label="Reddit agentic (M tier)",
|
|
45
|
-
models=tuple(),
|
|
46
|
-
description="r/LocalLLaMA Apr 2026 — Gemma lookup + Qwen spine for agent loops",
|
|
47
|
-
),
|
|
48
|
-
StackPreset(
|
|
49
|
-
id="from_inventory",
|
|
50
|
-
label="From your Ollama (auto ladder)",
|
|
51
|
-
models=tuple(),
|
|
52
|
-
description="Picks small/mid/large tags from API + disk manifests",
|
|
53
|
-
),
|
|
54
|
-
)
|
|
48
|
+
def list_vram_options() -> tuple[tuple[int, str], ...]:
    """Return the module-level ``VRAM_OPTIONS`` tuple of ``(vram_gb, label)`` pairs."""
    return VRAM_OPTIONS
|
|
55
50
|
|
|
56
51
|
|
|
57
|
-
def
|
|
58
|
-
return
|
|
52
|
+
def list_quant_options() -> tuple[tuple[str, str], ...]:
    """Return the module-level ``QUANT_OPTIONS`` tuple of ``(mode id, label)`` pairs."""
    return QUANT_OPTIONS
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def recommended_stack_for_vram(
    vram_gb: int,
    *,
    quant: str | None = None,
) -> WorkstationStack:
    """Build the recommended ``WorkstationStack`` for a VRAM size.

    The VRAM size is mapped to a profile, the profile's entry in
    ``RECOMMENDED_STACKS`` supplies the description, and
    ``recommended_models`` supplies the model tags for the normalized quant
    mode. One advisory note is attached for ``bf16``, or for ``qat`` when the
    profile has entries in ``QAT_STACK_ADDITIONS``.

    Raises:
        KeyError: if the derived profile has no entry in ``RECOMMENDED_STACKS``.
    """
    profile_name = profile_for_vram_gb(vram_gb)
    preset = RECOMMENDED_STACKS[profile_name]
    mode = normalize_quant_mode(quant)
    tags = tuple(recommended_models(profile_name, quant=mode))

    advisories: tuple[str, ...] = ()
    if mode == "bf16":
        advisories = ("BF16 uses full Gemma pull sizes — prefer 24 GB+ or datacenter.",)
    elif mode == "qat":
        additions = QAT_STACK_ADDITIONS.get(profile_name, ())
        if additions:
            advisories = (
                f"QAT adds {', '.join(additions)} on {vram_gb} GB — Gemma-only int4 runtime sizes.",
            )

    return WorkstationStack(
        vram_gb=vram_gb,
        profile=profile_name,
        quant=mode,
        models=tags,
        description=preset.description,
        notes=advisories,
    )
|
|
59
82
|
|
|
60
83
|
|
|
61
84
|
def models_for_preset(
    preset_id: str,
    *,
    base_url: str = "http://127.0.0.1:11434",
    profile: str | None = None,
    quant: str | None = None,
) -> list[str]:
    """Return the model tags for a named stack preset.

    Args:
        preset_id: ``"from_inventory"``, ``"qwen_only"``,
            ``"community_agentic"`` or a key of ``_PRESET_VRAM_ALIASES``.
        base_url: Ollama base URL, used only by ``"from_inventory"``.
        profile: Profile used only by ``"community_agentic"`` to pick a VRAM
            tier; defaults to ``"workstation_12gb"``.
        quant: Quant mode forwarded to ``recommended_stack_for_vram`` for the
            VRAM-aliased presets.

    Raises:
        ValueError: for an unknown preset id. The message lists the valid
            ids again, as the 0.2.0 message did.
    """
    if preset_id == "from_inventory":
        inventory = list_model_inventory(base_url=base_url)
        if inventory.suggested_stack:
            return list(inventory.suggested_stack)
        return list(DEFAULT_POC_STACK)

    if preset_id == "qwen_only":
        return ["qwen3:4b", "qwen3:8b", "qwen3:14b"]

    if preset_id == "community_agentic":
        # Imported lazily, as in the original, so the module does not need
        # community_picks at import time.
        from split_stack.community_picks import focus_stack, vram_tier_for_profile

        tier = vram_tier_for_profile(profile or "workstation_12gb")
        focus = focus_stack("agentic", vram_tier=tier)
        if focus and focus.models:
            return list(focus.models)
        return list(DEFAULT_POC_STACK)

    vram = _PRESET_VRAM_ALIASES.get(preset_id)
    if vram:  # 0 / None mean "no VRAM-based stack for this id"
        return list(recommended_stack_for_vram(vram, quant=quant).models)

    valid = ", ".join(sorted({*_PRESET_VRAM_ALIASES, "community_agentic"}))
    raise ValueError(f"Unknown stack preset '{preset_id}'. Valid: {valid}")
|
|
83
110
|
|
|
84
111
|
|
|
85
112
|
def available_model_pool(
|
|
@@ -87,7 +114,6 @@ def available_model_pool(
|
|
|
87
114
|
base_url: str = "http://127.0.0.1:11434",
|
|
88
115
|
source: str = "both",
|
|
89
116
|
) -> tuple[list[str], str | None]:
|
|
90
|
-
"""Return model names from Ollama API, disk manifests, or both."""
|
|
91
117
|
inventory = list_model_inventory(base_url=base_url)
|
|
92
118
|
if source == "api":
|
|
93
119
|
pool = list(inventory.api_models)
|
|
@@ -98,18 +124,15 @@ def available_model_pool(
|
|
|
98
124
|
return pool, inventory.note
|
|
99
125
|
|
|
100
126
|
|
|
101
|
-
def
|
|
127
|
+
def resolve_stack_against_pool(
|
|
128
|
+
desired: list[str],
|
|
102
129
|
installed: list[str],
|
|
103
|
-
|
|
104
|
-
preset_id: str = "mixed_12gb",
|
|
105
|
-
base_url: str = "http://127.0.0.1:11434",
|
|
106
|
-
) -> tuple[list[str], str | None]:
|
|
107
|
-
"""Pick preset models that exist in the installed pool; warn when falling back."""
|
|
108
|
-
desired = models_for_preset(preset_id, base_url=base_url)
|
|
130
|
+
) -> tuple[list[str], list[str], str | None]:
|
|
109
131
|
installed_set = set(installed)
|
|
110
132
|
matched = [name for name in desired if name in installed_set]
|
|
133
|
+
missing = [name for name in desired if name not in installed_set]
|
|
111
134
|
if len(matched) >= 2:
|
|
112
|
-
return matched, None
|
|
135
|
+
return matched, missing, None
|
|
113
136
|
|
|
114
137
|
if installed:
|
|
115
138
|
from split_stack.model_registry import load_registry, model_weight
|
|
@@ -118,14 +141,61 @@ def resolve_installed_stack(
|
|
|
118
141
|
ranked = sorted(installed, key=lambda name: model_weight(name, registry))
|
|
119
142
|
if len(ranked) >= 2:
|
|
120
143
|
warning = (
|
|
121
|
-
f"
|
|
144
|
+
f"Recommended stack not fully installed ({', '.join(desired)}). "
|
|
122
145
|
f"Using: {', '.join(ranked)}"
|
|
123
146
|
)
|
|
124
|
-
return ranked, warning
|
|
147
|
+
return ranked, missing, warning
|
|
125
148
|
warning = (
|
|
126
|
-
f"
|
|
127
|
-
f"Using only {ranked[0]} — need 2+ models for
|
|
149
|
+
f"Recommended stack not fully installed. "
|
|
150
|
+
f"Using only {ranked[0]} — need 2+ models for routing spread."
|
|
128
151
|
)
|
|
129
|
-
return ranked, warning
|
|
152
|
+
return ranked, missing, warning
|
|
153
|
+
|
|
154
|
+
return desired, missing, f"Using recommended list (not verified against disk): {', '.join(desired)}"
|
|
130
155
|
|
|
131
|
-
|
|
156
|
+
|
|
157
|
+
def resolve_installed_stack(
    installed: list[str],
    *,
    preset_id: str = "mixed_12gb",
    base_url: str = "http://127.0.0.1:11434",
    vram_gb: int | None = None,
    quant: str | None = None,
    models: list[str] | None = None,
) -> tuple[list[str], str | None]:
    """Resolve a desired stack against *installed* and return ``(models, warning)``.

    The desired stack is chosen by precedence: an explicit non-empty
    *models* list, then the recommended stack for *vram_gb*, then the preset
    named by *preset_id*. The missing-model list computed by
    ``resolve_stack_against_pool`` is discarded to keep the two-tuple return.
    """
    if models:
        wanted = models
    elif vram_gb is None:
        wanted = models_for_preset(preset_id, base_url=base_url, quant=quant)
    else:
        wanted = list(recommended_stack_for_vram(vram_gb, quant=quant).models)

    chosen, _absent, note = resolve_stack_against_pool(wanted, installed)
    return chosen, note
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def stack_payload(
    *,
    vram_gb: int = 16,
    quant: str | None = "qat",
    base_url: str = "http://127.0.0.1:11434",
    source: str = "both",
    models_override: list[str] | None = None,
) -> dict[str, object]:
    """Return a plain-dict summary of the stack for *vram_gb* versus the local pool.

    The desired models are *models_override* when non-empty, otherwise the
    recommended stack's models. They are resolved against the pool returned
    by ``available_model_pool``.
    """
    stack = recommended_stack_for_vram(vram_gb, quant=quant)
    wanted = list(stack.models) if not models_override else list(models_override)
    pool, pool_note = available_model_pool(base_url=base_url, source=source)
    usable, absent, caveat = resolve_stack_against_pool(wanted, pool)

    # NOTE(review): "ready" is always True here, regardless of how many
    # models resolved — confirm whether consumers expect a readiness signal.
    summary: dict[str, object] = {
        "ready": True,
        "vram_gb": vram_gb,
        "profile": stack.profile,
        "quant": stack.quant,
        "description": stack.description,
        "notes": list(stack.notes),
    }
    summary["models"] = wanted
    summary["resolved_models"] = usable
    summary["missing_models"] = absent
    summary["warning"] = caveat
    summary["inventory_note"] = pool_note
    summary["pool_size"] = len(pool)
    return summary
|
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
"""Offline stack health checks — missing models, duplicates, routing spread, quant honesty."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from split_stack.discovery import audit_model_folders, list_model_inventory
|
|
8
|
+
from split_stack.poc_models import resolve_stack_against_pool, stack_payload
|
|
9
|
+
from split_stack.presets import recommended_models
|
|
10
|
+
from split_stack.quantization import adjust_vram_for_quant, normalize_quant_mode, pull_guidance_lines
|
|
11
|
+
from split_stack.session import default_profile_from_env, profile_for_vram_gb
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class ModelTagInfo:
    """One model entry parsed from the Ollama ``/api/tags`` response."""

    name: str  # stripped "name" field of the entry
    size_bytes: int  # "size" field of the entry, 0 when absent
    quantization_level: str | None  # details["quantization_level"], None when absent/empty
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class StackHealthFinding:
    """A single line item of a stack health report."""

    level: str  # ok, warn, error
    code: str  # short machine-readable id, e.g. "missing", "present", "routing_spread"
    message: str  # human-readable explanation printed by format_stack_health
    models: tuple[str, ...] = ()  # model tags the finding refers to, if any
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class StackHealthReport:
    """Result of ``check_stack_health``: readiness flag plus supporting detail."""

    ready: bool  # True when at least two models resolved for routing
    profile: str  # profile the check ran against
    vram_gb: int | None  # VRAM size given or derived from the profile; None if unknown
    quant: str  # normalized quant mode
    recommended: tuple[str, ...]  # model tags the stack asks for
    resolved: tuple[str, ...]  # model tags chosen after matching against the pool
    missing: tuple[str, ...]  # recommended tags not found in the pool
    pool_size: int  # number of tags in the inventory pool that was consulted
    findings: tuple[StackHealthFinding, ...]  # ordered findings for display
    inventory_note: str | None = None  # note passed through from the inventory scan
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def check_stack_health(
    *,
    profile: str | None = None,
    vram_gb: int | None = None,
    quant: str | None = None,
    base_url: str = "http://127.0.0.1:11434",
    models: list[str] | None = None,
    source: str = "both",
) -> StackHealthReport:
    """Check recommended stack against local inventory (offline; no upstream registry).

    Args:
        profile: Profile to check; ignored when *vram_gb* is given. Falls back
            to ``default_profile_from_env()``.
        vram_gb: VRAM size; when given, the profile is derived from it.
        quant: Quant mode, normalized with ``normalize_quant_mode``.
        base_url: Ollama base URL used for the inventory scan.
        models: Explicit stack to check instead of the recommended one.
        source: ``"api"``, ``"disk"`` or anything else for the union of both.

    Returns:
        A ``StackHealthReport``. ``resolved`` only contains models that are
        actually present in the inventory, and ``ready`` is True only when at
        least two such models exist.
    """
    quant_mode = normalize_quant_mode(quant)
    if vram_gb is not None:
        resolved_profile = profile_for_vram_gb(vram_gb)
    else:
        resolved_profile = profile or default_profile_from_env()
        vram_gb = _vram_for_profile(resolved_profile)

    if models:
        recommended = tuple(models)
        payload = stack_payload(
            # _vram_for_profile returns None for an unknown profile; the
            # payload helper needs a concrete size, so fall back to 12.
            vram_gb=vram_gb or 12,
            quant=quant_mode,
            base_url=base_url,
            source=source,
            models_override=list(models),
        )
        candidates = tuple(payload.get("resolved_models") or ())
        missing = tuple(payload.get("missing_models") or ())
        note = payload.get("inventory_note")
        inventory_note = note if isinstance(note, str) else None
        pool_size = int(payload.get("pool_size") or 0)
    else:
        recommended = tuple(recommended_models(resolved_profile, quant=quant_mode))
        inventory = list_model_inventory(base_url=base_url)
        if source == "api":
            pool = list(inventory.api_models)
        elif source == "disk":
            pool = list(inventory.disk_models)
        else:
            pool = sorted(set(inventory.api_models) | set(inventory.disk_models))
        candidate_list, missing_list, _warning = resolve_stack_against_pool(
            list(recommended),
            pool,
        )
        candidates = tuple(candidate_list)
        missing = tuple(missing_list)
        inventory_note = inventory.note
        pool_size = len(pool)

    # With an empty pool the resolver hands back the desired list unverified,
    # so every "resolved" name is also "missing". Drop those names: a model
    # that is not installed cannot contribute to routing readiness.
    missing_set = frozenset(missing)
    resolved = tuple(name for name in candidates if name not in missing_set)

    findings: list[StackHealthFinding] = []

    for name in recommended:
        if name in missing_set:
            findings.append(
                StackHealthFinding(
                    level="error",
                    code="missing",
                    message=f"{name} is recommended but not found in local inventory.",
                    models=(name,),
                )
            )
        elif name in resolved:
            findings.append(
                StackHealthFinding(
                    level="ok",
                    code="present",
                    message=f"{name} is installed.",
                    models=(name,),
                )
            )

    if len(resolved) < 2:
        findings.append(
            StackHealthFinding(
                level="error",
                code="routing_spread",
                message=(
                    f"Only {len(resolved)} model(s) available for routing "
                    f"({', '.join(resolved) or 'none'}). Need at least 2 for tier spread."
                ),
                models=resolved,
            )
        )
    elif missing:
        findings.append(
            StackHealthFinding(
                level="warn",
                code="partial_stack",
                message=(
                    f"Using {len(resolved)} installed model(s); "
                    f"{len(missing)} recommended tag(s) missing."
                ),
                models=resolved,
            )
        )
    else:
        findings.append(
            StackHealthFinding(
                level="ok",
                code="stack_complete",
                message=f"All {len(recommended)} recommended model(s) are installed.",
                models=resolved,
            )
        )

    audit = audit_model_folders()
    # NOTE(review): the audit dict is seen exposing a "duplicates" entry;
    # confirm "duplicate_tags" is also populated, otherwise this warning
    # can never fire.
    duplicate_tags = audit.get("duplicate_tags") or []
    if duplicate_tags:
        dup_list = tuple(str(tag) for tag in duplicate_tags)
        findings.append(
            StackHealthFinding(
                level="warn",
                code="duplicate_tags",
                message=(
                    f"Duplicate tags across Ollama folders: {', '.join(dup_list)}. "
                    "Keep one models directory or run audit cleanup."
                ),
                models=dup_list,
            )
        )

    findings.extend(
        _quant_mismatch_findings(
            quant_mode=quant_mode,
            model_names=resolved,
            base_url=base_url,
        )
    )

    return StackHealthReport(
        ready=len(resolved) >= 2,
        profile=resolved_profile,
        vram_gb=vram_gb,
        quant=quant_mode,
        recommended=recommended,
        resolved=resolved,
        missing=missing,
        pool_size=pool_size,
        findings=tuple(findings),
        inventory_note=inventory_note,
    )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def format_stack_health(report: StackHealthReport) -> str:
    """Render *report* as a multi-line plain-text summary.

    The output has a header block (stack, recommended, resolved, optional
    missing, inventory, optional note), one line per finding prefixed with
    its upper-cased level, and a final routing verdict. Sections are
    separated by blank lines.
    """
    heading = report.profile if report.vram_gb is None else f"{report.vram_gb} GB"
    header = [
        f"Stack health ({heading}, quant={report.quant})",
        f" Recommended: {', '.join(report.recommended) or '-'}",
        f" Resolved: {', '.join(report.resolved) or '-'}",
    ]
    if report.missing:
        header.append(f" Missing: {', '.join(report.missing)}")
    header.append(f" Inventory: {report.pool_size} tag(s) seen (API + disk)")
    if report.inventory_note:
        header.append(f" Note: {report.inventory_note}")

    # "ok"/"warn"/"error" upper-case to the same labels the lookup table used,
    # and unknown levels already fell back to upper().
    body = [f" [{entry.level.upper()}] {entry.message}" for entry in report.findings]

    if report.ready:
        verdict = "Routing: ready (2+ models)"
    else:
        verdict = "Routing: not ready — install more models or adjust profile/VRAM."

    return "\n".join([*header, "", *body, "", verdict])
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _vram_for_profile(profile: str) -> int | None:
    """Return ``assumed_vram_gb`` of *profile*, or ``None`` for an unknown profile."""
    # Imported lazily, matching the original placement inside the function.
    from split_stack.model_registry import DEPLOYMENT_PROFILES

    entry = DEPLOYMENT_PROFILES.get(profile)
    return entry.assumed_vram_gb if entry is not None else None
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _fetch_ollama_tag_info(
|
|
223
|
+
*,
|
|
224
|
+
base_url: str = "http://127.0.0.1:11434",
|
|
225
|
+
) -> dict[str, ModelTagInfo]:
|
|
226
|
+
try:
|
|
227
|
+
import requests
|
|
228
|
+
except ImportError:
|
|
229
|
+
return {}
|
|
230
|
+
|
|
231
|
+
url = f"{base_url.rstrip('/')}/api/tags"
|
|
232
|
+
try:
|
|
233
|
+
response = requests.get(url, timeout=5)
|
|
234
|
+
response.raise_for_status()
|
|
235
|
+
except Exception:
|
|
236
|
+
return {}
|
|
237
|
+
|
|
238
|
+
out: dict[str, ModelTagInfo] = {}
|
|
239
|
+
payload = response.json() or {}
|
|
240
|
+
for item in payload.get("models") or []:
|
|
241
|
+
name = (item.get("name") or "").strip()
|
|
242
|
+
if not name:
|
|
243
|
+
continue
|
|
244
|
+
details = item.get("details") or {}
|
|
245
|
+
quant = details.get("quantization_level")
|
|
246
|
+
out[name] = ModelTagInfo(
|
|
247
|
+
name=name,
|
|
248
|
+
size_bytes=int(item.get("size") or 0),
|
|
249
|
+
quantization_level=str(quant) if quant else None,
|
|
250
|
+
)
|
|
251
|
+
return out
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _is_gemma_tag(name: str) -> bool:
|
|
255
|
+
family = name.split(":")[0].lower()
|
|
256
|
+
return family.startswith("gemma")
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _tag_suggests_qat(name: str) -> bool:
|
|
260
|
+
lowered = name.lower()
|
|
261
|
+
markers = ("qat", "ud-q4", "ud_q4", "unsloth", "gemma-4-qat")
|
|
262
|
+
return any(marker in lowered for marker in markers)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _tag_suggests_bf16(name: str) -> bool:
|
|
266
|
+
lowered = name.lower()
|
|
267
|
+
return "bf16" in lowered or "-it-bf16" in lowered
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _quant_mismatch_findings(
    *,
    quant_mode: str,
    model_names: tuple[str, ...],
    base_url: str,
) -> list[StackHealthFinding]:
    """Compare the requested quant mode with what Ollama reports for Gemma tags.

    Nothing is checked for ``quant_mode == "default"``. When tag metadata
    cannot be fetched, a single ``quant_check_skipped`` warning is returned.
    Otherwise each Gemma tag in *model_names* that Ollama knows about yields
    at most one finding, and one ``quant_hint`` is appended when any
    ``quant_mismatch`` was produced and guidance text is available.
    """
    if quant_mode == "default":
        return []

    known_tags = _fetch_ollama_tag_info(base_url=base_url)
    if not known_tags:
        skipped = StackHealthFinding(
            level="warn",
            code="quant_check_skipped",
            message=(
                "Quant check skipped — Ollama /api/tags unreachable or "
                "install split-stack[ollama] for requests."
            ),
        )
        return [skipped]

    results: list[StackHealthFinding] = []

    for tag in (candidate for candidate in model_names if _is_gemma_tag(candidate)):
        meta = known_tags.get(tag)
        if meta is None:
            # Ollama did not list this exact name; nothing to compare against.
            continue

        # NOTE(review): base_vram_gb=999 looks like a "no ceiling" sentinel
        # for adjust_vram_for_quant — confirm against that helper.
        expected_gb = adjust_vram_for_quant(tag, base_vram_gb=999, quant_mode=quant_mode)
        size_gb = meta.size_bytes / (1024**3) if meta.size_bytes else 0.0
        label = meta.quantization_level or "unknown"

        if quant_mode == "bf16":
            looks_full_size = _tag_suggests_bf16(tag) or label in {"F16", "BF16", "FP16"}
            if not looks_full_size:
                results.append(
                    StackHealthFinding(
                        level="warn",
                        code="quant_mismatch",
                        message=(
                            f"{tag}: quant=bf16 but installed as {label} "
                            f"({size_gb:.1f} GB on disk). VRAM sizing may be wrong."
                        ),
                        models=(tag,),
                    )
                )
        elif _tag_suggests_qat(tag):
            # qat / qat_mobile: the tag name itself advertises a QAT build.
            results.append(
                StackHealthFinding(
                    level="ok",
                    code="quant_ok",
                    message=f"{tag}: tag looks QAT-aligned ({label}, {size_gb:.1f} GB).",
                    models=(tag,),
                )
            )
        else:
            # qat / qat_mobile without a QAT marker: flag library PTQ pulls
            # either by reported quant level or by on-disk size.
            too_big = expected_gb is not None and size_gb > expected_gb * 1.35
            ptq_label = label in {"Q4_K_M", "Q4_0", "Q5_K_M", "Q5_0"}
            if too_big or ptq_label:
                if expected_gb:
                    expected_text = f"~{expected_gb} GB runtime"
                else:
                    expected_text = "smaller QAT runtime"
                results.append(
                    StackHealthFinding(
                        level="warn",
                        code="quant_mismatch",
                        message=(
                            f"{tag}: quant={quant_mode} expects {expected_text} but installed "
                            f"{label} at {size_gb:.1f} GB — likely library PTQ, not QAT."
                        ),
                        models=(tag,),
                    )
                )

    has_mismatch = any(entry.code == "quant_mismatch" for entry in results)
    if has_mismatch:
        guidance = pull_guidance_lines(quant_mode)
        if guidance:
            results.append(
                StackHealthFinding(
                    level="warn",
                    code="quant_hint",
                    message=guidance[0],
                )
            )

    return results
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
split_stack/__init__.py,sha256=
|
|
1
|
+
split_stack/__init__.py,sha256=4xE6uDgiYDt7yLVFaSGrS8a7g85EfNndTbHxnMNEUmw,2999
|
|
2
2
|
split_stack/__main__.py,sha256=X79S2PFqrYOGd2hLQtB0-uqsdquj9Iuez7SYR64r0Ps,90
|
|
3
3
|
split_stack/advice.py,sha256=KBw-Ly35O-4KaJgXwabtJqIFeaoP65wOj7VscodUiWc,427
|
|
4
4
|
split_stack/benchmark.py,sha256=lx2sJvlPlzHsCkGxQ5YS_Z4duzt25yhmqQpNSdAsOPM,3170
|
|
5
|
-
split_stack/cli.py,sha256=
|
|
5
|
+
split_stack/cli.py,sha256=nZgwDioIua7GoITTh3p5z_X1JA1m0mcuebf_Gqt36Ew,27129
|
|
6
6
|
split_stack/community_picks.py,sha256=Jaq0Wxs3_U4ix8kYLu2ge_VDMgym5xqKGtPsfxdEQhs,8148
|
|
7
7
|
split_stack/compare.py,sha256=4u4OQ38I6DhMft-TYrAKljLJ8vIam-SlfiT9hcjun-Q,6665
|
|
8
8
|
split_stack/complexity.py,sha256=R3N2t5QkGIkprx4fKnEqmsTPbTdsRo1Ap8-Er0lDEXk,1884
|
|
9
|
-
split_stack/discovery.py,sha256=
|
|
9
|
+
split_stack/discovery.py,sha256=l__HjKmy3ESjX9lVnrj7A77kavZhxmdIdI5vtB_iX1E,9711
|
|
10
10
|
split_stack/hints.py,sha256=jsfbWzuPDgZwrfmuuwhh1QHSFA7650Yy4VOuouJFSww,3125
|
|
11
11
|
split_stack/local_models.py,sha256=IEHrE9w0tmJ0Nb9toP50EJcOQMbO8VXX4id0TWS-MkY,2270
|
|
12
12
|
split_stack/model_guide.py,sha256=_xKFfyI30kN2VM8GgY4a05SIqPXTyDluOUZUcDpvuVw,9782
|
|
@@ -14,19 +14,20 @@ split_stack/model_registry.py,sha256=vXPj0-sRPv0tPNkodVpz5WyucOfrmawrWiTq8yaEYls
|
|
|
14
14
|
split_stack/models.py,sha256=nDqFdeSGmcPPUCOVEEsArvUJApehNWvMFB97TaBI7zY,1820
|
|
15
15
|
split_stack/ollama_errors.py,sha256=nu3qLCGIcS3asX03AEKKALuiCnvLmr5BhP0RcRHhtlQ,962
|
|
16
16
|
split_stack/ollama_generate.py,sha256=qHJN1izaS855nL2TdiPnYV05TZPVpz03tqlXljcxvNo,3789
|
|
17
|
-
split_stack/poc_models.py,sha256=
|
|
17
|
+
split_stack/poc_models.py,sha256=EfUMJaMkLGIsUCeeK6DYxcRXON7EyWGzU3b3_PiUsQg,6562
|
|
18
18
|
split_stack/presets.py,sha256=1E7UsT0bahQMZZxBHr0iG0fxEIvq7f0VPKxPTRj_CI0,2580
|
|
19
19
|
split_stack/quantization.py,sha256=zZMs7aiqksUyVXzKK5JxQDEDYiYXYza2gXkMlWQqywE,4311
|
|
20
20
|
split_stack/requirements.py,sha256=QK7lxn7jVU39z2IZByEKOiv1xz3G2SIs96uRsspskdY,9475
|
|
21
21
|
split_stack/routing.py,sha256=99fZilyXddkZIhTaPQEsE6P2EDDuaXo4n1Xqs28Zq5Y,3219
|
|
22
22
|
split_stack/session.py,sha256=_YkoNhsOp_4u14NgzWSkIDDNiCrhUt0-eU3e5y6lTfI,7959
|
|
23
23
|
split_stack/setup_wizard.py,sha256=EyCr_QtiUZMBW20mEjYbjvHBg6tIqAUafLHZd_9dqBY,8195
|
|
24
|
+
split_stack/stack_health.py,sha256=Mr4TvwOAkRsriucLDRxqQU3En4IO4uZTjgaoP-1QJeU,12163
|
|
24
25
|
split_stack/startup_tips.py,sha256=CY6k_lBSgmulbe0PLH3sIy6qL3B1wtQeLGGpHisskNs,5524
|
|
25
26
|
split_stack/tiering.py,sha256=M4outcZwO-m-th7OYbKYRILB5trnxJU6oQI0pEo_MsY,2163
|
|
26
27
|
split_stack/validation.py,sha256=-JMuDnia1Rd3fMYtHVHtJ-GW_4Rrbijl9MKlTnfpCyw,3056
|
|
27
|
-
split_stack-0.
|
|
28
|
-
split_stack-0.
|
|
29
|
-
split_stack-0.
|
|
30
|
-
split_stack-0.
|
|
31
|
-
split_stack-0.
|
|
32
|
-
split_stack-0.
|
|
28
|
+
split_stack-0.3.0.dist-info/licenses/LICENSE,sha256=scGzQpUJlz3hAQQfj_Ukpj_rGSSDKp2TgqP5wzchytQ,1069
|
|
29
|
+
split_stack-0.3.0.dist-info/METADATA,sha256=qbPkSiM_X_0J51Qc7qNB2UDuqgm35IuOy01ah2eU-NQ,13655
|
|
30
|
+
split_stack-0.3.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
31
|
+
split_stack-0.3.0.dist-info/entry_points.txt,sha256=ZByxKJLPs5y8blfIgplNejfVkjQuH2F9A99JoH1m5gw,47
|
|
32
|
+
split_stack-0.3.0.dist-info/top_level.txt,sha256=gfw1Q0n9UcJE069uO9G-TPSU9P1fwvOj0nhUYKix2pM,12
|
|
33
|
+
split_stack-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|