split-stack 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
split_stack/__init__.py CHANGED
@@ -46,11 +46,12 @@ from split_stack.session import (
46
46
  route,
47
47
  session_warnings,
48
48
  )
49
+ from split_stack.stack_health import check_stack_health, format_stack_health
49
50
  from split_stack.startup_tips import emit_import_tips, model_recommendation_report
50
51
  from split_stack.tiering import assign_tiers, describe_tiers
51
52
  from split_stack.validation import validate_tier_map
52
53
 
53
- __version__ = "0.2.0"
54
+ __version__ = "0.3.0"
54
55
 
55
56
  __all__ = [
56
57
  "ComplexityTier",
@@ -98,6 +99,8 @@ __all__ = [
98
99
  "score_prompt",
99
100
  "session_warnings",
100
101
  "stack_recommendation",
102
+ "check_stack_health",
103
+ "format_stack_health",
101
104
  "usage_requirements",
102
105
  "validate_tier_map",
103
106
  "__version__",
split_stack/cli.py CHANGED
@@ -20,6 +20,7 @@ from split_stack.ollama_generate import ask_prompt_json, route_prompt_json
20
20
  from split_stack.requirements import UsageProfile, list_usage_profiles, usage_requirements
21
21
  from split_stack.presets import assign_recommended_tiers, list_recommended_stacks, recommended_models
22
22
  from split_stack.setup_wizard import format_setup_summary, plan_setup, run_setup
23
+ from split_stack.stack_health import check_stack_health, format_stack_health
23
24
  from split_stack.tiering import assign_tiers, describe_tiers
24
25
 
25
26
 
@@ -159,7 +160,47 @@ def _cmd_profiles(args: argparse.Namespace) -> int:
159
160
  return 0
160
161
 
161
162
 
163
+ def _cmd_doctor_check_stack(args: argparse.Namespace) -> int:
164
+ models = None
165
+ if args.models:
166
+ models = [part.strip() for part in args.models.split(",") if part.strip()]
167
+ report = check_stack_health(
168
+ profile=args.profile,
169
+ vram_gb=args.vram_gb,
170
+ quant=args.quant,
171
+ base_url=args.base_url,
172
+ models=models,
173
+ )
174
+ if args.json:
175
+ payload = {
176
+ "ready": report.ready,
177
+ "profile": report.profile,
178
+ "vram_gb": report.vram_gb,
179
+ "quant": report.quant,
180
+ "recommended": list(report.recommended),
181
+ "resolved": list(report.resolved),
182
+ "missing": list(report.missing),
183
+ "pool_size": report.pool_size,
184
+ "inventory_note": report.inventory_note,
185
+ "findings": [
186
+ {
187
+ "level": item.level,
188
+ "code": item.code,
189
+ "message": item.message,
190
+ "models": list(item.models),
191
+ }
192
+ for item in report.findings
193
+ ],
194
+ }
195
+ return _emit_json(payload)
196
+ print(format_stack_health(report))
197
+ return 0 if report.ready else 1
198
+
199
+
162
200
  def _cmd_doctor(args: argparse.Namespace) -> int:
201
+ if args.check_stack:
202
+ return _cmd_doctor_check_stack(args)
203
+
163
204
  advice = stack_recommendation(cursor_override_enabled=False)
164
205
  print(f"Cursor model: {advice.cursor_model}")
165
206
  print(f"Prose path: {advice.prose_path}")
@@ -505,6 +546,27 @@ def main(argv: list[str] | None = None) -> int:
505
546
  help="Path to split-stack.models.json (or set SPLIT_STACK_MODELS_CONFIG)",
506
547
  )
507
548
  _add_quant_arg(doctor_parser)
549
+ doctor_parser.add_argument(
550
+ "--check-stack",
551
+ action="store_true",
552
+ help="Offline stack health: missing models, duplicates, routing spread (exit 1 if not ready)",
553
+ )
554
+ doctor_parser.add_argument(
555
+ "--vram-gb",
556
+ type=int,
557
+ choices=[8, 12, 16, 24, 32],
558
+ help="GPU VRAM for recommended stack (alternative to --profile)",
559
+ )
560
+ doctor_parser.add_argument(
561
+ "--base-url",
562
+ default="http://127.0.0.1:11434",
563
+ help="Ollama base URL for inventory scan",
564
+ )
565
+ doctor_parser.add_argument(
566
+ "--models",
567
+ help="Comma-separated stack override when using --check-stack",
568
+ )
569
+ doctor_parser.add_argument("--json", action="store_true", help="JSON output (with --check-stack)")
508
570
  doctor_parser.set_defaults(handler=_cmd_doctor)
509
571
 
510
572
  requirements_parser = subparsers.add_parser(
split_stack/discovery.py CHANGED
@@ -240,7 +240,7 @@ def audit_model_folders(
240
240
  primary = home if home.is_dir() else None
241
241
  return {
242
242
  "primary_root": str(primary) if primary else None,
243
- "scan_roots": list(manifest_search_paths()),
243
+ "scan_roots": [str(path) for path in manifest_search_paths()],
244
244
  "tag_count": len(locations),
245
245
  "locations": {tag: list(paths) for tag, paths in locations.items()},
246
246
  "duplicates": duplicates,
split_stack/poc_models.py CHANGED
@@ -1,85 +1,112 @@
1
- """Default model stacks for POC demos and compare benchmarks."""
1
+ """Workstation stack helpers for demos and compare POC."""
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
5
  from dataclasses import dataclass
6
6
 
7
- from split_stack.community_picks import focus_stack, vram_tier_for_profile
8
7
  from split_stack.discovery import list_model_inventory
9
- from split_stack.presets import recommended_models
8
+ from split_stack.presets import RECOMMENDED_STACKS, recommended_models
9
+ from split_stack.quantization import QAT_STACK_ADDITIONS, normalize_quant_mode
10
+ from split_stack.session import profile_for_vram_gb
10
11
 
11
12
  DEFAULT_POC_STACK = ["gemma4:e4b", "qwen3:8b", "qwen3:14b"]
12
- QWEN_ONLY_STACK = ["qwen3:4b", "qwen3:8b", "qwen3:14b"]
13
+
14
+ VRAM_OPTIONS: tuple[tuple[int, str], ...] = (
15
+ (8, "8 GB"),
16
+ (12, "12 GB"),
17
+ (16, "16 GB"),
18
+ (24, "24 GB"),
19
+ (32, "32 GB"),
20
+ )
21
+
22
+ QUANT_OPTIONS: tuple[tuple[str, str], ...] = (
23
+ ("default", "Default (PTQ)"),
24
+ ("qat", "Gemma QAT (int4)"),
25
+ ("qat_mobile", "Gemma mobile QAT"),
26
+ ("bf16", "BF16 (full size)"),
27
+ )
28
+
29
+ _PRESET_VRAM_ALIASES: dict[str, int] = {
30
+ "mixed_12gb": 12,
31
+ "mixed_16gb": 16,
32
+ "qwen_only": 12,
33
+ "recommended_12gb": 12,
34
+ "from_inventory": 0,
35
+ }
13
36
 
14
37
 
15
38
  @dataclass(frozen=True)
16
- class StackPreset:
17
- id: str
18
- label: str
39
+ class WorkstationStack:
40
+ vram_gb: int
41
+ profile: str
42
+ quant: str
19
43
  models: tuple[str, ...]
20
44
  description: str
45
+ notes: tuple[str, ...]
21
46
 
22
47
 
23
- STACK_PRESETS: tuple[StackPreset, ...] = (
24
- StackPreset(
25
- id="mixed_12gb",
26
- label="Mixed 12 GB (Gemma + Qwen)",
27
- models=tuple(DEFAULT_POC_STACK),
28
- description="Gemma lookup, Qwen 8B medium, Qwen 14B complex",
29
- ),
30
- StackPreset(
31
- id="qwen_only",
32
- label="Qwen only (4B / 8B / 14B)",
33
- models=tuple(QWEN_ONLY_STACK),
34
- description="Single-family ladder",
35
- ),
36
- StackPreset(
37
- id="recommended_12gb",
38
- label="Full 12 GB specialist",
39
- models=tuple(recommended_models("workstation_12gb")),
40
- description="Gemma + Qwen + DeepSeek R1 for reasoning",
41
- ),
42
- StackPreset(
43
- id="community_agentic",
44
- label="Reddit agentic (M tier)",
45
- models=tuple(),
46
- description="r/LocalLLaMA Apr 2026 — Gemma lookup + Qwen spine for agent loops",
47
- ),
48
- StackPreset(
49
- id="from_inventory",
50
- label="From your Ollama (auto ladder)",
51
- models=tuple(),
52
- description="Picks small/mid/large tags from API + disk manifests",
53
- ),
54
- )
48
+ def list_vram_options() -> tuple[tuple[int, str], ...]:
49
+ return VRAM_OPTIONS
55
50
 
56
51
 
57
- def list_stack_presets() -> tuple[StackPreset, ...]:
58
- return STACK_PRESETS
52
+ def list_quant_options() -> tuple[tuple[str, str], ...]:
53
+ return QUANT_OPTIONS
54
+
55
+
56
+ def recommended_stack_for_vram(
57
+ vram_gb: int,
58
+ *,
59
+ quant: str | None = None,
60
+ ) -> WorkstationStack:
61
+ profile = profile_for_vram_gb(vram_gb)
62
+ stack = RECOMMENDED_STACKS[profile]
63
+ quant_mode = normalize_quant_mode(quant)
64
+ models = recommended_models(profile, quant=quant_mode)
65
+ notes: list[str] = []
66
+ if quant_mode == "qat":
67
+ extras = QAT_STACK_ADDITIONS.get(profile, ())
68
+ if extras:
69
+ notes.append(
70
+ f"QAT adds {', '.join(extras)} on {vram_gb} GB — Gemma-only int4 runtime sizes."
71
+ )
72
+ elif quant_mode == "bf16":
73
+ notes.append("BF16 uses full Gemma pull sizes — prefer 24 GB+ or datacenter.")
74
+ return WorkstationStack(
75
+ vram_gb=vram_gb,
76
+ profile=profile,
77
+ quant=quant_mode,
78
+ models=tuple(models),
79
+ description=stack.description,
80
+ notes=tuple(notes),
81
+ )
59
82
 
60
83
 
61
84
  def models_for_preset(
62
85
  preset_id: str,
63
86
  *,
64
87
  base_url: str = "http://127.0.0.1:11434",
65
- profile: str = "workstation_12gb",
88
+ profile: str | None = None,
89
+ quant: str | None = None,
66
90
  ) -> list[str]:
67
91
  if preset_id == "from_inventory":
68
92
  inventory = list_model_inventory(base_url=base_url)
69
93
  if inventory.suggested_stack:
70
94
  return list(inventory.suggested_stack)
71
95
  return list(DEFAULT_POC_STACK)
96
+ if preset_id == "qwen_only":
97
+ return ["qwen3:4b", "qwen3:8b", "qwen3:14b"]
72
98
  if preset_id == "community_agentic":
73
- tier = vram_tier_for_profile(profile)
74
- stack = focus_stack("agentic", vram_tier=tier)
75
- if stack and stack.models:
76
- return list(stack.models)
99
+ from split_stack.community_picks import focus_stack, vram_tier_for_profile
100
+
101
+ tier = vram_tier_for_profile(profile or "workstation_12gb")
102
+ focus = focus_stack("agentic", vram_tier=tier)
103
+ if focus and focus.models:
104
+ return list(focus.models)
77
105
  return list(DEFAULT_POC_STACK)
78
- for item in STACK_PRESETS:
79
- if item.id == preset_id:
80
- return list(item.models)
81
- valid = ", ".join(item.id for item in STACK_PRESETS)
82
- raise ValueError(f"Unknown stack preset '{preset_id}'. Valid: {valid}")
106
+ vram = _PRESET_VRAM_ALIASES.get(preset_id)
107
+ if vram:
108
+ return list(recommended_stack_for_vram(vram, quant=quant).models)
109
+ raise ValueError(f"Unknown stack preset '{preset_id}'.")
83
110
 
84
111
 
85
112
  def available_model_pool(
@@ -87,7 +114,6 @@ def available_model_pool(
87
114
  base_url: str = "http://127.0.0.1:11434",
88
115
  source: str = "both",
89
116
  ) -> tuple[list[str], str | None]:
90
- """Return model names from Ollama API, disk manifests, or both."""
91
117
  inventory = list_model_inventory(base_url=base_url)
92
118
  if source == "api":
93
119
  pool = list(inventory.api_models)
@@ -98,18 +124,15 @@ def available_model_pool(
98
124
  return pool, inventory.note
99
125
 
100
126
 
101
- def resolve_installed_stack(
127
+ def resolve_stack_against_pool(
128
+ desired: list[str],
102
129
  installed: list[str],
103
- *,
104
- preset_id: str = "mixed_12gb",
105
- base_url: str = "http://127.0.0.1:11434",
106
- ) -> tuple[list[str], str | None]:
107
- """Pick preset models that exist in the installed pool; warn when falling back."""
108
- desired = models_for_preset(preset_id, base_url=base_url)
130
+ ) -> tuple[list[str], list[str], str | None]:
109
131
  installed_set = set(installed)
110
132
  matched = [name for name in desired if name in installed_set]
133
+ missing = [name for name in desired if name not in installed_set]
111
134
  if len(matched) >= 2:
112
- return matched, None
135
+ return matched, missing, None
113
136
 
114
137
  if installed:
115
138
  from split_stack.model_registry import load_registry, model_weight
@@ -118,14 +141,61 @@ def resolve_installed_stack(
118
141
  ranked = sorted(installed, key=lambda name: model_weight(name, registry))
119
142
  if len(ranked) >= 2:
120
143
  warning = (
121
- f"Preset '{preset_id}' not fully available ({', '.join(desired)}). "
144
+ f"Recommended stack not fully installed ({', '.join(desired)}). "
122
145
  f"Using: {', '.join(ranked)}"
123
146
  )
124
- return ranked, warning
147
+ return ranked, missing, warning
125
148
  warning = (
126
- f"Preset '{preset_id}' not fully available. "
127
- f"Using only {ranked[0]} — need 2+ models for compare spread."
149
+ f"Recommended stack not fully installed. "
150
+ f"Using only {ranked[0]} — need 2+ models for routing spread."
128
151
  )
129
- return ranked, warning
152
+ return ranked, missing, warning
153
+
154
+ return desired, missing, f"Using recommended list (not verified against disk): {', '.join(desired)}"
130
155
 
131
- return desired, f"Using preset list (not verified): {', '.join(desired)}"
156
+
157
+ def resolve_installed_stack(
158
+ installed: list[str],
159
+ *,
160
+ preset_id: str = "mixed_12gb",
161
+ base_url: str = "http://127.0.0.1:11434",
162
+ vram_gb: int | None = None,
163
+ quant: str | None = None,
164
+ models: list[str] | None = None,
165
+ ) -> tuple[list[str], str | None]:
166
+ if models:
167
+ desired = models
168
+ elif vram_gb is not None:
169
+ desired = list(recommended_stack_for_vram(vram_gb, quant=quant).models)
170
+ else:
171
+ desired = models_for_preset(preset_id, base_url=base_url, quant=quant)
172
+ resolved, _missing, warning = resolve_stack_against_pool(desired, installed)
173
+ return resolved, warning
174
+
175
+
176
+ def stack_payload(
177
+ *,
178
+ vram_gb: int = 16,
179
+ quant: str | None = "qat",
180
+ base_url: str = "http://127.0.0.1:11434",
181
+ source: str = "both",
182
+ models_override: list[str] | None = None,
183
+ ) -> dict[str, object]:
184
+ stack = recommended_stack_for_vram(vram_gb, quant=quant)
185
+ desired = list(models_override) if models_override else list(stack.models)
186
+ pool, inventory_note = available_model_pool(base_url=base_url, source=source)
187
+ resolved, missing, warning = resolve_stack_against_pool(desired, pool)
188
+ return {
189
+ "ready": True,
190
+ "vram_gb": vram_gb,
191
+ "profile": stack.profile,
192
+ "quant": stack.quant,
193
+ "description": stack.description,
194
+ "notes": list(stack.notes),
195
+ "models": desired,
196
+ "resolved_models": resolved,
197
+ "missing_models": missing,
198
+ "warning": warning,
199
+ "inventory_note": inventory_note,
200
+ "pool_size": len(pool),
201
+ }
@@ -0,0 +1,360 @@
1
+ """Offline stack health checks — missing models, duplicates, routing spread, quant honesty."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from split_stack.discovery import audit_model_folders, list_model_inventory
8
+ from split_stack.poc_models import resolve_stack_against_pool, stack_payload
9
+ from split_stack.presets import recommended_models
10
+ from split_stack.quantization import adjust_vram_for_quant, normalize_quant_mode, pull_guidance_lines
11
+ from split_stack.session import default_profile_from_env, profile_for_vram_gb
12
+
13
+
14
+ @dataclass(frozen=True)
15
+ class ModelTagInfo:
16
+ name: str
17
+ size_bytes: int
18
+ quantization_level: str | None
19
+
20
+
21
+ @dataclass(frozen=True)
22
+ class StackHealthFinding:
23
+ level: str # ok, warn, error
24
+ code: str
25
+ message: str
26
+ models: tuple[str, ...] = ()
27
+
28
+
29
+ @dataclass(frozen=True)
30
+ class StackHealthReport:
31
+ ready: bool
32
+ profile: str
33
+ vram_gb: int | None
34
+ quant: str
35
+ recommended: tuple[str, ...]
36
+ resolved: tuple[str, ...]
37
+ missing: tuple[str, ...]
38
+ pool_size: int
39
+ findings: tuple[StackHealthFinding, ...]
40
+ inventory_note: str | None = None
41
+
42
+
43
+ def check_stack_health(
44
+ *,
45
+ profile: str | None = None,
46
+ vram_gb: int | None = None,
47
+ quant: str | None = None,
48
+ base_url: str = "http://127.0.0.1:11434",
49
+ models: list[str] | None = None,
50
+ source: str = "both",
51
+ ) -> StackHealthReport:
52
+ """Check recommended stack against local inventory (offline; no upstream registry)."""
53
+ quant_mode = normalize_quant_mode(quant)
54
+ if vram_gb is not None:
55
+ resolved_profile = profile_for_vram_gb(vram_gb)
56
+ else:
57
+ resolved_profile = profile or default_profile_from_env()
58
+ vram_gb = _vram_for_profile(resolved_profile)
59
+
60
+ if models:
61
+ recommended = tuple(models)
62
+ payload = stack_payload(
63
+ vram_gb=vram_gb or 12,
64
+ quant=quant_mode,
65
+ base_url=base_url,
66
+ source=source,
67
+ models_override=list(models),
68
+ )
69
+ resolved = tuple(payload.get("resolved_models") or ())
70
+ missing = tuple(payload.get("missing_models") or ())
71
+ inventory_note = payload.get("inventory_note")
72
+ if isinstance(inventory_note, str):
73
+ pass
74
+ else:
75
+ inventory_note = None
76
+ pool_size = int(payload.get("pool_size") or 0)
77
+ else:
78
+ recommended = tuple(recommended_models(resolved_profile, quant=quant_mode))
79
+ inventory = list_model_inventory(base_url=base_url)
80
+ if source == "api":
81
+ pool = list(inventory.api_models)
82
+ elif source == "disk":
83
+ pool = list(inventory.disk_models)
84
+ else:
85
+ pool = sorted(set(inventory.api_models) | set(inventory.disk_models))
86
+ resolved_list, missing_list, _warning = resolve_stack_against_pool(
87
+ list(recommended),
88
+ pool,
89
+ )
90
+ resolved = tuple(resolved_list)
91
+ missing = tuple(missing_list)
92
+ inventory_note = inventory.note
93
+ pool_size = len(pool)
94
+
95
+ findings: list[StackHealthFinding] = []
96
+
97
+ for name in recommended:
98
+ if name in missing:
99
+ findings.append(
100
+ StackHealthFinding(
101
+ level="error",
102
+ code="missing",
103
+ message=f"{name} is recommended but not found in local inventory.",
104
+ models=(name,),
105
+ )
106
+ )
107
+ elif name in resolved:
108
+ findings.append(
109
+ StackHealthFinding(
110
+ level="ok",
111
+ code="present",
112
+ message=f"{name} is installed.",
113
+ models=(name,),
114
+ )
115
+ )
116
+
117
+ if len(resolved) < 2:
118
+ findings.append(
119
+ StackHealthFinding(
120
+ level="error",
121
+ code="routing_spread",
122
+ message=(
123
+ f"Only {len(resolved)} model(s) available for routing "
124
+ f"({', '.join(resolved) or 'none'}). Need at least 2 for tier spread."
125
+ ),
126
+ models=resolved,
127
+ )
128
+ )
129
+ elif missing:
130
+ findings.append(
131
+ StackHealthFinding(
132
+ level="warn",
133
+ code="partial_stack",
134
+ message=(
135
+ f"Using {len(resolved)} installed model(s); "
136
+ f"{len(missing)} recommended tag(s) missing."
137
+ ),
138
+ models=resolved,
139
+ )
140
+ )
141
+ else:
142
+ findings.append(
143
+ StackHealthFinding(
144
+ level="ok",
145
+ code="stack_complete",
146
+ message=f"All {len(recommended)} recommended model(s) are installed.",
147
+ models=resolved,
148
+ )
149
+ )
150
+
151
+ audit = audit_model_folders()
152
+ duplicate_tags = audit.get("duplicate_tags") or []
153
+ if duplicate_tags:
154
+ dup_list = tuple(str(tag) for tag in duplicate_tags)
155
+ findings.append(
156
+ StackHealthFinding(
157
+ level="warn",
158
+ code="duplicate_tags",
159
+ message=(
160
+ f"Duplicate tags across Ollama folders: {', '.join(dup_list)}. "
161
+ "Keep one models directory or run audit cleanup."
162
+ ),
163
+ models=dup_list,
164
+ )
165
+ )
166
+
167
+ findings.extend(
168
+ _quant_mismatch_findings(
169
+ quant_mode=quant_mode,
170
+ model_names=tuple(name for name in resolved if name not in missing),
171
+ base_url=base_url,
172
+ )
173
+ )
174
+
175
+ ready = len(resolved) >= 2
176
+ return StackHealthReport(
177
+ ready=ready,
178
+ profile=resolved_profile,
179
+ vram_gb=vram_gb,
180
+ quant=quant_mode,
181
+ recommended=recommended,
182
+ resolved=resolved,
183
+ missing=missing,
184
+ pool_size=pool_size,
185
+ findings=tuple(findings),
186
+ inventory_note=inventory_note,
187
+ )
188
+
189
+
190
+ def format_stack_health(report: StackHealthReport) -> str:
191
+ lines: list[str] = []
192
+ vram_label = f"{report.vram_gb} GB" if report.vram_gb is not None else report.profile
193
+ lines.append(f"Stack health ({vram_label}, quant={report.quant})")
194
+ lines.append(f" Recommended: {', '.join(report.recommended) or '-'}")
195
+ lines.append(f" Resolved: {', '.join(report.resolved) or '-'}")
196
+ if report.missing:
197
+ lines.append(f" Missing: {', '.join(report.missing)}")
198
+ lines.append(f" Inventory: {report.pool_size} tag(s) seen (API + disk)")
199
+ if report.inventory_note:
200
+ lines.append(f" Note: {report.inventory_note}")
201
+ lines.append("")
202
+ for item in report.findings:
203
+ prefix = {"ok": "OK", "warn": "WARN", "error": "ERROR"}.get(item.level, item.level.upper())
204
+ lines.append(f" [{prefix}] {item.message}")
205
+ lines.append("")
206
+ if report.ready:
207
+ lines.append("Routing: ready (2+ models)")
208
+ else:
209
+ lines.append("Routing: not ready — install more models or adjust profile/VRAM.")
210
+ return "\n".join(lines)
211
+
212
+
213
+ def _vram_for_profile(profile: str) -> int | None:
214
+ from split_stack.model_registry import DEPLOYMENT_PROFILES
215
+
216
+ spec = DEPLOYMENT_PROFILES.get(profile)
217
+ if spec is None:
218
+ return None
219
+ return spec.assumed_vram_gb
220
+
221
+
222
+ def _fetch_ollama_tag_info(
223
+ *,
224
+ base_url: str = "http://127.0.0.1:11434",
225
+ ) -> dict[str, ModelTagInfo]:
226
+ try:
227
+ import requests
228
+ except ImportError:
229
+ return {}
230
+
231
+ url = f"{base_url.rstrip('/')}/api/tags"
232
+ try:
233
+ response = requests.get(url, timeout=5)
234
+ response.raise_for_status()
235
+ except Exception:
236
+ return {}
237
+
238
+ out: dict[str, ModelTagInfo] = {}
239
+ payload = response.json() or {}
240
+ for item in payload.get("models") or []:
241
+ name = (item.get("name") or "").strip()
242
+ if not name:
243
+ continue
244
+ details = item.get("details") or {}
245
+ quant = details.get("quantization_level")
246
+ out[name] = ModelTagInfo(
247
+ name=name,
248
+ size_bytes=int(item.get("size") or 0),
249
+ quantization_level=str(quant) if quant else None,
250
+ )
251
+ return out
252
+
253
+
254
+ def _is_gemma_tag(name: str) -> bool:
255
+ family = name.split(":")[0].lower()
256
+ return family.startswith("gemma")
257
+
258
+
259
+ def _tag_suggests_qat(name: str) -> bool:
260
+ lowered = name.lower()
261
+ markers = ("qat", "ud-q4", "ud_q4", "unsloth", "gemma-4-qat")
262
+ return any(marker in lowered for marker in markers)
263
+
264
+
265
+ def _tag_suggests_bf16(name: str) -> bool:
266
+ lowered = name.lower()
267
+ return "bf16" in lowered or "-it-bf16" in lowered
268
+
269
+
270
+ def _quant_mismatch_findings(
271
+ *,
272
+ quant_mode: str,
273
+ model_names: tuple[str, ...],
274
+ base_url: str,
275
+ ) -> list[StackHealthFinding]:
276
+ if quant_mode == "default":
277
+ return []
278
+
279
+ tag_info = _fetch_ollama_tag_info(base_url=base_url)
280
+ if not tag_info:
281
+ return [
282
+ StackHealthFinding(
283
+ level="warn",
284
+ code="quant_check_skipped",
285
+ message=(
286
+ "Quant check skipped — Ollama /api/tags unreachable or "
287
+ "install split-stack[ollama] for requests."
288
+ ),
289
+ )
290
+ ]
291
+
292
+ findings: list[StackHealthFinding] = []
293
+ gemma_tags = [name for name in model_names if _is_gemma_tag(name)]
294
+ if not gemma_tags:
295
+ return findings
296
+
297
+ for name in gemma_tags:
298
+ info = tag_info.get(name)
299
+ if info is None:
300
+ continue
301
+
302
+ expected_gb = adjust_vram_for_quant(name, base_vram_gb=999, quant_mode=quant_mode)
303
+ size_gb = info.size_bytes / (1024**3) if info.size_bytes else 0.0
304
+ quant_label = info.quantization_level or "unknown"
305
+
306
+ if quant_mode == "bf16":
307
+ if not _tag_suggests_bf16(name) and quant_label not in {"F16", "BF16", "FP16"}:
308
+ findings.append(
309
+ StackHealthFinding(
310
+ level="warn",
311
+ code="quant_mismatch",
312
+ message=(
313
+ f"{name}: quant=bf16 but installed as {quant_label} "
314
+ f"({size_gb:.1f} GB on disk). VRAM sizing may be wrong."
315
+ ),
316
+ models=(name,),
317
+ )
318
+ )
319
+ continue
320
+
321
+ # qat / qat_mobile — expect smaller runtime than library Q4_K_M pulls
322
+ if _tag_suggests_qat(name):
323
+ findings.append(
324
+ StackHealthFinding(
325
+ level="ok",
326
+ code="quant_ok",
327
+ message=f"{name}: tag looks QAT-aligned ({quant_label}, {size_gb:.1f} GB).",
328
+ models=(name,),
329
+ )
330
+ )
331
+ continue
332
+
333
+ oversized = expected_gb is not None and size_gb > expected_gb * 1.35
334
+ library_ptq = quant_label in {"Q4_K_M", "Q4_0", "Q5_K_M", "Q5_0"}
335
+ if oversized or library_ptq:
336
+ expected_text = f"~{expected_gb} GB runtime" if expected_gb else "smaller QAT runtime"
337
+ findings.append(
338
+ StackHealthFinding(
339
+ level="warn",
340
+ code="quant_mismatch",
341
+ message=(
342
+ f"{name}: quant={quant_mode} expects {expected_text} but installed "
343
+ f"{quant_label} at {size_gb:.1f} GB — likely library PTQ, not QAT."
344
+ ),
345
+ models=(name,),
346
+ )
347
+ )
348
+
349
+ if any(item.code == "quant_mismatch" for item in findings):
350
+ hint = pull_guidance_lines(quant_mode)
351
+ if hint:
352
+ findings.append(
353
+ StackHealthFinding(
354
+ level="warn",
355
+ code="quant_hint",
356
+ message=hint[0],
357
+ )
358
+ )
359
+
360
+ return findings
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: split-stack
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Python routing library for local LLM agent loops: score prompts, map tiers to model names, embed in your runner.
5
5
  Author: Eddie Baumel
6
6
  License: MIT
@@ -1,12 +1,12 @@
1
- split_stack/__init__.py,sha256=AtvBma-QYci2OlS4bm0WqNQr1uRAaUMya2XklE5dwVA,2869
1
+ split_stack/__init__.py,sha256=4xE6uDgiYDt7yLVFaSGrS8a7g85EfNndTbHxnMNEUmw,2999
2
2
  split_stack/__main__.py,sha256=X79S2PFqrYOGd2hLQtB0-uqsdquj9Iuez7SYR64r0Ps,90
3
3
  split_stack/advice.py,sha256=KBw-Ly35O-4KaJgXwabtJqIFeaoP65wOj7VscodUiWc,427
4
4
  split_stack/benchmark.py,sha256=lx2sJvlPlzHsCkGxQ5YS_Z4duzt25yhmqQpNSdAsOPM,3170
5
- split_stack/cli.py,sha256=ohcTIV4V2fo7uoB52K_4Ynu1euDTq_jLPqalpwlnMMs,25005
5
+ split_stack/cli.py,sha256=nZgwDioIua7GoITTh3p5z_X1JA1m0mcuebf_Gqt36Ew,27129
6
6
  split_stack/community_picks.py,sha256=Jaq0Wxs3_U4ix8kYLu2ge_VDMgym5xqKGtPsfxdEQhs,8148
7
7
  split_stack/compare.py,sha256=4u4OQ38I6DhMft-TYrAKljLJ8vIam-SlfiT9hcjun-Q,6665
8
8
  split_stack/complexity.py,sha256=R3N2t5QkGIkprx4fKnEqmsTPbTdsRo1Ap8-Er0lDEXk,1884
9
- split_stack/discovery.py,sha256=W4B7DLpCQBkVtipIy2wlcs6bcNGikGHzoB94TJG4AI8,9693
9
+ split_stack/discovery.py,sha256=l__HjKmy3ESjX9lVnrj7A77kavZhxmdIdI5vtB_iX1E,9711
10
10
  split_stack/hints.py,sha256=jsfbWzuPDgZwrfmuuwhh1QHSFA7650Yy4VOuouJFSww,3125
11
11
  split_stack/local_models.py,sha256=IEHrE9w0tmJ0Nb9toP50EJcOQMbO8VXX4id0TWS-MkY,2270
12
12
  split_stack/model_guide.py,sha256=_xKFfyI30kN2VM8GgY4a05SIqPXTyDluOUZUcDpvuVw,9782
@@ -14,19 +14,20 @@ split_stack/model_registry.py,sha256=vXPj0-sRPv0tPNkodVpz5WyucOfrmawrWiTq8yaEYls
14
14
  split_stack/models.py,sha256=nDqFdeSGmcPPUCOVEEsArvUJApehNWvMFB97TaBI7zY,1820
15
15
  split_stack/ollama_errors.py,sha256=nu3qLCGIcS3asX03AEKKALuiCnvLmr5BhP0RcRHhtlQ,962
16
16
  split_stack/ollama_generate.py,sha256=qHJN1izaS855nL2TdiPnYV05TZPVpz03tqlXljcxvNo,3789
17
- split_stack/poc_models.py,sha256=4wSXsRy2gz3aFPsTuwf4Mf2qbWAX9Cef_RS3zB5ZiPs,4422
17
+ split_stack/poc_models.py,sha256=EfUMJaMkLGIsUCeeK6DYxcRXON7EyWGzU3b3_PiUsQg,6562
18
18
  split_stack/presets.py,sha256=1E7UsT0bahQMZZxBHr0iG0fxEIvq7f0VPKxPTRj_CI0,2580
19
19
  split_stack/quantization.py,sha256=zZMs7aiqksUyVXzKK5JxQDEDYiYXYza2gXkMlWQqywE,4311
20
20
  split_stack/requirements.py,sha256=QK7lxn7jVU39z2IZByEKOiv1xz3G2SIs96uRsspskdY,9475
21
21
  split_stack/routing.py,sha256=99fZilyXddkZIhTaPQEsE6P2EDDuaXo4n1Xqs28Zq5Y,3219
22
22
  split_stack/session.py,sha256=_YkoNhsOp_4u14NgzWSkIDDNiCrhUt0-eU3e5y6lTfI,7959
23
23
  split_stack/setup_wizard.py,sha256=EyCr_QtiUZMBW20mEjYbjvHBg6tIqAUafLHZd_9dqBY,8195
24
+ split_stack/stack_health.py,sha256=Mr4TvwOAkRsriucLDRxqQU3En4IO4uZTjgaoP-1QJeU,12163
24
25
  split_stack/startup_tips.py,sha256=CY6k_lBSgmulbe0PLH3sIy6qL3B1wtQeLGGpHisskNs,5524
25
26
  split_stack/tiering.py,sha256=M4outcZwO-m-th7OYbKYRILB5trnxJU6oQI0pEo_MsY,2163
26
27
  split_stack/validation.py,sha256=-JMuDnia1Rd3fMYtHVHtJ-GW_4Rrbijl9MKlTnfpCyw,3056
27
- split_stack-0.2.0.dist-info/licenses/LICENSE,sha256=scGzQpUJlz3hAQQfj_Ukpj_rGSSDKp2TgqP5wzchytQ,1069
28
- split_stack-0.2.0.dist-info/METADATA,sha256=dvnZStht1fgl5ZDKsItIjILSxcVyJaYeujrJFceQP3g,13655
29
- split_stack-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
30
- split_stack-0.2.0.dist-info/entry_points.txt,sha256=ZByxKJLPs5y8blfIgplNejfVkjQuH2F9A99JoH1m5gw,47
31
- split_stack-0.2.0.dist-info/top_level.txt,sha256=gfw1Q0n9UcJE069uO9G-TPSU9P1fwvOj0nhUYKix2pM,12
32
- split_stack-0.2.0.dist-info/RECORD,,
28
+ split_stack-0.3.0.dist-info/licenses/LICENSE,sha256=scGzQpUJlz3hAQQfj_Ukpj_rGSSDKp2TgqP5wzchytQ,1069
29
+ split_stack-0.3.0.dist-info/METADATA,sha256=qbPkSiM_X_0J51Qc7qNB2UDuqgm35IuOy01ah2eU-NQ,13655
30
+ split_stack-0.3.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
31
+ split_stack-0.3.0.dist-info/entry_points.txt,sha256=ZByxKJLPs5y8blfIgplNejfVkjQuH2F9A99JoH1m5gw,47
32
+ split_stack-0.3.0.dist-info/top_level.txt,sha256=gfw1Q0n9UcJE069uO9G-TPSU9P1fwvOj0nhUYKix2pM,12
33
+ split_stack-0.3.0.dist-info/RECORD,,