opencode_llmstack-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,421 @@
+"""Generate ``llama-swap.yaml`` from ``models.ini``.
+
+Single source of truth: ``models.ini``. Top-level config (logging,
+healthcheck, the ``llama_server`` binary path, the ``metal_defaults``
+macro, the ``matrix`` and the ``on_startup.preload`` list) and per-tier
+``cmd`` blocks are all DERIVED from the ini.
+
+- ``llama_server`` = ``[DEFAULT].llama_server_bin`` or the baked-in default
+- ``metal_defaults`` = built from ``[DEFAULT]`` (host, n_gpu_layers, ...) +
+  baked-in ``--no-warmup --no-mmap``.
+- ``matrix.vars`` = role -> single-letter from :data:`ROLE_LETTER`,
+  value = tier name
+- ``matrix.evict_costs`` = ``max(1, min(30, round(size_gb / 1.5)))``
+- ``matrix.sets`` = ``f & X`` per non-fast tier, plus an
+  ``all_chats_with_fast`` super-set when there
+  are 2+ chat tiers.
+- ``preload`` = every tier with ``role == "fast"``
+
+Per-tier defaults (overridable in the ini per section):
+
+- ``aliases`` : :data:`ROLE_ALIASES`\\[role] (override: ``aliases = a, b, c``)
+- ``ttl`` : :data:`ROLE_TTL`\\[role] (override: ``ttl = 0``)
+
+CLI (kept for scripting; the public entry point is ``llmstack install``):
+
+    python -m llmstack.generators.llama_swap                 # YAML to stdout
+    python -m llmstack.generators.llama_swap PATH            # write YAML to PATH
+    python -m llmstack.generators.llama_swap --use-next ...  # swap hf_file_next
+"""
+
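A quick worked example of the derivation rules above, using made-up tier names and sizes (a 4 GB fast tier plus an 18 GB plan tier); nothing below is taken from the package itself:

    # matrix.vars         {"f": "my-fast-tier", "p": "my-plan-tier"}   (via ROLE_LETTER)
    # matrix.evict_costs  f -> max(1, min(30, round(4 / 1.5)))  == 3
    #                     p -> max(1, min(30, round(18 / 1.5))) == 12
    # matrix.sets         {"plan_with_fast": "f & p"}   (only one chat tier, so no all_chats_with_fast)
    # preload             ["my-fast-tier"]              (every tier whose role is "fast")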
+from __future__ import annotations
+
+import configparser
+import os
+import re
+import shutil
+import sys
+from pathlib import Path
+
+import yaml
+
+from llmstack._platform import EXE_SUFFIX, IS_WINDOWS
+from llmstack.paths import models_ini_path
+from llmstack.tiers import _int, load_tiers
+
+USE_NEXT_ENV = "LLMSTACK_USE_NEXT"
+
+
+def _default_llama_server_bin() -> str:
+    """Best-guess absolute path of the ``llama-server`` executable.
+
+    Resolution order:
+
+    1. ``$LLAMA_SERVER_BIN`` (escape hatch).
+    2. ``shutil.which`` -- matches whatever the user actually has on PATH.
+    3. Per-platform conventional install location, useful when the
+       generated YAML will be loaded by llama-swap which doesn't share
+       our PATH (e.g. launchd / systemd / scheduled tasks).
+
+    Always returns a string, never raises -- if everything fails we hand
+    back the bare ``llama-server`` name and let llama-swap surface the
+    error at first request.
+    """
+    explicit = os.environ.get("LLAMA_SERVER_BIN", "").strip()
+    if explicit:
+        return explicit
+    found = shutil.which(f"llama-server{EXE_SUFFIX}")
+    if found:
+        return found
+    if IS_WINDOWS:
+        for candidate in (
+            r"C:\Program Files\llama.cpp\llama-server.exe",
+            r"C:\tools\llama.cpp\llama-server.exe",
+        ):
+            if Path(candidate).is_file():
+                return candidate
+        return f"llama-server{EXE_SUFFIX}"
+    for candidate in (
+        "/opt/homebrew/bin/llama-server",  # mac (Apple Silicon Homebrew)
+        "/usr/local/bin/llama-server",     # mac (Intel Homebrew) / generic
+        "/usr/bin/llama-server",           # apt / dnf
+    ):
+        if Path(candidate).is_file():
+            return candidate
+    return "/opt/homebrew/bin/llama-server"
+
+
+LLAMA_SERVER_BIN_DEFAULT = _default_llama_server_bin()
+HEALTH_CHECK_TIMEOUT = 600
+LOG_LEVEL = "info"
+LOG_TO_STDOUT = "proxy"
+START_PORT = 10001
+GLOBAL_TTL = 0
+
+ROLE_LETTER: dict[str, str] = {
+    "fast": "f",
+    "agent": "c",
+    "plan": "p",
+    "plan-uncensored": "u",
+}
+
+ROLE_ALIASES: dict[str, list[str]] = {
+    "fast": ["fast", "small", "autocomplete"],
+    "agent": ["agent", "smart", "code", "coder"],
+    "plan": ["plan", "planner", "chat"],
+    "plan-uncensored": ["uncensored", "nofilter", "plan-nofilter", "heretic"],
+}
+
+ROLE_TTL: dict[str, int] = {
+    "fast": 0,
+    "agent": 1800,
+    "plan": 1200,
+    "plan-uncensored": 1200,
+}
+
+ROPE_RE = re.compile(
+    r"yarn\s*\(\s*scale\s*=\s*(\d+)\s*,\s*orig_ctx\s*=\s*(\d+)\s*\)",
+    re.IGNORECASE,
+)
+SIZE_RE = re.compile(r"[\d.]+")
+
+
+def parse_rope(raw: str) -> tuple[int, int] | None:
+    m = ROPE_RE.search(raw or "")
+    return (int(m.group(1)), int(m.group(2))) if m else None
+
+
+def parse_size_gb(raw: str, default: float = 5.0) -> float:
+    m = SIZE_RE.search(raw or "")
+    return float(m.group()) if m else default
+
+
+def evict_cost(size_gb: float) -> int:
+    return max(1, min(30, int(round(size_gb / 1.5))))
+
+
+def is_truthy(raw: str | None, default: bool = True) -> bool:
+    if raw is None:
+        return default
+    return raw.strip().lower() in ("1", "true", "yes", "on")
+
+
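A few illustrative calls to the helpers above; the expected values are worked out by hand from the regexes and the evict-cost formula, not taken from the package's tests:

    from llmstack.generators.llama_swap import evict_cost, is_truthy, parse_rope, parse_size_gb

    parse_rope("yarn(scale=4, orig_ctx=32768)")  # -> (4, 32768)
    parse_rope("")                               # -> None
    parse_size_gb("18.6 GB")                     # -> 18.6
    parse_size_gb("")                            # -> 5.0 (the default)
    evict_cost(18.6)                             # -> 12  (round(18.6 / 1.5), clamped to 1..30)
    evict_cost(0.5)                              # -> 1   (clamped up to the floor of 1)
    is_truthy("Yes")                             # -> True
    is_truthy(None)                              # -> True (falls back to default=True)
    is_truthy("off")                             # -> False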
+def build_metal_defaults(d) -> str:
+    """The shared llama-server flags used by every model."""
+    parts = [
+        f"--host {(d.get('host') or '127.0.0.1').strip()}",
+        "--port ${PORT}",
+        f"-ngl {(d.get('n_gpu_layers') or '999').strip()}",
+        f"-fa {(d.get('flash_attn') or 'on').strip()}",
+    ]
+    if is_truthy(d.get("jinja"), default=True):
+        parts.append("--jinja")
+    parts += [
+        f"--cache-type-k {(d.get('cache_type_k') or 'q8_0').strip()}",
+        f"--cache-type-v {(d.get('cache_type_v') or 'q8_0').strip()}",
+        f"--threads {(d.get('threads') or '-1').strip()}",
+        "--no-warmup",
+        "--no-mmap",
+    ]
+    return " ".join(parts)
+
+
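With an empty ``[DEFAULT]`` section every fallback kicks in, so the ``metal_defaults`` macro collapses to a single flag string; a plain dict is enough here because the function only calls ``.get``:

    from llmstack.generators.llama_swap import build_metal_defaults

    print(build_metal_defaults({}))
    # --host 127.0.0.1 --port ${PORT} -ngl 999 -fa on --jinja --cache-type-k q8_0
    # --cache-type-v q8_0 --threads -1 --no-warmup --no-mmap   (one line in reality)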
+def build_cmd(tier, section, *, use_next: bool = False) -> str:
+    """The multi-line ``cmd`` literal block scalar for one tier.
+
+    Sampling defaults (``--temp`` / ``--top-p`` / ``--top-k`` /
+    ``--min-p`` / ``--repeat-penalty``) are baked into the llama-server
+    startup command line for gguf tiers. They come from the tier's
+    ``sampler = ...`` line in ``models.ini`` (already parsed into
+    ``tier.sampler``). llama-server then applies them as its defaults
+    for any request that does not override them in the body.
+
+    This keeps the per-request injection path (in
+    :func:`llmstack.app._inject_sampler`) Bedrock-only -- gguf
+    sampling is a server-startup concern, since the CLI flags
+    survive across requests and don't break any backend's schema.
+    """
+    rope = parse_rope(section.get("rope_scaling", ""))
+    sampler = tier.sampler
+
+    has_queued = bool(tier.file_next)
+    running_next = use_next and has_queued
+    if running_next:
+        active_repo = tier.repo_next or tier.repo
+        active_file = tier.file_next
+    else:
+        active_repo = tier.repo
+        active_file = tier.file
+
+    lines: list[str] = ["${llama_server} ${metal_defaults}"]
+    if running_next:
+        lines += [
+            f"# >>> RUNNING NEXT ({tier.name}): this YAML was generated with --use-next.",
+            "# To revert, regenerate without --use-next (default for `llmstack start`).",
+            "# Permanent promotion: edit hf_file in models.ini and re-run `llmstack install`.",
+            "# Previous current file (still cached, still loadable):",
+            f"# -hff {tier.file}",
+        ]
+    else:
+        lines += [
+            f"# >>> UPGRADE-POINT ({tier.name}): swap the -hf/-hff pair below to change this tier.",
+            "# See UPGRADING.md. To change permanently, edit models.ini and re-run `llmstack install`.",
+        ]
+        if has_queued:
+            lines += [
+                "# Queued upgrade target (already pre-fetched if `llmstack download` has run):",
+                f"# -hff {tier.file_next}",
+                "# Try it without committing: llmstack start --next",
+            ]
+
+    lines += [
+        f"-hf {active_repo}",
+        f"-hff {active_file}",
+        f"--alias {tier.name}",
+        f"-c {tier.ctx_size}",
+    ]
+    if rope:
+        scale, orig_ctx = rope
+        lines += [
+            "--rope-scaling yarn",
+            f"--rope-scale {scale}",
+            f"--yarn-orig-ctx {orig_ctx}",
+        ]
+    if "temp" in sampler:
+        lines.append(f"--temp {sampler['temp']}")
+    if "top_p" in sampler:
+        lines.append(f"--top-p {sampler['top_p']}")
+    if "top_k" in sampler:
+        lines.append(f"--top-k {int(sampler['top_k'])}")
+    if "min_p" in sampler:
+        lines.append(f"--min-p {sampler['min_p']}")
+    if "rep_pen" in sampler:
+        lines.append(f"--repeat-penalty {sampler['rep_pen']}")
+
+    return "\n".join(lines) + "\n"
+
+
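``build_cmd`` only reads a handful of attributes from the tier object, so a sketch can stand one in with ``SimpleNamespace``; the real ``Tier`` type lives in ``llmstack.tiers`` (not part of this diff), and the repo/file/context values below are invented for illustration:

    from types import SimpleNamespace
    from llmstack.generators.llama_swap import build_cmd

    fake_tier = SimpleNamespace(             # hypothetical stand-in, not the real Tier class
        name="plan",
        repo="example-org/example-model-GGUF",   # made-up Hugging Face repo
        file="example-model-Q4_K_M.gguf",        # made-up gguf file
        repo_next="", file_next="",              # nothing queued
        ctx_size=32768,
        sampler={"temp": 0.7, "top_p": 0.9},
    )
    print(build_cmd(fake_tier, {}))
    # ${llama_server} ${metal_defaults}
    # # >>> UPGRADE-POINT (plan): swap the -hf/-hff pair below to change this tier.
    # # See UPGRADING.md. To change permanently, edit models.ini and re-run `llmstack install`.
    # -hf example-org/example-model-GGUF
    # -hff example-model-Q4_K_M.gguf
    # --alias plan
    # -c 32768
    # --temp 0.7
    # --top-p 0.9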
+def aliases_for(tier, section) -> list[str]:
+    explicit = (section.get("aliases") or "").strip()
+    if explicit:
+        return [a.strip() for a in explicit.split(",") if a.strip()]
+    return list(ROLE_ALIASES.get(tier.role, [tier.role]))
+
+
+def ttl_for(tier, section) -> int:
+    explicit = (section.get("ttl") or "").strip()
+    if explicit:
+        return _int(explicit, ROLE_TTL.get(tier.role, 1200))
+    return ROLE_TTL.get(tier.role, 1200)
+
+
+def build_models_block(cfg, *, use_next: bool = False) -> dict:
+    tiers = load_tiers()
+    out: dict = {}
+    for name, tier in tiers.items():
+        if not tier.is_gguf:
+            # Hosted tiers (bedrock, ...) are dispatched by the router
+            # directly; llama-swap doesn't see them.
+            continue
+        section = cfg[name]
+        running_next = use_next and bool(tier.file_next)
+        size_key = "size_gb_next" if running_next else "size_gb"
+        quant_key = "quant_next" if running_next else "quant"
+        size_raw = section.get(size_key) or section.get("size_gb", "")
+        quant_raw = section.get(quant_key) or section.get("quant", "")
+        out[name] = {
+            "name": tier.description,
+            "description": tier.description,
+            "cmd": build_cmd(tier, section, use_next=use_next),
+            "ttl": ttl_for(tier, section),
+            "aliases": aliases_for(tier, section),
+            "metadata": {
+                "tier": (section.get("tier") or "").strip() or tier.role,
+                "role": tier.role,
+                "ctx_size": tier.ctx_size,
+                "size_gb": parse_size_gb(size_raw, default=0.0),
+                "quant": (quant_raw or "").strip(),
+                "channel": "next" if running_next else "current",
+            },
+        }
+    return out
+
+
+def build_matrix(cfg) -> dict:
+    tiers = load_tiers()
+    vars_: dict[str, str] = {}
+    evict: dict[str, int] = {}
+
+    for name, tier in tiers.items():
+        if not tier.is_gguf:
+            continue
+        letter = ROLE_LETTER.get(tier.role)
+        if not letter or letter in vars_:
+            continue
+        vars_[letter] = name
+        size_gb = parse_size_gb(cfg[name].get("size_gb", ""), default=5.0)
+        evict[letter] = evict_cost(size_gb)
+
+    sets: dict[str, str] = {}
+    fast = "f"
+    if fast in vars_:
+        for letter, name in vars_.items():
+            if letter == fast:
+                continue
+            slug = (tiers[name].role or letter).replace("-", "_")
+            sets[f"{slug}_with_fast"] = f"{fast} & {letter}"
+        chat_letters = [letter for letter in vars_ if letter not in (fast, "c")]
+        if len(chat_letters) >= 2:
+            sets["all_chats_with_fast"] = f"{fast} & " + " & ".join(chat_letters)
+
+    return {"vars": vars_, "evict_costs": evict, "sets": sets}
+
+
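Assuming one hypothetical gguf tier per role in ``ROLE_LETTER`` with sizes of 4, 18, 18 and 14 GB, ``build_matrix`` would return a structure along these lines (tier names and sizes are illustrative only, not from the package):

    {
        "vars": {"f": "tier-fast", "c": "tier-agent", "p": "tier-plan", "u": "tier-uncensored"},
        "evict_costs": {"f": 3, "c": 12, "p": 12, "u": 9},
        "sets": {
            "agent_with_fast": "f & c",
            "plan_with_fast": "f & p",
            "plan_uncensored_with_fast": "f & u",
            # two chat letters besides fast/agent ("p", "u"), so the super-set is emitted
            "all_chats_with_fast": "f & p & u",
        },
    }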
+def _str_presenter(dumper, data):
+    if "\n" in data:
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+    return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+
+HEADER_CURRENT = """\
+# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
+#
+# AUTO-GENERATED by llmstack.generators.llama_swap from models.ini.
+# Written by `llmstack start` on each fresh launch; hand edits will be
+# overwritten next time the stack starts. To change behaviour, edit
+# models.ini (per-tier or [DEFAULT]) and re-run `llmstack restart`.
+"""
+
+HEADER_NEXT = """\
+# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
+#
+# AUTO-GENERATED by llmstack.generators.llama_swap --use-next from models.ini.
+# This is the EPHEMERAL "next" config produced by `llmstack start --next`.
+# Tiers with hf_file_next defined are running their queued upgrade target;
+# all other tiers are unchanged. Do not commit this file. To make any of
+# these promotions permanent, flip hf_file/hf_file_next in models.ini and
+# re-run `llmstack restart` -- that regenerates the canonical yaml.
+"""
+
+
+def build_config(*, use_next: bool = False) -> dict:
+    cfg = configparser.ConfigParser(
+        inline_comment_prefixes=(";",),
+        interpolation=None,
+    )
+    cfg.read(models_ini_path())
+    defaults = cfg["DEFAULT"]
+
+    llama_bin = (defaults.get("llama_server_bin") or LLAMA_SERVER_BIN_DEFAULT).strip()
+    metal_defaults = build_metal_defaults(defaults)
+
+    tiers = load_tiers()
+    preload = [name for name, t in tiers.items() if t.role == "fast" and t.is_gguf]
+
+    return {
+        "healthCheckTimeout": HEALTH_CHECK_TIMEOUT,
+        "logLevel": LOG_LEVEL,
+        "logToStdout": LOG_TO_STDOUT,
+        "startPort": START_PORT,
+        "sendLoadingState": True,
+        "includeAliasesInList": True,
+        "globalTTL": GLOBAL_TTL,
+        "macros": {
+            "llama_server": llama_bin,
+            "metal_defaults": metal_defaults,
+        },
+        "models": build_models_block(cfg, use_next=use_next),
+        "matrix": build_matrix(cfg),
+        "hooks": {
+            "on_startup": {"preload": preload},
+        },
+    }
+
+
+def render(*, use_next: bool = False) -> str:
+    """Return the full YAML document (header + body) as a string."""
+    yaml.add_representer(str, _str_presenter, Dumper=yaml.SafeDumper)
+    body = yaml.safe_dump(
+        build_config(use_next=use_next),
+        sort_keys=False,
+        default_flow_style=False,
+        width=200,
+    )
+    header = HEADER_NEXT if use_next else HEADER_CURRENT
+    return header + "\n" + body
+
+
+def validate(path: Path) -> None:
+    """Cheap structural sanity check: parses cleanly as YAML."""
+    yaml.safe_load(path.read_text())
+
+
+def _parse_argv(argv: list[str]) -> tuple[str, bool]:
+    use_next = (
+        os.getenv(USE_NEXT_ENV, "").strip().lower() in ("1", "true", "yes", "on")
+    )
+    positional: list[str] = []
+    for arg in argv[1:]:
+        if arg == "--use-next":
+            use_next = True
+        elif arg in ("-h", "--help"):
+            sys.stdout.write(__doc__ or "")
+            sys.exit(0)
+        else:
+            positional.append(arg)
+    target = positional[0] if positional else "-"
+    return target, use_next
+
+
+def main(argv: list[str]) -> int:
+    target, use_next = _parse_argv(argv)
+    text = render(use_next=use_next)
+    if target == "-":
+        sys.stdout.write(text)
+    else:
+        Path(target).write_text(text)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
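
End to end, the generator is driven either through the CLI documented in the module docstring or programmatically; a minimal sketch of the programmatic path, with an example output filename:

    from pathlib import Path
    from llmstack.generators.llama_swap import render, validate

    out = Path("llama-swap.yaml")            # example destination; any path works
    out.write_text(render(use_next=False))   # equivalent to: python -m llmstack.generators.llama_swap <path>
    validate(out)                            # re-parse the file to confirm it is well-formed YAML

The CLI honours the same toggle via the ``LLMSTACK_USE_NEXT`` environment variable (see ``_parse_argv``), so wrappers can opt in through the environment instead of passing ``--use-next`` explicitly.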