shadowlm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
shadowlm/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ """ShadowLM Trainer — a fine-tuning SDK.
2
+
3
+ Any open model. Any harness. Any method.
4
+
5
+ import shadowlm as slm
6
+
7
+ ds = slm.Dataset.from_jsonl("data.jsonl").as_chat()
8
+ model = slm.load("mlx-community/Qwen2.5-0.5B-Instruct-4bit")
9
+ run = model.finetune(ds, method="lora", max_steps=60)
10
+ print(run.loss, run.sparkline())
11
+ print(model.generate("Hello!"))
12
+ model.save("out/", fmt="adapter")
13
+
14
+ datasets → finetune → inference. mlx on Apple Silicon, torch on CUDA (or CPU)
15
+ — accelerated by the shadow layer.
16
+ """
17
+
18
+ from . import methods, runs
19
+ from .capture import CaptureProxy, capture
20
+ from .data import Dataset
21
+ from .models import Model, Reply, load
22
+ from .rl import Trajectory, TrajectoryGroup, judge_group
23
+ from .training import Metric, TrainConfig, TrainingRun
24
+
25
# Package version string.
__version__ = "0.1.0"

# Public names re-exported at the package top level.
__all__ = [
    # from .capture
    "CaptureProxy",
    "capture",
    # from .data
    "Dataset",
    # from .models
    "Model",
    "Reply",
    "load",
    # submodules imported as-is
    "methods",
    "runs",
    # from .training
    "Metric",
    "TrainConfig",
    "TrainingRun",
    # from .rl
    "Trajectory",
    "TrajectoryGroup",
    "judge_group",
    "__version__",
]
shadowlm/_quiet.py ADDED
@@ -0,0 +1,23 @@
1
+ """Swallow a backend's own stdout/stderr so shadowLM owns the console.
2
+
3
+ Backends (mlx-lm, transformers, huggingface_hub) print their own progress lines
4
+ and tqdm bars. shadowLM prints its own clean output to the real terminal
5
+ (`sys.__stdout__`, captured in models.py), which these redirects don't touch — so
6
+ we silence the backend chatter without losing our own. `SHADOWLM_DEBUG=1` shows
7
+ the raw output.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import contextlib
13
+ import os
14
+
15
+
16
@contextlib.contextmanager
def quiet_backend():
    """Discard everything written to stdout/stderr inside the ``with`` block.

    If the ``SHADOWLM_DEBUG`` environment variable is set to a non-empty
    value, nothing is redirected and the block runs with the streams as they
    are. Otherwise ``sys.stdout`` and ``sys.stderr`` are both pointed at
    ``os.devnull`` until the block exits, then restored.
    """
    debug = os.environ.get("SHADOWLM_DEBUG")
    if not debug:
        # ExitStack unwinds in reverse order: stderr, then stdout, then the
        # devnull handle is closed — same as a nested multi-item `with`.
        with contextlib.ExitStack() as stack:
            sink = stack.enter_context(open(os.devnull, "w"))
            stack.enter_context(contextlib.redirect_stdout(sink))
            stack.enter_context(contextlib.redirect_stderr(sink))
            yield
        return
    yield
shadowlm/accel.py ADDED
@@ -0,0 +1,70 @@
1
+ """The shadow accelerator — shadowLM's in-house training optimization layer.
2
+
3
+ It sits on top of whichever backend is active (mlx / torch) and turns on the
4
+ memory and throughput optimizations that are *safe for the current model and
5
+ hardware*: gradient checkpointing, fused attention kernels, fused optimizers.
6
+
7
+ Modes:
8
+ "none" off — plain training
9
+ "shadow" force every applicable optimization on
10
+ "auto" enable what actually helps at the current size (the default)
11
+
12
+ `plan()` is pure and side-effect free; each backend reads the returned `ShadowPlan`
13
+ and applies the flags it understands.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+
20
+ # Below this layer count, the optimizations cost more overhead than they save, so
21
+ # "auto" leaves them off (forcing "shadow" still turns them on).
22
+ _BIG_MODEL_LAYERS = 24
23
+
24
+
25
@dataclass
class ShadowPlan:
    """The set of shadow optimizations chosen for one training run.

    The three booleans are the flags a backend reads; ``enabled`` holds the
    human-readable label of every optimization that was switched on, in the
    order it was added.
    """

    grad_checkpoint: bool = False
    flash_attention: bool = False
    fused_optimizer: bool = False
    enabled: list[str] = field(default_factory=list)

    @property
    def active(self) -> bool:
        """True when at least one optimization label has been recorded."""
        return True if self.enabled else False

    @property
    def note(self) -> str:
        """One-line status message summarising the plan."""
        if self.enabled:
            return "[shadow] enabled: " + ", ".join(self.enabled)
        return "[shadow] no extra optimizations needed at this size"
41
+
42
+
43
def plan(mode: str, *, backend: str, n_layers: int = 0, has_flash: bool = False) -> ShadowPlan:
    """Choose the shadow optimizations for a run and return them as a plan.

    mode: "none" returns an empty plan; "shadow" forces gradient
        checkpointing on; "auto" turns it on only when ``n_layers`` reaches
        ``_BIG_MODEL_LAYERS``.
    backend: "mlx" gets gradient checkpointing only; any other value is
        handled as torch, which additionally gets flash attention (when
        ``has_flash`` is true) and the fused optimizer.
    n_layers: the model's transformer-block count.
    has_flash: whether a flash-attention kernel is importable (torch only).

    Raises ValueError for a mode other than none/shadow/auto.
    """
    if mode not in ("none", "shadow", "auto"):
        raise ValueError(f"unknown accelerator {mode!r} (expected auto|shadow|none)")

    result = ShadowPlan()
    if mode == "none":
        return result

    forced = mode == "shadow"
    large = n_layers >= _BIG_MODEL_LAYERS
    on_torch = backend != "mlx"  # anything that is not mlx takes the torch path

    # The checks below run in the order the labels should appear in `enabled`.
    if on_torch and has_flash:
        result.flash_attention = True
        result.enabled.append("flash-attention-2")
    if forced or large:
        result.grad_checkpoint = True
        result.enabled.append("gradient checkpointing")
    if on_torch:
        result.fused_optimizer = True
        result.enabled.append("fused optimizer")
    return result
shadowlm/ascii.py ADDED
@@ -0,0 +1,79 @@
1
+ """Startup banner for shadow training."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import functools
6
+ import os
7
+
8
+ _HEART = r"""
9
+ ╔══════════════════════════════════════════════════════╗
10
+ ║ ♥ ♥♥ ║
11
+ ║ ♥♥♥ ♥♥♥♥ ║
12
+ ║ ♥♥♥♥♥ ♥♥♥♥♥ ║
13
+ ║ ♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥ ║
14
+ ║ ♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥ ║
15
+ ║ ♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥ ║
16
+ ║ ♥♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥♥ ║
17
+ ║ ♥♥♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥♥ ║
18
+ ║ ♥♥♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥♥♥ ║
19
+ ║ ♥♥♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥♥♥ ║
20
+ ║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
21
+ ║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
22
+ ║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
23
+ ║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
24
+ ║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
25
+ ║ ♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
26
+ ║ ♥♥♥♥♥♥♥ ║
27
+ ║ ♥♥ ║
28
+ ║ ║
29
+ ╚══════════════════════════════════════════════════════╝"""
30
+
31
+ _NAME = r"""
32
+ ███████╗ ██╗ ██╗ ██╗ ███╗ ███╗
33
+ ██╔════╝ ██║ ██║ ██║ ████╗ ████║
34
+ ███████╗ ███████╗ █████╗ ██████║ █████╗ ██╗ █╗ ██╗ ██║ ██╔████╔██║
35
+ ╚════██║ ██╔══██╗ ██╔══██║ ██╔══██║ ██╔══██╗ ██║███╗██║ ██║ ██║╚██╔╝██║
36
+ ███████║ ██║ ██║ ╚██████║ ╚██████║ ╚█████╔╝ ╚███╔███╔╝ ███████╗ ██║ ╚═╝ ██║
37
+ ╚══════╝ ╚═╝ ╚═╝ ╚═════╝ ╚═════╝ ╚════╝ ╚══╝╚══╝ ╚══════╝ ╚═╝ ╚═╝
38
+ T R A I N E R
39
+ any open model · any harness · any method
40
+ from Lyzr Research Labs · slm♥
41
+ """
42
+
43
+ # The full banner prints on the first training session of the process; later
44
+ # sessions get a compact one-liner so sweeps don't drown in hearts.
45
+ _sessions = 0
46
+
47
+
48
def run_on_main_rank(fn):
    """Decorator: call ``fn`` only when this process is rank 0.

    The rank is taken from the ``RANK`` environment variable, falling back to
    ``LOCAL_RANK`` and then to 0 when neither is set (or is empty). A value
    that cannot be parsed as an integer is treated as rank 0. On any other
    rank the wrapped call is skipped and ``None`` is returned.
    """

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        raw_rank = os.environ.get("RANK") or os.environ.get("LOCAL_RANK") or 0
        try:
            is_main = int(raw_rank) == 0
        except ValueError:  # launcher set RANK to something non-numeric
            is_main = True
        return fn(*args, **kwargs) if is_main else None

    return wrapper
67
+
68
+
69
@run_on_main_rank
def print_ascii_art(*, once: bool = True) -> None:
    """Print the training banner and count the session.

    Every call increments the module-level session counter. The full art
    (heart, name, start line) is printed on the first session, or on every
    session when ``once`` is false; otherwise a compact one-line banner with
    the session number is printed instead.
    """
    global _sessions
    _sessions += 1
    show_full = not once or _sessions <= 1
    if show_full:
        print(_HEART)
        print(_NAME)
        print("Starting training session...\n")
    else:
        print(f"\nslm♥ ShadowLM Trainer · training session #{_sessions}\n")
@@ -0,0 +1,64 @@
1
+ """Backend selection.
2
+
3
+ `select_backend("auto")` picks the backend for the current hardware:
4
+ CUDA → torch, else Apple Silicon → mlx, else torch on CPU. If no backend is
5
+ installed, `load()` says what to install. Force one with backend="mlx" / "torch"
6
+ (and device="cpu" to pin torch to the CPU).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .base import Backend, Callbacks, FinetuneResult
12
+
13
+ __all__ = ["Backend", "Callbacks", "FinetuneResult", "select_backend"]
14
+
15
+
16
def _mlx_available() -> bool:
    """Return what ``MLXBackend.is_available()`` reports.

    The backend module is imported inside the function, so it is only loaded
    when this check is actually made.
    """
    from .mlx import MLXBackend as backend_cls

    usable = backend_cls.is_available()
    return usable
19
+
20
+
21
def _torch_available() -> bool:
    """Return what ``TorchBackend.is_available()`` reports.

    The backend module is imported inside the function, so it is only loaded
    when this check is actually made.
    """
    from .torch import TorchBackend as backend_cls

    usable = backend_cls.is_available()
    return usable
24
+
25
+
26
def _no_backend() -> RuntimeError:
    """Build (but do not raise) the error shown when no backend is installed."""
    lines = (
        "No training backend available. Install one:",
        " • Apple Silicon: pip install shadowlm[mlx]",
        " • CUDA / CPU: pip install shadowlm[torch]",
    )
    return RuntimeError("\n".join(lines))
32
+
33
+
34
def select_backend(name: str = "auto", *, accelerator: str = "auto",
                   device: str = "auto") -> Backend:
    """Instantiate the backend called ``name`` (case-insensitive).

    "auto" (or an empty/None name) resolves in this order: torch when
    ``TorchBackend.has_cuda()`` is true, else mlx when available, else torch
    with ``device`` overridden to "cpu"; if none apply, the "no backend"
    RuntimeError is raised.

    Raises RuntimeError when the chosen backend reports it is unavailable, and
    ValueError when the name is not auto, mlx or torch.
    """
    choice = (name or "auto").lower()

    if choice == "auto":
        from .torch import TorchBackend
        if TorchBackend.has_cuda():
            choice = "torch"
        elif _mlx_available():
            choice = "mlx"
        elif _torch_available():
            choice, device = "torch", "cpu"
        else:
            raise _no_backend()

    if choice not in ("mlx", "torch"):
        raise ValueError(f"unknown backend {choice!r} (expected auto|mlx|torch)")

    if choice == "mlx":
        from .mlx import MLXBackend
        if MLXBackend.is_available():
            return MLXBackend(accelerator=accelerator)
        raise RuntimeError("mlx backend needs Apple Silicon + shadowlm[mlx].")

    # CPU is just the torch backend with device="cpu" (no separate "cpu" backend).
    from .torch import TorchBackend
    if TorchBackend.is_available():
        return TorchBackend(device=device, accelerator=accelerator)
    raise RuntimeError(
        "torch backend needs shadowlm[torch] (torch, transformers, trl, peft, datasets)."
    )
@@ -0,0 +1,99 @@
1
+ """The backend contract.
2
+
3
+ A `Backend` is a stateful object that holds a loaded model and knows how to
4
+ finetune it and generate from it. Everything user-facing (`Model`, `load`,
5
+ `finetune`, `generate`) is backend-agnostic — swap mlx ↔ torch without touching
6
+ the SDK surface.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from abc import ABC, abstractmethod
12
+ from typing import Callable
13
+
14
+ from ..data import Dataset
15
+ from ..training import Metric, TrainConfig
16
+
17
+
18
class Callbacks:
    """Bridges a backend's training loop back to the caller.

    A backend calls `.step(metric)` for each logged step, `.eval(metric)` for
    evaluation metrics and `.log(msg)` for status lines, and checks
    `.stopped()` to honour cancellation. Every hook is optional; a hook left
    as ``None`` makes the matching method a no-op.

    Hooks are tested with ``is not None`` rather than by truthiness, so a
    callable object that happens to be falsy (for example an empty
    ``list`` subclass that defines ``__call__``) is still invoked.
    """

    def __init__(
        self,
        on_step: Callable[[Metric], None] | None = None,
        on_log: Callable[[str], None] | None = None,
        on_eval: Callable[[Metric], None] | None = None,
        should_stop: Callable[[], bool] | None = None,
    ) -> None:
        self._on_step = on_step
        self._on_log = on_log
        self._on_eval = on_eval
        self._should_stop = should_stop

    def step(self, metric: Metric) -> None:
        """Forward a training-step metric to ``on_step`` if one was given."""
        if self._on_step is not None:
            self._on_step(metric)

    def eval(self, metric: Metric) -> None:
        """Forward an evaluation metric to ``on_eval`` if one was given."""
        if self._on_eval is not None:
            self._on_eval(metric)

    def log(self, message: str) -> None:
        """Forward a status line to ``on_log`` if one was given."""
        if self._on_log is not None:
            self._on_log(message)

    def stopped(self) -> bool:
        """Return True when ``should_stop`` was given and returns a truthy value."""
        if self._should_stop is None:
            return False
        return bool(self._should_stop())
51
+
52
+
53
class FinetuneResult:
    """What a backend returns from `finetune`: where the adapter/model landed.

    Attributes are plain and public:
    checkpoint: the location string the backend reports for its output.
    final_loss: the last loss value, or None when the backend has none.
    """

    def __init__(self, checkpoint: str, final_loss: float | None) -> None:
        self.checkpoint = checkpoint
        self.final_loss = final_loss

    def __repr__(self) -> str:
        # Show both fields so a printed/logged result is self-describing
        # instead of the default "<FinetuneResult object at 0x...>".
        return (
            f"{type(self).__name__}(checkpoint={self.checkpoint!r}, "
            f"final_loss={self.final_loss!r})"
        )
59
+
60
+
61
class Backend(ABC):
    """Abstract base for training/inference backends.

    Subclasses must implement `load`, `finetune` and `generate`; `chat` and
    `save` are optional and raise NotImplementedError unless overridden.
    """

    name: str = "base"

    def __init__(self, *, device: str = "auto", accelerator: str = "auto") -> None:
        # accelerator selects the shadow optimization layer: none | shadow | auto.
        self.accelerator = accelerator
        # device interpretation is backend-specific: mlx → gpu, torch → cuda|cpu.
        self.device = device
        self.model_name: str | None = None

    @classmethod
    def is_available(cls) -> bool:
        """Whether this backend can actually run in the current environment.

        The base implementation always answers True.
        """
        return True

    @abstractmethod
    def load(self, name: str, *, load_in_4bit: bool, max_seq_length: int,
             adapter: str | None = None, **kwargs) -> None:
        """Load the model called `name` into this backend (subclass-defined)."""

    @abstractmethod
    def finetune(self, dataset: Dataset, config: TrainConfig, callbacks: Callbacks,
                 output_dir: str, eval_dataset: Dataset | None = None,
                 reward_fns: list | None = None) -> FinetuneResult:
        """Finetune on `dataset` and return a `FinetuneResult` (subclass-defined)."""

    @abstractmethod
    def generate(self, prompt: str, *, max_new_tokens: int, temperature: float,
                 top_p: float, **kwargs) -> str:
        """Generate text for `prompt` and return it (subclass-defined)."""

    def chat(self, messages: list[dict], *, tools: list[dict] | None = None,
             max_new_tokens: int, temperature: float, top_p: float, **kwargs) -> str:
        """Multi-turn chat through the tokenizer's chat template, with optional
        tool schemas. Returns the raw assistant text (tool-call markup included).

        Not implemented here: the base class always raises NotImplementedError.
        """
        message = f"{self.name} backend does not implement chat()"
        raise NotImplementedError(message)

    def save(self, path: str, *, fmt: str = "adapter") -> str:
        """Save to `path` in format `fmt`.

        Not implemented here: the base class always raises NotImplementedError.
        """
        message = f"{self.name} backend does not implement save()"
        raise NotImplementedError(message)