shadowlm 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shadowlm/__init__.py +43 -0
- shadowlm/_quiet.py +23 -0
- shadowlm/accel.py +70 -0
- shadowlm/ascii.py +79 -0
- shadowlm/backends/__init__.py +64 -0
- shadowlm/backends/base.py +99 -0
- shadowlm/backends/mlx.py +884 -0
- shadowlm/backends/torch.py +863 -0
- shadowlm/bottleneck.py +128 -0
- shadowlm/capture.py +239 -0
- shadowlm/charts.py +112 -0
- shadowlm/cli.py +289 -0
- shadowlm/data.py +273 -0
- shadowlm/methods/__init__.py +54 -0
- shadowlm/methods/adapter.py +15 -0
- shadowlm/methods/base.py +82 -0
- shadowlm/methods/bitfit.py +17 -0
- shadowlm/methods/cpt.py +16 -0
- shadowlm/methods/dora.py +15 -0
- shadowlm/methods/dpo.py +17 -0
- shadowlm/methods/full.py +16 -0
- shadowlm/methods/grpo.py +26 -0
- shadowlm/methods/lora.py +14 -0
- shadowlm/methods/more.py +23 -0
- shadowlm/methods/ptuning.py +14 -0
- shadowlm/methods/qlora.py +15 -0
- shadowlm/methods/soft_prompt.py +15 -0
- shadowlm/models.py +329 -0
- shadowlm/more.py +288 -0
- shadowlm/rl.py +220 -0
- shadowlm/runs.py +77 -0
- shadowlm/training.py +332 -0
- shadowlm-0.1.0.dist-info/METADATA +491 -0
- shadowlm-0.1.0.dist-info/RECORD +38 -0
- shadowlm-0.1.0.dist-info/WHEEL +5 -0
- shadowlm-0.1.0.dist-info/entry_points.txt +2 -0
- shadowlm-0.1.0.dist-info/licenses/LICENSE +21 -0
- shadowlm-0.1.0.dist-info/top_level.txt +1 -0
shadowlm/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""ShadowLM Trainer — a fine-tuning SDK.
|
|
2
|
+
|
|
3
|
+
Any open model. Any harness. Any method.
|
|
4
|
+
|
|
5
|
+
import shadowlm as slm
|
|
6
|
+
|
|
7
|
+
ds = slm.Dataset.from_jsonl("data.jsonl").as_chat()
|
|
8
|
+
model = slm.load("mlx-community/Qwen2.5-0.5B-Instruct-4bit")
|
|
9
|
+
run = model.finetune(ds, method="lora", max_steps=60)
|
|
10
|
+
print(run.loss, run.sparkline())
|
|
11
|
+
print(model.generate("Hello!"))
|
|
12
|
+
model.save("out/", fmt="adapter")
|
|
13
|
+
|
|
14
|
+
datasets → finetune → inference. mlx on Apple Silicon, torch on CUDA (or CPU)
|
|
15
|
+
— accelerated by the shadow layer.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from . import methods, runs
|
|
19
|
+
from .capture import CaptureProxy, capture
|
|
20
|
+
from .data import Dataset
|
|
21
|
+
from .models import Model, Reply, load
|
|
22
|
+
from .rl import Trajectory, TrajectoryGroup, judge_group
|
|
23
|
+
from .training import Metric, TrainConfig, TrainingRun
|
|
24
|
+
|
|
25
|
+
# Package version string.
__version__ = "0.1.0"

# Public API of the package: every name re-exported by the imports above, plus
# the version string. Kept as a literal list, grouped by the module it comes from.
__all__ = [
    # from .capture
    "CaptureProxy",
    "capture",
    # from .data
    "Dataset",
    # from .models
    "Model",
    "Reply",
    "load",
    # submodules exposed as attributes
    "methods",
    "runs",
    # from .training
    "Metric",
    "TrainConfig",
    "TrainingRun",
    # from .rl
    "Trajectory",
    "TrajectoryGroup",
    "judge_group",
    # package metadata
    "__version__",
]
|
shadowlm/_quiet.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Swallow a backend's own stdout/stderr so shadowLM owns the console.
|
|
2
|
+
|
|
3
|
+
Backends (mlx-lm, transformers, huggingface_hub) print their own progress lines
|
|
4
|
+
and tqdm bars. shadowLM prints its own clean output to the real terminal
|
|
5
|
+
(`sys.__stdout__`, captured in models.py), which these redirects don't touch — so
|
|
6
|
+
we silence the backend chatter without losing our own. `SHADOWLM_DEBUG=1` shows
|
|
7
|
+
the raw output.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import contextlib
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@contextlib.contextmanager
def quiet_backend():
    """Send Python-level stdout/stderr to the null device inside the ``with`` block.

    While the block runs, ``sys.stdout`` and ``sys.stderr`` are redirected to
    ``os.devnull``; both are restored (and the null device closed) on exit,
    including when the body raises.

    Setting the ``SHADOWLM_DEBUG`` environment variable disables the redirect so
    the raw output is shown. The variable is read each time the context is
    entered. Empty values and the usual "off" spellings (``0``, ``false``,
    ``no``, ``off``; case-insensitive) count as *not* set, so
    ``SHADOWLM_DEBUG=0`` keeps the console quiet instead of enabling debug.
    """
    flag = os.environ.get("SHADOWLM_DEBUG", "").strip().lower()
    if flag not in ("", "0", "false", "no", "off"):
        # Debug mode: leave the streams alone.
        yield
        return
    with open(os.devnull, "w") as devnull, \
            contextlib.redirect_stdout(devnull), contextlib.redirect_stderr(devnull):
        yield
|
shadowlm/accel.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""The shadow accelerator — shadowLM's in-house training optimization layer.
|
|
2
|
+
|
|
3
|
+
It sits on top of whichever backend is active (mlx / torch) and turns on the
|
|
4
|
+
memory and throughput optimizations that are *safe for the current model and
|
|
5
|
+
hardware*: gradient checkpointing, fused attention kernels, fused optimizers.
|
|
6
|
+
|
|
7
|
+
Modes:
|
|
8
|
+
"none" off — plain training
|
|
9
|
+
"shadow" force every applicable optimization on
|
|
10
|
+
"auto" enable what actually helps at the current size (the default)
|
|
11
|
+
|
|
12
|
+
`plan()` is pure and side-effect free; each backend reads the returned `ShadowPlan`
|
|
13
|
+
and applies the flags it understands.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
|
|
20
|
+
# Below this layer count, the optimizations cost more overhead than they save, so
|
|
21
|
+
# "auto" leaves them off (forcing "shadow" still turns them on).
|
|
22
|
+
_BIG_MODEL_LAYERS = 24
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class ShadowPlan:
    """The set of shadow optimizations chosen for one training run.

    Each boolean flag names one optimization; `enabled` carries the matching
    human-readable labels in the order they were switched on.
    """

    grad_checkpoint: bool = False
    flash_attention: bool = False
    fused_optimizer: bool = False
    # Fresh list per instance (default_factory), so plans never share labels.
    enabled: list[str] = field(default_factory=list)

    @property
    def active(self) -> bool:
        """True when at least one optimization label has been recorded."""
        return True if self.enabled else False

    @property
    def note(self) -> str:
        """One-line status message describing what the plan turned on."""
        labels = self.enabled
        if labels:
            return "[shadow] enabled: " + ", ".join(labels)
        return "[shadow] no extra optimizations needed at this size"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def plan(mode: str, *, backend: str, n_layers: int = 0, has_flash: bool = False) -> ShadowPlan:
    """Work out which shadow optimizations a run should use.

    mode: "none" | "shadow" | "auto". backend: "mlx" selects the mlx rules; any
    other value is handled by the torch rules. n_layers: the model's
    transformer-block count. has_flash: whether a flash-attention kernel is
    importable (only consulted on the torch path).

    Returns a new `ShadowPlan`; raises ValueError for an unrecognised mode.
    """
    if mode not in ("none", "shadow", "auto"):
        raise ValueError(f"unknown accelerator {mode!r} (expected auto|shadow|none)")

    chosen = ShadowPlan()
    if mode == "none":
        return chosen

    forced = mode == "shadow"
    large = n_layers >= _BIG_MODEL_LAYERS
    heavy = forced or large  # "shadow" forces the size-gated options on
    on_mlx = backend == "mlx"

    # Flash attention is offered on the torch path only, and is not size-gated.
    if has_flash and not on_mlx:
        chosen.flash_attention = True
        chosen.enabled.append("flash-attention-2")

    if heavy:
        chosen.grad_checkpoint = True
        chosen.enabled.append("gradient checkpointing")
        if not on_mlx:
            chosen.fused_optimizer = True
            chosen.enabled.append("fused optimizer")

    return chosen
|
shadowlm/ascii.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Startup banner for shadow training."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import functools
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
_HEART = r"""
|
|
9
|
+
╔══════════════════════════════════════════════════════╗
|
|
10
|
+
║ ♥ ♥♥ ║
|
|
11
|
+
║ ♥♥♥ ♥♥♥♥ ║
|
|
12
|
+
║ ♥♥♥♥♥ ♥♥♥♥♥ ║
|
|
13
|
+
║ ♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥ ║
|
|
14
|
+
║ ♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥ ║
|
|
15
|
+
║ ♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥ ║
|
|
16
|
+
║ ♥♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
17
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
18
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
19
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥ ♥♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
20
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
21
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
22
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
23
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
24
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
25
|
+
║ ♥♥♥♥♥♥♥♥♥♥♥♥♥ ║
|
|
26
|
+
║ ♥♥♥♥♥♥♥ ║
|
|
27
|
+
║ ♥♥ ║
|
|
28
|
+
║ ║
|
|
29
|
+
╚══════════════════════════════════════════════════════╝"""
|
|
30
|
+
|
|
31
|
+
_NAME = r"""
|
|
32
|
+
███████╗ ██╗ ██╗ ██╗ ███╗ ███╗
|
|
33
|
+
██╔════╝ ██║ ██║ ██║ ████╗ ████║
|
|
34
|
+
███████╗ ███████╗ █████╗ ██████║ █████╗ ██╗ █╗ ██╗ ██║ ██╔████╔██║
|
|
35
|
+
╚════██║ ██╔══██╗ ██╔══██║ ██╔══██║ ██╔══██╗ ██║███╗██║ ██║ ██║╚██╔╝██║
|
|
36
|
+
███████║ ██║ ██║ ╚██████║ ╚██████║ ╚█████╔╝ ╚███╔███╔╝ ███████╗ ██║ ╚═╝ ██║
|
|
37
|
+
╚══════╝ ╚═╝ ╚═╝ ╚═════╝ ╚═════╝ ╚════╝ ╚══╝╚══╝ ╚══════╝ ╚═╝ ╚═╝
|
|
38
|
+
T R A I N E R
|
|
39
|
+
any open model · any harness · any method
|
|
40
|
+
from Lyzr Research Labs · slm♥
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
# The full banner prints on the first training session of the process; later
|
|
44
|
+
# sessions get a compact one-liner so sweeps don't drown in hearts.
|
|
45
|
+
_sessions = 0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def run_on_main_rank(fn):
    """Decorator: call `fn` only when this process is rank 0.

    The rank is read on every call from the `RANK` environment variable,
    falling back to `LOCAL_RANK`, and finally to 0 when neither is set (or set
    to an empty string). A value that cannot be parsed as an integer is treated
    as rank 0. On any other rank the wrapper does nothing and returns None.
    """

    @functools.wraps(fn)
    def main_rank_only(*args, **kwargs):
        raw = os.environ.get("RANK") or os.environ.get("LOCAL_RANK") or 0
        try:
            rank = int(raw)
        except ValueError:  # launcher set the rank to something non-numeric
            rank = 0
        return fn(*args, **kwargs) if rank == 0 else None

    return main_rank_only
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@run_on_main_rank
def print_ascii_art(*, once: bool = True) -> None:
    """Print the ShadowLM Trainer banner.

    Every call bumps the module-level session counter. The first call prints the
    full art; later calls print a compact one-line header instead, unless
    `once` is False, in which case the full art is printed every time.
    """
    global _sessions
    _sessions += 1

    show_full_art = _sessions <= 1 or not once
    if show_full_art:
        print(_HEART)
        print(_NAME)
        print("Starting training session...\n")
    else:
        print(f"\nslm♥ ShadowLM Trainer · training session #{_sessions}\n")
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Backend selection.
|
|
2
|
+
|
|
3
|
+
`select_backend("auto")` picks the backend for the current hardware:
|
|
4
|
+
CUDA → torch, else Apple Silicon → mlx, else torch on CPU. If no backend is
|
|
5
|
+
installed, `load()` says what to install. Force one with backend="mlx" / "torch"
|
|
6
|
+
(and device="cpu" to pin torch to the CPU).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .base import Backend, Callbacks, FinetuneResult
|
|
12
|
+
|
|
13
|
+
__all__ = ["Backend", "Callbacks", "FinetuneResult", "select_backend"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _mlx_available() -> bool:
    """Return what `MLXBackend.is_available()` reports.

    The backend module is imported here, at call time, rather than at package
    import time.
    """
    from .mlx import MLXBackend as backend_cls

    return backend_cls.is_available()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _torch_available() -> bool:
    """Return what `TorchBackend.is_available()` reports.

    The backend module is imported here, at call time, rather than at package
    import time.
    """
    from .torch import TorchBackend as backend_cls

    return backend_cls.is_available()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _no_backend() -> RuntimeError:
|
|
27
|
+
return RuntimeError(
|
|
28
|
+
"No training backend available. Install one:\n"
|
|
29
|
+
" • Apple Silicon: pip install shadowlm[mlx]\n"
|
|
30
|
+
" • CUDA / CPU: pip install shadowlm[torch]"
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def select_backend(name: str = "auto", *, accelerator: str = "auto",
                   device: str = "auto") -> Backend:
    """Instantiate the backend called `name` (case-insensitive; falsy means "auto").

    "auto" resolves in this order: torch when `TorchBackend.has_cuda()` is true,
    else mlx when it is available, else torch pinned to device "cpu". If none of
    those apply, the "no backend" RuntimeError is raised.

    Raises RuntimeError when the requested backend reports itself unavailable,
    and ValueError for a name other than auto | mlx | torch.
    """
    choice = (name or "auto").lower()

    if choice == "auto":
        from .torch import TorchBackend as torch_cls

        if torch_cls.has_cuda():
            choice = "torch"
        elif _mlx_available():
            choice = "mlx"
        elif _torch_available():
            # CPU fallback: same torch backend, device forced to "cpu".
            choice, device = "torch", "cpu"
        else:
            raise _no_backend()

    if choice not in ("mlx", "torch"):
        raise ValueError(f"unknown backend {choice!r} (expected auto|mlx|torch)")

    if choice == "mlx":
        from .mlx import MLXBackend

        if MLXBackend.is_available():
            return MLXBackend(accelerator=accelerator)
        raise RuntimeError("mlx backend needs Apple Silicon + shadowlm[mlx].")

    # choice == "torch". CPU is just this backend with device="cpu"
    # (there is no separate "cpu" backend).
    from .torch import TorchBackend

    if TorchBackend.is_available():
        return TorchBackend(device=device, accelerator=accelerator)
    raise RuntimeError(
        "torch backend needs shadowlm[torch] (torch, transformers, trl, peft, datasets)."
    )
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""The backend contract.
|
|
2
|
+
|
|
3
|
+
A `Backend` is a stateful object that holds a loaded model and knows how to
|
|
4
|
+
finetune it and generate from it. Everything user-facing (`Model`, `load`,
|
|
5
|
+
`finetune`, `generate`) is backend-agnostic — swap mlx ↔ torch without touching
|
|
6
|
+
the SDK surface.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from typing import Callable
|
|
13
|
+
|
|
14
|
+
from ..data import Dataset
|
|
15
|
+
from ..training import Metric, TrainConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Callbacks:
    """Carries the caller's hooks into a backend's training loop.

    All four hooks are optional. `step`, `eval` and `log` forward their
    argument to the matching hook when one was supplied and otherwise do
    nothing; `stopped` asks the `should_stop` hook whether to cancel.
    """

    def __init__(
        self,
        on_step: Callable[[Metric], None] | None = None,
        on_log: Callable[[str], None] | None = None,
        on_eval: Callable[[Metric], None] | None = None,
        should_stop: Callable[[], bool] | None = None,
    ) -> None:
        self._on_step, self._on_log = on_step, on_log
        self._on_eval, self._should_stop = on_eval, should_stop

    @staticmethod
    def _dispatch(hook, payload) -> None:
        """Call `hook(payload)` if a (truthy) hook was supplied."""
        if hook:
            hook(payload)

    def step(self, metric: Metric) -> None:
        """Forward a training-step metric to `on_step`."""
        self._dispatch(self._on_step, metric)

    def eval(self, metric: Metric) -> None:
        """Forward an evaluation metric to `on_eval`."""
        self._dispatch(self._on_eval, metric)

    def log(self, message: str) -> None:
        """Forward a status line to `on_log`."""
        self._dispatch(self._on_log, message)

    def stopped(self) -> bool:
        """Return True when a `should_stop` hook exists and returns a truthy value."""
        probe = self._should_stop
        if not probe:
            return False
        return bool(probe())
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class FinetuneResult:
    """What a backend returns from `finetune`: where the adapter/model landed.

    Attributes:
        checkpoint: location of the saved adapter/model, as given by the backend.
        final_loss: the last loss value the backend reported, or None.
    """

    def __init__(self, checkpoint: str, final_loss: float | None) -> None:
        self.checkpoint = checkpoint
        self.final_loss = final_loss

    def __repr__(self) -> str:
        # Readable in logs and debuggers instead of the default object address.
        return (
            f"{type(self).__name__}(checkpoint={self.checkpoint!r}, "
            f"final_loss={self.final_loss!r})"
        )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class Backend(ABC):
    """Abstract base for training/inference backends.

    Subclasses must provide `load`, `finetune` and `generate`. `chat` and
    `save` are optional: the defaults here raise NotImplementedError naming the
    backend.
    """

    name: str = "base"

    def __init__(self, *, device: str = "auto", accelerator: str = "auto") -> None:
        # accelerator selects the shadow optimization layer: none | shadow | auto.
        self.accelerator = accelerator
        # device interpretation is backend-specific: mlx → gpu, torch → cuda|cpu.
        self.device = device
        self.model_name: str | None = None

    @classmethod
    def is_available(cls) -> bool:
        """Whether this backend can actually run in the current environment.

        The base implementation always answers True.
        """
        return True

    @abstractmethod
    def load(self, name: str, *, load_in_4bit: bool, max_seq_length: int,
             adapter: str | None = None, **kwargs) -> None:
        """Load the model called `name`. Must be implemented by subclasses."""

    @abstractmethod
    def finetune(self, dataset: Dataset, config: TrainConfig, callbacks: Callbacks,
                 output_dir: str, eval_dataset: Dataset | None = None,
                 reward_fns: list | None = None) -> FinetuneResult:
        """Fine-tune on `dataset` and return a `FinetuneResult`.

        Must be implemented by subclasses.
        """

    @abstractmethod
    def generate(self, prompt: str, *, max_new_tokens: int, temperature: float,
                 top_p: float, **kwargs) -> str:
        """Generate text for `prompt`. Must be implemented by subclasses."""

    def chat(self, messages: list[dict], *, tools: list[dict] | None = None,
             max_new_tokens: int, temperature: float, top_p: float, **kwargs) -> str:
        """Multi-turn chat through the tokenizer's chat template, with optional
        tool schemas. Returns the raw assistant text (tool-call markup included).

        The base implementation raises NotImplementedError.
        """
        message = f"{self.name} backend does not implement chat()"
        raise NotImplementedError(message)

    def save(self, path: str, *, fmt: str = "adapter") -> str:
        """Save to `path` in format `fmt`.

        The base implementation raises NotImplementedError.
        """
        message = f"{self.name} backend does not implement save()"
        raise NotImplementedError(message)
|