sarasa 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sarasa/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .config import Config as Config
2
+ from .trainer import Trainer as Trainer
@@ -0,0 +1,81 @@
1
+ from collections import defaultdict
2
+
3
+ import torch
4
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper
5
+ from torch.utils.checkpoint import CheckpointPolicy, create_selective_checkpoint_contexts
6
+
7
# for selective op activation checkpointing: default set of ops whose
# outputs are saved during forward rather than recomputed in backward.
# Matmuls and SDPA variants dominate recompute cost, so saving them is
# usually the right memory/compute trade-off.
_ops_sac_save = {
    torch.ops.aten.mm.default,
    torch.ops.aten._scaled_dot_product_efficient_attention.default,
    torch.ops.aten._scaled_dot_product_flash_attention.default,
    torch.ops.aten._scaled_dot_product_cudnn_attention.default,
    torch.ops.aten._scaled_dot_product_attention_math.default,
    torch.ops.aten._scaled_dot_product_fused_attention_overrideable.default,
    # collective output — recomputing it would re-issue communication
    torch.ops._c10d_functional.reduce_scatter_tensor.default,
    # for low precision training, it's useful to always save
    # the result of max, since the absolute maximum is
    # used to compute the scaling factor for quantization.
    torch.ops.aten.max.default,
    torch._higher_order_ops.inductor_compiled_code,
}
22
+
23
+
24
def _op_sac_policy(
    ops_to_save: set,
    mm_recompute_shapes: set | None,
    every_nth_mm: int,
):
    """Build a context-fn factory implementing a selective-checkpoint policy.

    Returns a zero-arg callable suitable as ``context_fn`` for activation
    checkpointing: each call creates fresh selective-checkpoint contexts whose
    policy saves ops in ``ops_to_save``, except that only every
    ``every_nth_mm``-th mm is saved and mms whose second operand's shape is in
    ``mm_recompute_shapes`` are always recomputed.
    """
    mm_recompute_shapes = mm_recompute_shapes or set()

    def _get_custom_policy(meta: dict):
        # ``meta`` accumulates per-context mm counters, keyed separately for
        # the forward and recompute passes so both passes count identically.
        def _custom_policy(ctx, func, *args, **kwargs):
            # special case, offload to CPU: a cuda->cpu _to_copy is treated as
            # an offload and its result is always saved
            if (
                func == torch.ops.aten._to_copy.default
                and "cuda" in str(args[0].device)
                and str(kwargs.get("device", "")) == "cpu"
            ):
                return CheckpointPolicy.MUST_SAVE

            # track mm ops (separately per pass; see meta above)
            mode = "recompute" if ctx.is_recompute else "forward"
            key = f"{mode}_mm_count"

            if func == torch.ops.aten.mm.default:
                if len(args) > 1 and args[1].shape in mm_recompute_shapes:
                    # always recompute mms with flagged RHS shapes
                    # (presumably small projections like an MoE router — cheap
                    # to recompute; confirm against callers)
                    return CheckpointPolicy.PREFER_RECOMPUTE
                meta[key] += 1

            # save ops in save list, except every nth mm op
            must_save = (func in ops_to_save) and not (
                func == torch.ops.aten.mm.default and (meta[key] % every_nth_mm == 0)
            )
            return CheckpointPolicy.MUST_SAVE if must_save else CheckpointPolicy.PREFER_RECOMPUTE

        return _custom_policy

    def selective_checkpointing_context_fn():
        # fresh counters per checkpointed region / invocation
        return create_selective_checkpoint_contexts(_get_custom_policy(defaultdict(int)))

    return selective_checkpointing_context_fn
63
+
64
+
65
def apply_op_sac(
    model: torch.nn.Module,
    ops_to_save: set | None = None,
    mm_recompute_shapes: set | None = None,
    every_nth_mm: int = 2,
) -> torch.nn.Module:
    """Applies selective op activation checkpointing to the given model.

    Ops like mm are expensive, so we want to store their activations for
    backward. On the other hand, ops like activation functions are cheap,
    so we prefer to recompute them.

    Args:
        model: module to wrap.
        ops_to_save: ops whose outputs are saved; defaults to
            ``_ops_sac_save`` (an empty set also falls back to the default).
        mm_recompute_shapes: RHS shapes of mm ops to always recompute.
        every_nth_mm: save only every n-th mm; the rest are recomputed.

    Returns:
        The checkpoint-wrapped module.
    """
    ops_to_save = ops_to_save or _ops_sac_save
    # The policy factory must be passed as ``context_fn`` (forwarded to
    # torch.utils.checkpoint.checkpoint). Passing it positionally binds it
    # to ``checkpoint_wrapper``'s second parameter (``checkpoint_impl``),
    # which silently disables the selective policy.
    return checkpoint_wrapper(
        model,
        context_fn=_op_sac_policy(ops_to_save, mm_recompute_shapes, every_nth_mm),
    )
sarasa/checkpoint.py ADDED
@@ -0,0 +1,112 @@
1
+ import enum
2
+ import gc
3
+ import time
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ import torch.distributed as dist
8
+ import torch.distributed.checkpoint as dcp
9
+ from loguru import logger
10
+ from torch.distributed.checkpoint.staging import DefaultStager, StagingOptions
11
+ from torch.distributed.checkpoint.state_dict import get_model_state_dict
12
+ from torch.distributed.checkpoint.state_dict_saver import AsyncCheckpointerType
13
+ from torch.distributed.checkpoint.stateful import Stateful
14
+
15
+ from sarasa.config import Config
16
+
17
+
18
+ class AsyncMode(enum.StrEnum):
19
+ none = enum.auto()
20
+ default = enum.auto()
21
+ mem_pinned = enum.auto()
22
+
23
+
24
+ class ModelWrapper(Stateful):
25
+ def __init__(self, model: torch.nn.Module):
26
+ self.model = model
27
+
28
+ def state_dict(self) -> dict[str, torch.Tensor]:
29
+ return {"model": get_model_state_dict(self.model)}
30
+
31
+ def load_state_dict(self, state_dict: dict[str, torch.Tensor]) -> None:
32
+ raise NotImplementedError("...")
33
+
34
+
35
class Checkpointer:
    """Periodically saves model checkpoints via torch.distributed.checkpoint.

    Supports synchronous saves as well as two async modes (see ``AsyncMode``):
    plain ``dcp.async_save`` and process-based saving through a pinned-memory
    stager. At most one async save is in flight at a time.
    """

    def __init__(
        self,
        config: Config,
        model: torch.nn.Module,
    ):
        self.config = config
        self.checkpoint_freq = config.checkpoint.save_freq
        self.checkpoint_dir = Path(config.output_dir) / "checkpoints"
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        self.async_mode = AsyncMode(config.checkpoint.async_mode)
        if self.async_mode != AsyncMode.none:
            # gloo group for checkpoint coordination; ``self.pg`` only exists
            # in async modes — the sync path below never reads it.
            self.pg = dist.new_group(backend="gloo") if dist.is_initialized() else None

        self.stager = None  # lazily created for mem_pinned mode
        self.save_future = None  # completion of the in-flight async save
        self.stage_future = None  # completion of staging (mem_pinned only)

        self.state = ModelWrapper(model)

    @torch.no_grad()
    def save(
        self,
        step: int,
    ) -> None:
        """Save a checkpoint for ``step`` if it is on the save-frequency grid.

        No-ops when ``step`` is not a multiple of ``checkpoint_freq``. In
        async modes, blocks first on the previous save's completion.
        """
        if step % self.checkpoint_freq != 0:
            return

        begin = time.perf_counter()
        checkpoint_id = str(self.checkpoint_dir / f"checkpoint_{step:09d}")

        # todo: save other states
        state_dict = self.state.state_dict()

        if self.async_mode == AsyncMode.default:
            # gc around async_save to avoid a GC pause mid-save
            gc.collect(1)
            if self.save_future is not None:
                # wait for the previous async save before starting a new one
                self.save_future.result()
            self.save_future = dcp.async_save(
                state_dict,
                storage_writer=None,
                checkpoint_id=checkpoint_id,
                process_group=self.pg,
            )
            gc.collect(1)
        elif self.async_mode == AsyncMode.mem_pinned:
            gc.collect(1)
            if self.save_future is not None:
                self.save_future.result()
            if self.stager is None:
                # NOTE(review): positional StagingOptions flags — presumably
                # enabling pinned-memory staging features; confirm against
                # the StagingOptions signature for this torch version.
                self.stager = DefaultStager(StagingOptions(True, True, True, True))
            ret = dcp.async_save(
                state_dict,
                storage_writer=None,
                checkpoint_id=checkpoint_id,
                process_group=self.pg,
                async_checkpointer_type=AsyncCheckpointerType.PROCESS,
                async_stager=self.stager,
            )
            # upload runs in a background process; staging must finish before
            # training mutates the weights (see wait_for_staging).
            self.save_future = ret.upload_completion
            self.stage_future = ret.staging_completion
        else:
            # synchronous save: blocks until the checkpoint is on disk
            ret = dcp.save(
                state_dict,
                storage_writer=None,
                checkpoint_id=checkpoint_id,
            )

        logger.info(f"Finished saving checkpoint at step {step} in {time.perf_counter() - begin:.2f} seconds")

    def wait_for_staging(self) -> None:
        """Block until the in-flight staging copy has finished."""
        # no-op if not using mem_pinned async mode
        if self.stage_future is not None:
            self.stage_future.result()

    def close(self) -> None:
        """Release stager resources (background process / pinned buffers)."""
        if self.stager is not None:
            self.stager.close()
sarasa/config.py ADDED
@@ -0,0 +1,279 @@
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Literal
7
+
8
+ import torch
9
+
10
+ """
11
+ Variable configuration dataclasses for model, optimizer, lr scheduler, and data
12
+ These classes have `create` methods to instantiate the actual objects
13
+
14
+ Users can define their own configuration dataclasses and pass them to Config.from_cli to use custom components
15
+ """
16
+ from sarasa.data import DataConfig as Data # noqa
17
+ from sarasa.models import ModelConfig as Model # noqa
18
+ from sarasa.optimizers import AdamW # noqa
19
+
20
+
21
+ @dataclasses.dataclass
22
+ class LRScheduler:
23
+ warmup_steps: int = 200
24
+ decay_ratio: float | None = None
25
+ """If set, the ratio of total steps to apply decay after warmup. If None, decay starts immediately after warmup."""
26
+
27
+ decay_type: Literal["linear", "cosine", "sqrt"] = "linear"
28
+ min_lr_factor: float = 0.0
29
+
30
+ def create(
31
+ self,
32
+ optimizer: torch.optim.Optimizer,
33
+ total_iters: int,
34
+ ) -> torch.optim.lr_scheduler._LRScheduler:
35
+ assert self.decay_ratio is None or (0 <= self.decay_ratio <= 1), "decay_ratio must be between 0 and 1"
36
+ warmup_steps = self.warmup_steps
37
+ stay_steps = 0 if self.decay_ratio is None else int(total_iters * (1 - self.decay_ratio)) - warmup_steps
38
+ decay_steps = total_iters - warmup_steps - stay_steps
39
+ assert warmup_steps >= 0 and decay_steps >= 0 and stay_steps >= 0, (
40
+ f"Invalid lr scheduler steps configuration: {warmup_steps=}, {decay_steps=}, {stay_steps=}"
41
+ )
42
+
43
+ # 1 / max(1, warmup_steps) to avoid division by zero
44
+ warmup = torch.optim.lr_scheduler.LinearLR(optimizer, 1 / max(1, warmup_steps), total_iters=warmup_steps)
45
+
46
+ stay = torch.optim.lr_scheduler.ConstantLR(optimizer=optimizer, factor=1.0, total_iters=stay_steps)
47
+
48
+ match self.decay_type:
49
+ case "linear":
50
+ decay = torch.optim.lr_scheduler.LinearLR(
51
+ optimizer,
52
+ start_factor=1.0,
53
+ end_factor=self.min_lr_factor,
54
+ total_iters=decay_steps,
55
+ )
56
+ case "sqrt":
57
+ decay = torch.optim.lr_scheduler.LambdaLR(
58
+ optimizer,
59
+ lr_lambda=lambda step: max(
60
+ self.min_lr_factor,
61
+ (decay_steps - step) / decay_steps,
62
+ )
63
+ ** 0.5,
64
+ )
65
+ case "cosine":
66
+ decay = torch.optim.lr_scheduler.CosineAnnealingLR(
67
+ optimizer,
68
+ T_max=decay_steps,
69
+ eta_min=optimizer.param_groups[0]["lr"] * self.min_lr_factor,
70
+ )
71
+
72
+ scheduler = torch.optim.lr_scheduler.SequentialLR(
73
+ optimizer,
74
+ [warmup, stay, decay],
75
+ [self.warmup_steps, self.warmup_steps + stay_steps],
76
+ )
77
+ return scheduler
78
+
79
+
80
+ """
81
+ Static configuration dataclasses
82
+
83
+ These classes are not expected to be changed by the user
84
+ """
85
+
86
+
87
@dataclasses.dataclass
class Train:
    """Core training-loop settings (steps, precision, batching)."""

    # total number of optimizer steps to train for
    steps: int = 10_000

    # max gradient norm for clipping; None disables clipping
    grad_clip: float | None = None

    # parameter/compute dtype for training
    dtype: Literal["bfloat16", "float32"] = "float32"

    # whether to torch.compile the model
    compile: bool = False

    gc_freq: int = 50
    """Garbage collection frequency (in steps). If -1, no periodic GC is performed."""

    local_batch_size: int = 32
    """local (per device) batch size"""

    global_batch_size: int = 256
    """
    global (across all devices) batch size, used to compute
    grad_accum_steps = global_batch_size // (local_batch_size * num_devices)
    """

    use_fa4: bool = True
    """Whether to use FA4 flash attention if available."""

    val_freq: int = -1
    """Validation frequency (in steps). If -1, no validation is performed."""

    use_sac: bool = False
    """Whether to use selective activation checkpointing."""
117
+
118
+
119
@dataclasses.dataclass
class Metrics:
    """Metric logging settings."""

    # log metrics every N steps
    log_freq: int = 10
    # write metrics to TensorBoard in addition to the console
    use_tensorboard: bool = False
    # presumably: collect/log metrics from every node rather than rank 0 only
    # — TODO confirm against the trainer's usage
    all_node: bool = False
124
+
125
+
126
@dataclasses.dataclass
class Checkpoint:
    """Checkpoint saving settings (consumed by sarasa.checkpoint.Checkpointer)."""

    # save a checkpoint every N steps
    save_freq: int = 1000
    # how saving overlaps with training: "none" (sync), "default"
    # (async), or "mem_pinned" (async via pinned host memory)
    async_mode: Literal["none", "default", "mem_pinned"] = "default"
130
+
131
+
132
@dataclasses.dataclass
class Distributed:
    """Base settings for distributed training; subclasses select the strategy."""

    backend: Literal["nccl", "gloo"] = "nccl"

    init_timeout_seconds: int = 300
    """Timeout for initializing the distributed process group."""

    train_timeout_seconds: int = 100
    """Timeout for distributed training operations after the first iteration."""

    @property
    def name(self) -> str:
        # strategy identifier derived from the subclass name, e.g. "ddp"/"fsdp"
        return self.__class__.__name__.lower()
145
+
146
+
147
@dataclasses.dataclass
class DDP(Distributed):
    """DistributedDataParallel strategy; no extra options beyond the base."""

    pass
150
+
151
+
152
@dataclasses.dataclass
class FSDP(Distributed):
    """Fully Sharded Data Parallel strategy."""

    reshard_after_forward: bool = False
    """Whether to reshard model parameters after each forward pass (FSDP only)."""
156
+
157
+
158
+ @dataclasses.dataclass
159
+ class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:
160
+ # variable components
161
+ model: ModelT
162
+ optim: OptimizerT
163
+ lr_scheduler: LRSchedulerT
164
+ data: DataT
165
+
166
+ # static components
167
+ train: Train = dataclasses.field(default_factory=Train)
168
+ metrics: Metrics = dataclasses.field(default_factory=Metrics)
169
+ checkpoint: Checkpoint = dataclasses.field(default_factory=Checkpoint)
170
+ distributed: DDP | FSDP = dataclasses.field(default_factory=DDP)
171
+
172
+ seed: int = 0
173
+ debug: bool = False
174
+ """ Enable debug mode with more verbose logging and checks."""
175
+
176
+ output_dir: Path | str = Path("./outputs")
177
+ """Directory to save checkpoints and logs."""
178
+
179
+ config_file: Path | str | None = None
180
+ """Path to a config file (JSON or TOML) to load configuration from."""
181
+
182
+ def __post_init__(self):
183
+ if self.output_dir is not None:
184
+ self.output_dir.mkdir(parents=True, exist_ok=True)
185
+
186
+ if hasattr(self.model, "seq_len") and self.model.seq_len is None:
187
+ if self.data.seq_len is not None:
188
+ self.model.seq_len = self.data.seq_len
189
+ else:
190
+ raise ValueError("Either model.seq_len or data.seq_len must be set.")
191
+
192
+ @classmethod
193
+ def create(
194
+ cls,
195
+ model: ModelT,
196
+ optim: OptimizerT,
197
+ lr_scheduler: LRSchedulerT,
198
+ data: DataT,
199
+ **kwargs,
200
+ ) -> Config:
201
+ return cls(
202
+ model=model,
203
+ optim=optim,
204
+ lr_scheduler=lr_scheduler,
205
+ data=data,
206
+ **kwargs,
207
+ )
208
+
209
+ @classmethod
210
+ def from_cli(
211
+ cls,
212
+ *,
213
+ model_type: ModelT = Model,
214
+ optim_type: OptimizerT = AdamW,
215
+ lr_scheduler_type: LRSchedulerT = LRScheduler,
216
+ data_type: DataT = Data,
217
+ ) -> Config:
218
+ """
219
+ initialize JobConfig from command line arguments
220
+ update the values with the following priority: CLI arguments > config file > defaults
221
+
222
+ *_type can be used to specify custom dataclass types for each section
223
+ >> config = Config.from_cli(optim_type=CustomOptimizerConfig)
224
+ """
225
+
226
+ import importlib.util
227
+
228
+ import tyro
229
+
230
+ loaded_config = None
231
+
232
+ if (under := ("--config_file" in sys.argv)) or ("--config-file" in sys.argv):
233
+ config_file = sys.argv[sys.argv.index("--config_file" if under else "--config-file") + 1]
234
+ config_file = Path(config_file)
235
+
236
+ if not config_file.exists():
237
+ raise FileNotFoundError(f"Config file {config_file} does not exist.")
238
+
239
+ if config_file.suffix != ".py":
240
+ raise ValueError("Only Python config files are supported in this method.")
241
+
242
+ spec = importlib.util.spec_from_file_location("custom_config", config_file)
243
+ module = importlib.util.module_from_spec(spec)
244
+ spec.loader.exec_module(module)
245
+ configs = [
246
+ config
247
+ for config in module.__dict__.values()
248
+ if isinstance(config, cls) and not isinstance(config, type)
249
+ ]
250
+ if len(configs) == 0:
251
+ raise ValueError(f"No Config instance found in {config_file}.")
252
+ elif len(configs) > 1:
253
+ raise ValueError(f"Multiple Config instances found in {config_file}. Please keep only one.")
254
+ else:
255
+ loaded_config = configs[0]
256
+
257
+ return tyro.cli(
258
+ cls[
259
+ model_type,
260
+ optim_type,
261
+ lr_scheduler_type,
262
+ data_type,
263
+ ],
264
+ default=loaded_config,
265
+ )
266
+
267
+
268
+ __all__ = [
269
+ "Config",
270
+ "Model",
271
+ "AdamW",
272
+ "LRScheduler",
273
+ "Data",
274
+ "Train",
275
+ "Metrics",
276
+ "Checkpoint",
277
+ "DDP",
278
+ "FSDP",
279
+ ]
@@ -0,0 +1,36 @@
1
+ import dataclasses
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from torch.utils.data import DataLoader
6
+
7
+ from sarasa.data.hf_datasets import Datasets, HFTextDataset
8
+ from sarasa.data.tokenizer import HFTokenizerWrapper
9
+
10
+
11
@dataclasses.dataclass
class DataConfig:
    dataset: Datasets = Datasets.fineweb_edu_100b
    """Dataset to use for training. Can be a predefined dataset or a custom dataset path."""

    tokenizer_path: Path | str = Path("./tokenizer")
    """Path to `tokenizer.json` and `tokenizer_config.json` files."""

    seq_len: int = 2048

    num_workers: int = 4
    pin_memory: bool = True

    def create(
        self,
        batch_size: int,
    ) -> dict[str, Any]:
        """Build the tokenizer and the training DataLoader.

        Returns a dict shaped like
        ``{"tokenizer": ..., "train_loader": ...}`` (a "val_loader" entry may
        be added in the future).
        """
        tokenizer = HFTokenizerWrapper(Path(self.tokenizer_path))
        train_ds = HFTextDataset(self.dataset, "train", tokenizer, self.seq_len)
        train_loader = DataLoader(
            train_ds,
            batch_size,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
        )
        return {
            "tokenizer": tokenizer,
            "train_loader": train_loader,
        }
@@ -0,0 +1,115 @@
1
+ import enum
2
+ from typing import Any, Callable
3
+
4
+ import torch
5
+ from datasets import disable_progress_bars, load_dataset
6
+ from datasets.distributed import split_dataset_by_node
7
+ from loguru import logger
8
+ from torch.utils.data import IterableDataset
9
+
10
+ from sarasa.utils import rank, world_size
11
+
12
+
13
+ class Datasets(enum.StrEnum):
14
+ c4 = enum.auto()
15
+ fineweb_edu = enum.auto()
16
+ fineweb_edu_100b = enum.auto()
17
+ fineweb_edu_dedup = enum.auto()
18
+
19
+ def load(
20
+ self,
21
+ cache_dir: str | None,
22
+ ) -> Any:
23
+ match self:
24
+ case Datasets.c4:
25
+ return load_dataset(
26
+ "allenai/c4",
27
+ name="en",
28
+ split="train",
29
+ streaming=True,
30
+ cache_dir=cache_dir,
31
+ )
32
+ case Datasets.fineweb_edu:
33
+ return load_dataset(
34
+ "HuggingFaceFW/fineweb-edu",
35
+ name="default",
36
+ split="train",
37
+ streaming=True,
38
+ cache_dir=cache_dir,
39
+ )
40
+ case Datasets.fineweb_edu_100b:
41
+ return load_dataset(
42
+ "HuggingFaceFW/fineweb-edu",
43
+ name="sample-100BT",
44
+ split="train",
45
+ streaming=True,
46
+ cache_dir=cache_dir,
47
+ )
48
+ case Datasets.fineweb_edu_dedup:
49
+ return load_dataset(
50
+ "HuggingFaceTB/smollm-corpus",
51
+ "fineweb-edu-dedup",
52
+ split="train",
53
+ streaming=True,
54
+ cache_dir=cache_dir,
55
+ )
56
+
57
+
58
class HFTextDataset(IterableDataset):
    """Streams a HF text dataset and packs tokens into fixed-length chunks.

    Each yielded item is ``({"input": x[:-1]}, x[1:])`` where ``x`` is a
    ``seq_len + 1``-token window, so inputs and next-token labels align.
    """

    def __init__(
        self,
        dataset_name: Datasets | str,
        split: str,
        tokenizer: Callable[[str], list[int]],
        seq_len: int,
        infinite: bool = True,
        cache_dir: str | None = None,
    ):
        if rank() != 0:
            disable_progress_bars()
        self.dataset_name = dataset_name
        if dataset_name in Datasets:
            ds = Datasets(dataset_name).load(cache_dir=cache_dir)

        else:
            logger.warning(f"Unknown dataset: {dataset_name}. Trying to use `load_dataset` directly.")
            ds = load_dataset(dataset_name, split=split, streaming=True, cache_dir=cache_dir)

        # each rank iterates a disjoint shard of the stream
        self.data = split_dataset_by_node(ds, rank=rank(), world_size=world_size())
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        # fix: this was never stored, so __iter__ raised AttributeError on
        # `self.infinite` once the underlying stream was exhausted
        self.infinite = infinite
        self.token_buffer: list[int] = []

    def _text_processor(
        self,
        sample: dict,
    ) -> str:
        # Default text processor: extract 'text' field
        return sample["text"]

    def __iter__(self):
        # one extra token so input (x[:-1]) and label (x[1:]) both have seq_len
        max_buffer_token_len = 1 + self.seq_len

        while True:
            for sample in iter(self.data):
                # Use the dataset-specific text processor
                sample_text = self._text_processor(sample)
                sample_tokens = self.tokenizer.encode(sample_text)
                self.token_buffer.extend(sample_tokens)

                while len(self.token_buffer) >= max_buffer_token_len:
                    x = torch.LongTensor(self.token_buffer[:max_buffer_token_len])
                    # update tokens to the remaining tokens
                    self.token_buffer = self.token_buffer[max_buffer_token_len:]
                    input = x[:-1]
                    label = x[1:]
                    yield {"input": input}, label

            if not self.infinite:
                logger.warning(f"Dataset {self.dataset_name} has run out of data")
                break
            else:
                # Reset offset for the next iteration
                logger.warning(f"Dataset {self.dataset_name} is being re-looped")
                if hasattr(self.data, "set_epoch") and hasattr(self.data, "epoch"):
                    self.data.set_epoch(self.data.epoch + 1)
@@ -0,0 +1,63 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from tokenizers import Tokenizer
5
+
6
+
7
class BaseTokenizerWrapper:
    """Minimal tokenizer interface expected by the data pipeline."""

    def encode(self, *args, **kwargs) -> list[int]:
        """Convert text to a list of token ids."""
        raise NotImplementedError

    def decode(self, *args, **kwargs) -> str:
        """Convert token ids back to text."""
        raise NotImplementedError

    def __len__(self) -> int:
        """Vocabulary size."""
        raise NotImplementedError
16
+
17
+
18
class HFTokenizerWrapper(BaseTokenizerWrapper):
    """Wraps a HF ``tokenizers.Tokenizer`` and guarantees a leading BOS token."""

    def __init__(
        self,
        tokenizer_path: Path,
    ):
        self.tokenizer = Tokenizer.from_file(str(tokenizer_path / "tokenizer.json"))
        with (tokenizer_path / "tokenizer_config.json").open("r") as f:
            cfg = json.load(f)

        bos_token = self._get_tokens_from_config(cfg.get("bos_token", None))
        if bos_token is None:
            raise ValueError("BOS token must be specified in the tokenizer config.")

        self.bos_token_id = self.tokenizer.token_to_id(bos_token)
        # Probe a dummy encode to learn whether the tokenizer already prepends
        # BOS by itself; only prepend manually when it does not.
        probe_ids = self.tokenizer.encode("test").ids
        self.need_bos = self.bos_token_id not in probe_ids

    def _get_tokens_from_config(
        self,
        token: dict[str, str] | str | None,
    ) -> str | None:
        """Accept both the dict form ({"content": ...}) and the plain string form."""
        return token["content"] if isinstance(token, dict) else token

    def encode(
        self,
        text: str,
    ) -> list[int]:
        """Tokenize ``text``, prepending BOS when the tokenizer does not."""
        ids = self.tokenizer.encode(text).ids
        return [self.bos_token_id, *ids] if self.need_bos else ids

    def decode(
        self,
        token_ids: list[int],
        **kwargs,
    ) -> str:
        """Convert token ids back to text."""
        return self.tokenizer.decode(token_ids, **kwargs)

    def __len__(self) -> int:
        """Vocabulary size."""
        return self.tokenizer.get_vocab_size()