sarasa 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sarasa-0.0.2 → sarasa-0.0.4}/PKG-INFO +10 -4
- {sarasa-0.0.2 → sarasa-0.0.4}/README.md +9 -3
- {sarasa-0.0.2 → sarasa-0.0.4}/configs/example.py +5 -2
- {sarasa-0.0.2 → sarasa-0.0.4}/configs/llama3-1b.py +4 -3
- sarasa-0.0.4/sarasa/__init__.py +11 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/checkpoint.py +10 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/config.py +21 -4
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/__init__.py +2 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/attention.py +6 -2
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/llama3.py +18 -7
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/nanochat_gpt.py +10 -1
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/utils.py +0 -9
- sarasa-0.0.2/sarasa/trainer.py → sarasa-0.0.4/sarasa/train.py +65 -39
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/utils.py +8 -4
- {sarasa-0.0.2 → sarasa-0.0.4}/tests/test_config.py +7 -0
- sarasa-0.0.2/sarasa/__init__.py +0 -2
- {sarasa-0.0.2 → sarasa-0.0.4}/.github/workflows/pypi.yaml +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/.github/workflows/tests_and_lint.yaml +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/.gitignore +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/LICENSE +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/main.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/pyproject.toml +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/activation_checkpoint.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/data/__init__.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/data/hf_datasets.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/data/tokenizer.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/metrics.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/optimizers/__init__.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/sarasa/optimizers/utils.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/tests/test_model.py +0 -0
- {sarasa-0.0.2 → sarasa-0.0.4}/tests/test_utils.py +0 -0
{sarasa-0.0.2 → sarasa-0.0.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sarasa
-Version: 0.0.2
+Version: 0.0.4
 Summary: Add your description here
 License-File: LICENSE
 Requires-Python: >=3.13

@@ -46,6 +46,8 @@ uv add sarasa[cpu|cu128|cu130]
 - Async distributed checkpoint saving
 
 - [ ] Checkpoint loading
+- [ ] FP8 training
+- [ ] Profiling
 
 ## Usage
 

@@ -100,18 +102,22 @@ if __name__ == "__main__":
     trainer.train()
 ```
 
+Thanks to [tyro](https://github.com/brentyi/tyro)'s type support, Sarasa can automatically recognize multiple custom optimizer types.
 From the command line, you can specify which custom optimizer to use:
 
 ```bash
 python script.py optim:custom_optim --optim.lr 0.001 ...
 ```
 
+(As tyro automatically converts config class names from CamelCase to snake_case, config class names are recommended not to include `Config` suffixes.)
+
 ### Config File Example
 
-It's very simple.
+It's very simple.
+IDE autocompletion will help you.
 
 ```python
-from sarasa
+from sarasa import Config, Data, LRScheduler, Model, Train, LRScheduler
 from custom_optim import CustomOptim
 
 # only one Config instance should be defined in each config file

@@ -135,4 +141,4 @@ config = Config.create(
 
 ## Acknowledgements
 
-This project is heavily inspired by and borrows code from
+This project is heavily inspired by and borrows code from [torchtitan](https://github.com/pytorch/torchtitan).
{sarasa-0.0.2 → sarasa-0.0.4}/README.md

@@ -23,6 +23,8 @@ uv add sarasa[cpu|cu128|cu130]
 - Async distributed checkpoint saving
 
 - [ ] Checkpoint loading
+- [ ] FP8 training
+- [ ] Profiling
 
 ## Usage
 

@@ -77,18 +79,22 @@ if __name__ == "__main__":
     trainer.train()
 ```
 
+Thanks to [tyro](https://github.com/brentyi/tyro)'s type support, Sarasa can automatically recognize multiple custom optimizer types.
 From the command line, you can specify which custom optimizer to use:
 
 ```bash
 python script.py optim:custom_optim --optim.lr 0.001 ...
 ```
 
+(As tyro automatically converts config class names from CamelCase to snake_case, config class names are recommended not to include `Config` suffixes.)
+
 ### Config File Example
 
-It's very simple.
+It's very simple.
+IDE autocompletion will help you.
 
 ```python
-from sarasa
+from sarasa import Config, Data, LRScheduler, Model, Train, LRScheduler
 from custom_optim import CustomOptim
 
 # only one Config instance should be defined in each config file

@@ -112,4 +118,4 @@ config = Config.create(
 
 ## Acknowledgements
 
-This project is heavily inspired by and borrows code from
+This project is heavily inspired by and borrows code from [torchtitan](https://github.com/pytorch/torchtitan).
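For illustration, a minimal sketch of what a `custom_optim` module could look like so that tyro exposes it as the `optim:custom_optim` subcommand used above. Only the CLI pattern and the `custom_optim` / `CustomOptim` names come from the README; the fields below are hypothetical.

```python
# custom_optim.py -- hypothetical example module, not part of sarasa
import dataclasses


@dataclasses.dataclass
class CustomOptim:
    """Selectable on the CLI as `optim:custom_optim` (tyro converts CamelCase to snake_case)."""

    lr: float = 0.001           # overridable via --optim.lr
    weight_decay: float = 0.0
    betas: tuple[float, float] = (0.9, 0.95)
```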
{sarasa-0.0.2 → sarasa-0.0.4}/configs/example.py

@@ -1,11 +1,14 @@
 from sarasa.config import AdamW, Config, Data, LRScheduler, Model, Train
 
 config = Config.create(
-    model=Model(
+    model=Model(
+        name="nanochat_gpt",
+        num_layers=12,
+        qk_norm=True,
+    ),
     train=Train(
         local_batch_size=16,
         global_batch_size=256,
-        dtype="bfloat16",
     ),
     data=Data(tokenizer_path="./tokenizer"),
     lr_scheduler=LRScheduler(
{sarasa-0.0.2 → sarasa-0.0.4}/configs/llama3-1b.py

@@ -2,17 +2,18 @@ from sarasa.config import FSDP, AdamW, Config, Data, LRScheduler, Model, Train
 
 config = Config.create(
     model=Model(
+        name="llama3",
         hidden_dim=2048,
         num_layers=16,
         num_heads=32,
         num_kv_heads=8,
         head_dim=64,
-
+        rms_eps=1e-5,
+        rms_learnable=True,
     ),
     train=Train(
         local_batch_size=32,
-        global_batch_size=
-        dtype="bfloat16",
+        global_batch_size=1024,
         use_sac=True,
     ),
     data=Data(tokenizer_path="./tokenizer"),
sarasa-0.0.4/sarasa/__init__.py (new file)

@@ -0,0 +1,11 @@
+from .config import DDP as DDP
+from .config import FSDP as FSDP
+from .config import AdamW as AdamW
+from .config import Checkpoint as Checkpoint
+from .config import Config as Config
+from .config import Data as Data
+from .config import LRScheduler as LRScheduler
+from .config import Metrics as Metrics
+from .config import Model as Model
+from .config import Train as Train
+from .train import Trainer as Trainer
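With this new top-level `__init__.py`, config files can import directly from the package root instead of `sarasa.config`. A minimal sketch assembled from the re-exports above and the updated `configs/example.py`; whether every section shown in `configs/example.py` is strictly required is not confirmed by this diff.

```python
from sarasa import Config, Data, Model, Train

config = Config.create(
    model=Model(name="nanochat_gpt", num_layers=12, qk_norm=True),
    train=Train(local_batch_size=16, global_batch_size=256),
    data=Data(tokenizer_path="./tokenizer"),
)
```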
{sarasa-0.0.2 → sarasa-0.0.4}/sarasa/checkpoint.py

@@ -110,3 +110,13 @@ class Checkpointer:
     def close(self) -> None:
         if self.stager is not None:
             self.stager.close()
+
+        if self.save_future is not None:
+            self.save_future.result()
+
+        if self.pg is not None:
+            dist.destroy_process_group(self.pg)
+            self.pg = None
+
+    def __del__(self):
+        self.close()
{sarasa-0.0.2 → sarasa-0.0.4}/sarasa/config.py

@@ -91,6 +91,10 @@ class Train:
     grad_clip: float | None = None
 
     dtype: Literal["bfloat16", "float32"] = "float32"
+    """Dtype used for model initialization"""
+
+    amp_dtype: Literal["bfloat16", "float16", "float32"] = "bfloat16"
+    """Dtype used for automatic mixed precision training"""
 
     compile: bool = False
 

@@ -154,6 +158,12 @@ class FSDP(Distributed):
     reshard_after_forward: bool = False
     """Whether to reshard model parameters after each forward pass (FSDP only)."""
 
+    dtype: str | None = None
+    """Dtype for FSDP reduce operations. If None, uses train.dtype."""
+
+    amp_dtype: str | None = None
+    """Dtype for FSDP parameter storage. If None, uses train.amp_dtype."""
+
 
 @dataclasses.dataclass
 class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:

@@ -183,11 +193,15 @@ class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:
         if self.output_dir is not None:
             self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        if hasattr(self.model, "seq_len")
-            if self.data.seq_len is not None:
+        if hasattr(self.model, "seq_len"):
+            if self.model.seq_len is None and self.data.seq_len is not None:
                 self.model.seq_len = self.data.seq_len
-
-            raise ValueError("
+            if self.model.seq_len is None:
+                raise ValueError("seq_len must be specified in either model or data configuration.")
+
+        if isinstance(self.distributed, FSDP):
+            self.distributed.dtype = self.distributed.dtype or self.train.dtype
+            self.distributed.amp_dtype = self.distributed.amp_dtype or self.train.amp_dtype
 
     @classmethod
     def create(

@@ -227,6 +241,8 @@ class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:
 
         import tyro
 
+        from sarasa.utils import rank
+
         loaded_config = None
 
         if (under := ("--config_file" in sys.argv)) or ("--config-file" in sys.argv):

@@ -262,6 +278,7 @@ class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:
                 data_type,
             ],
            default=loaded_config,
+            console_outputs=(rank() == 0),
         )
 
 
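The dtype fallback added in `__post_init__` can be exercised exactly as the new test at the end of this diff does. A short sketch; the default values in the comments come from the `Train` fields above.

```python
import sys

from sarasa import Config

# Selecting the FSDP subcommand without setting its dtypes: both fields
# inherit from the Train section in Config.__post_init__.
sys.argv = ["program", "distributed:fsdp"]
config = Config.from_cli()
assert config.distributed.dtype == config.train.dtype          # "float32" by default
assert config.distributed.amp_dtype == config.train.amp_dtype  # "bfloat16" by default
```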
{sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/__init__.py

@@ -21,6 +21,8 @@ class ModelConfig:
     vocab_size: int | None = None  # set later based on tokenizer
     seq_len: int | None = None  # set later based on data config
     qk_norm: bool = False  # whether to use RMSNorm on q/k
+    rms_eps: float | None = None  # epsilon for RMSNorm, default to library default if None
+    rms_learnable: bool = False  # whether RMSNorm has learnable scale parameter
 
     def __post_init__(self):
         # infer hidden_dim, num_heads, num_kv_heads if not provided using the rules presented in nanochat
{sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/attention.py

@@ -3,7 +3,7 @@ from torch import nn
 from torch.nn import functional as F
 
 from sarasa.models import ModelConfig
-from sarasa.models.utils import
+from sarasa.models.utils import RoPE
 
 
 class SDPAttention(nn.Module):

@@ -57,7 +57,11 @@ class CausalSelfAttention(nn.Module):
         self.c_k = nn.Linear(self.hidden_dim, self.num_kv_heads * self.head_dim, bias=False)
         self.c_v = nn.Linear(self.hidden_dim, self.num_kv_heads * self.head_dim, bias=False)
         self.c_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)
-        self.qk_norm =
+        self.qk_norm = (
+            nn.RMSNorm(self.head_dim, eps=config.rms_eps, elementwise_affine=config.rms_learnable)
+            if config.qk_norm
+            else nn.Identity()
+        )
 
         # todo: support varlen etc and kv caching
         self.attn = SDPAttention(is_causal=True, enable_gqa=self.num_heads != self.num_kv_heads)
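For context, a per-head RMSNorm like the one constructed above normalizes q and k over `head_dim` before attention. A generic standalone sketch; the actual `CausalSelfAttention.forward` is not shown in this diff.

```python
import torch
from torch import nn

B, T, H, Dh = 2, 8, 4, 64                                    # batch, seq, heads, head_dim
q = torch.randn(B, T, H, Dh)
qk_norm = nn.RMSNorm(Dh, eps=1e-5, elementwise_affine=True)  # mirrors the module built above
q_normed = qk_norm(q)                                        # RMS-normalizes the last (head_dim) axis per head
```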
{sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/llama3.py

@@ -4,7 +4,7 @@ from torch.nn import functional as F
 
 from sarasa.models import BaseModel, ModelConfig
 from sarasa.models.attention import CausalSelfAttention
-from sarasa.models.utils import
+from sarasa.models.utils import RoPE
 
 
 class MLP(nn.Module):

@@ -41,15 +41,16 @@ class Block(nn.Module):
         self.layer_idx = layer_idx
         self.attention = CausalSelfAttention(config)
         self.mlp = MLP(config, multiple_of, ffn_dim_multiplier)
-        self.
+        self.attn_norm = nn.RMSNorm(config.hidden_dim, eps=config.rms_eps)
+        self.mlp_norm = nn.RMSNorm(config.hidden_dim, eps=config.rms_eps)
 
     def forward(
         self,
         x: torch.Tensor,
         cos_sin: tuple[torch.Tensor, torch.Tensor],
     ) -> torch.Tensor:
-        x = x + self.attention(self.
-        x = x + self.mlp(self.
+        x = x + self.attention(self.attn_norm(x), cos_sin)
+        x = x + self.mlp(self.mlp_norm(x))
         return x
 

@@ -71,7 +72,7 @@ class Llama3(BaseModel):
         self.blocks = nn.ModuleList([
             Block(config, layer_idx, multiple_of, ffn_dim_multiplier) for layer_idx in range(config.num_layers)
         ])
-        self.norm = RMSNorm(config.hidden_dim)
+        self.norm = nn.RMSNorm(config.hidden_dim, eps=config.rms_eps)
         self.output = nn.Linear(config.hidden_dim, config.vocab_size, bias=False)
 
     @torch.no_grad()

@@ -101,16 +102,26 @@ class Llama3(BaseModel):
                 b=cutoff_factor * final_out_std,
             )
 
+        for mod in self.modules():
+            if isinstance(mod, nn.RMSNorm):
+                mod.reset_parameters()
+
     def param_groups(self) -> dict[str, list[nn.Parameter]]:
-        matrix_params =
+        matrix_params = [param for param in self.blocks.parameters() if param.ndim == 2]
         embedding_params = list(self.token_emb.parameters())
         lm_head_params = list(self.output.parameters())
-
+        rms_norm_params = [
+            mod.weight for mod in self.modules() if isinstance(mod, nn.RMSNorm) and mod.elementwise_affine
+        ]
+        assert len(list(self.parameters())) == (
+            len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(rms_norm_params)
+        )
 
         return {
             "matrix": matrix_params,
             "embedding": embedding_params,
             "lm_head": lm_head_params,
+            "rms_norm": rms_norm_params,
         }
 
     def forward(
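One hypothetical way an optimizer setup could consume the new `"rms_norm"` group returned by `param_groups()` above. The group names come from the diff; the learning rates and the choice to disable weight decay on norm scales are illustrative only.

```python
import torch


def build_optimizer(model, lr=3e-4, norm_lr=1e-3):
    # group names match Llama3.param_groups() in the diff above
    groups = model.param_groups()
    return torch.optim.AdamW([
        {"params": groups["matrix"], "lr": lr},
        {"params": groups["embedding"], "lr": lr},
        {"params": groups["lm_head"], "lr": lr},
        {"params": groups["rms_norm"], "lr": norm_lr, "weight_decay": 0.0},
    ])
```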
{sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/nanochat_gpt.py

@@ -8,7 +8,16 @@ from torch.nn import functional as F
 
 from sarasa.models import BaseModel, ModelConfig
 from sarasa.models.attention import CausalSelfAttention
-from sarasa.models.utils import
+from sarasa.models.utils import RoPE
+
+
+class RMSNorm(torch.nn.RMSNorm):
+    # RMSNorm without affine parameters
+    def __init__(
+        self,
+        normalized_shape: int,
+    ):
+        super().__init__(normalized_shape, eps=None, elementwise_affine=False)
 
 
 class MLP(nn.Module):
{sarasa-0.0.2 → sarasa-0.0.4}/sarasa/models/utils.py

@@ -1,15 +1,6 @@
 import torch
 
 
-class RMSNorm(torch.nn.RMSNorm):
-    # RMSNorm without affine parameters
-    def __init__(
-        self,
-        normalized_shape: int,
-    ):
-        super().__init__(normalized_shape, eps=None, elementwise_affine=False)
-
-
 class RoPE:
     @staticmethod
     def precompute(
sarasa-0.0.2/sarasa/trainer.py → sarasa-0.0.4/sarasa/train.py

@@ -7,6 +7,7 @@ import torch
 import torch.distributed as dist
 from loguru import logger
 from torch.distributed.elastic.multiprocessing.errors import record
+from torch.nn import functional as F
 
 from sarasa.activation_checkpoint import apply_op_sac
 from sarasa.checkpoint import Checkpointer

@@ -48,9 +49,6 @@ class Trainer:
         vocab_size = len(self.tokenizer)
         self.config.model.vocab_size = vocab_size
 
-        # todo: support other loss functions
-        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX, reduction="sum")
-
         # setup model, optimizer, lr scheduler
         with torch.device("meta"), set_dtype(getattr(torch, config.train.dtype)):
             self.model = self.config.model.create()

@@ -68,9 +66,9 @@ class Trainer:
         if config.train.compile:
             logger.info("Compiling the model")
             for block in self.model.blocks:
-                block.compile(fullgraph=True)
+                block.compile(fullgraph=True, dynamic=False)
             self.model.compile(dynamic=False)
-            self.loss_fn.compile()
+            self.loss_fn = torch.compile(self.loss_fn, fullgraph=True, dynamic=False)
 
         if world_size() > 1:
             apply_distributed(

@@ -102,7 +100,10 @@ class Trainer:
 
         self.amp_context = contextlib.nullcontext()
         if config.distributed.name != "fsdp":
-            self.amp_context = torch.autocast(
+            self.amp_context = torch.autocast(
+                device_type=self.device.type,
+                dtype=getattr(torch, config.train.amp_dtype),
+            )
 
         # todo: setup profiler context
         self.profile_context = contextlib.nullcontext()

@@ -116,40 +117,36 @@ class Trainer:
                 f"Failed to activate FA4 flash attention: {e}. Install sarasa with `flash_attn` extra for better performance."
             )
 
-    def __del__(self) -> None:
-        # cleanup distributed
-        if world_size() > 1:
-            try:
-                dist.destroy_process_group()
-            except Exception as e:
-                logger.warning(f"Failed to destroy process group: {e}")
-
     @record
     def train(self):
-        self.
-        self.checkpointer
-        self.
+        try:
+            logger.info("Starting training...")
+
+            self.model.train()
+            with self.profile_context:
+                data_iter = self.batch_generator(self.data_loader)
+                for _ in range(self.config.train.steps):
+                    self.step += 1
+                    self.gc.collect(self.step)
+                    try:
+                        self.train_step(data_iter)
+                    except StopIteration:
+                        logger.warning("Data loader exhausted during training.")
+                        break
+
+                    if self.checkpointer is not None:
+                        self.checkpointer.save(self.step)
+
+                    if self.config.train.val_freq > 0 and self.step % self.config.train.val_freq == 0:
+                        self.evaluate()
+
+                    if world_size() > 1 and self.step == 1:
+                        update_timeout(self.config.distributed.train_timeout_seconds, self.device)
+
+            logger.info("Training completed.")
+        finally:
+            logger.info("Cleaning up trainer...")
+            self.close()
 
     def batch_generator(
         self,

@@ -190,7 +187,7 @@ class Trainer:
 
         with self.amp_context:
             pred = self.model(**input_dict)
-            loss = self.loss_fn(pred
+            loss = self.loss_fn(pred, target) / valid_tokens
 
         del pred
         loss.backward()

@@ -234,6 +231,18 @@ class Trainer:
             },
         )
 
+    def loss_fn(
+        self,
+        pred: torch.Tensor,
+        target: torch.Tensor,
+    ) -> torch.Tensor:
+        return F.cross_entropy(
+            pred.flatten(0, 1).float(),
+            target.flatten(0, 1),
+            ignore_index=IGNORE_INDEX,
+            reduction="sum",
+        )
+
     def evaluate(self):
         raise NotImplementedError
 

@@ -242,3 +251,20 @@ class Trainer:
         batch_iter: Iterable[tuple[dict[str, torch.Tensor], torch.Tensor]],
     ) -> None:
         raise NotImplementedError
+
+    def close(self) -> None:
+        if self.checkpointer is not None:
+            self.checkpointer.close()
+
+        if self.metrics_processor is not None:
+            self.metrics_processor.close()
+
+        # cleanup distributed
+        if world_size() > 1:
+            try:
+                dist.destroy_process_group()
+            except Exception as e:
+                logger.warning(f"Failed to destroy process group: {e}")
+
+    def __del__(self):
+        self.close()
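A small standalone sketch of the loss convention the new `loss_fn` uses: cross-entropy is summed over non-ignored tokens and then divided by `valid_tokens`, so the result is a mean over valid tokens that is unaffected by padding or by how a global batch is split into micro-batches. IGNORE_INDEX = -100 is assumed here, matching PyTorch's default ignore value; the diff only references the name.

```python
import torch
from torch.nn import functional as F

IGNORE_INDEX = -100  # assumed value; sarasa only exposes the name in this diff

pred = torch.randn(2, 5, 11)            # (batch, seq, vocab)
target = torch.randint(0, 11, (2, 5))
target[:, -1] = IGNORE_INDEX            # e.g. padded / shifted-out positions

valid_tokens = (target != IGNORE_INDEX).sum()
loss_sum = F.cross_entropy(
    pred.flatten(0, 1).float(),
    target.flatten(0, 1),
    ignore_index=IGNORE_INDEX,
    reduction="sum",
)
loss = loss_sum / valid_tokens          # mean over valid tokens only
```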
{sarasa-0.0.2 → sarasa-0.0.4}/sarasa/utils.py

@@ -3,6 +3,7 @@ import gc
 import os
 import sys
 import time
+import typing
 from datetime import timedelta
 from functools import cache
 

@@ -11,8 +12,11 @@ from loguru import logger
 from torch import distributed as dist
 from torch import nn
 
+if typing.TYPE_CHECKING:
+    from sarasa.config import Config, Distributed
 
-
+
+def setup_logger(config: Config) -> None:
     logger.remove()
     if config.debug:
         logger_format = f"<blue>RANK={rank()}</blue> | " + (

@@ -128,7 +132,7 @@ def update_timeout(
 
 
 def apply_distributed(
-    config,
+    config: Distributed,
     model: nn.Module,
     device: torch.device,
     compile: bool,

@@ -149,8 +153,8 @@ def apply_distributed(
 
     # todo: make dtypes configurable
     mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.
-        reduce_dtype=torch.
+        param_dtype=getattr(torch, config.amp_dtype),
+        reduce_dtype=getattr(torch, config.dtype),
    )
 
     for block in model.blocks:
{sarasa-0.0.2 → sarasa-0.0.4}/tests/test_config.py

@@ -73,3 +73,10 @@ def test_config_loading_filetype_error(monkeypatch, tmp_path):
     monkeypatch.setattr(sys, "argv", ["program", "--config_file", str(config_file)])
     with pytest.raises(ValueError):
         Config.from_cli()
+
+
+def test_config_post_init(monkeypatch):
+    monkeypatch.setattr(sys, "argv", ["program", "distributed:fsdp"])
+    config = Config.from_cli()  # just check no error is raised
+    assert config.distributed.dtype == config.train.dtype
+    assert config.distributed.amp_dtype == config.train.amp_dtype
sarasa-0.0.2/sarasa/__init__.py: DELETED

The remaining files listed above are unchanged between 0.0.2 and 0.0.4.