sarasa 0.0.3__tar.gz → 0.0.4__tar.gz

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (31)
  1. {sarasa-0.0.3 → sarasa-0.0.4}/PKG-INFO +10 -4
  2. {sarasa-0.0.3 → sarasa-0.0.4}/README.md +9 -3
  3. {sarasa-0.0.3 → sarasa-0.0.4}/configs/example.py +5 -2
  4. {sarasa-0.0.3 → sarasa-0.0.4}/configs/llama3-1b.py +4 -3
  5. sarasa-0.0.4/sarasa/__init__.py +11 -0
  6. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/checkpoint.py +10 -0
  7. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/__init__.py +2 -0
  8. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/attention.py +6 -2
  9. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/llama3.py +18 -7
  10. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/nanochat_gpt.py +10 -1
  11. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/utils.py +0 -9
  12. sarasa-0.0.3/sarasa/trainer.py → sarasa-0.0.4/sarasa/train.py +26 -14
  13. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/utils.py +6 -2
  14. sarasa-0.0.3/sarasa/__init__.py +0 -2
  15. {sarasa-0.0.3 → sarasa-0.0.4}/.github/workflows/pypi.yaml +0 -0
  16. {sarasa-0.0.3 → sarasa-0.0.4}/.github/workflows/tests_and_lint.yaml +0 -0
  17. {sarasa-0.0.3 → sarasa-0.0.4}/.gitignore +0 -0
  18. {sarasa-0.0.3 → sarasa-0.0.4}/LICENSE +0 -0
  19. {sarasa-0.0.3 → sarasa-0.0.4}/main.py +0 -0
  20. {sarasa-0.0.3 → sarasa-0.0.4}/pyproject.toml +0 -0
  21. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/activation_checkpoint.py +0 -0
  22. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/config.py +0 -0
  23. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/data/__init__.py +0 -0
  24. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/data/hf_datasets.py +0 -0
  25. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/data/tokenizer.py +0 -0
  26. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/metrics.py +0 -0
  27. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/optimizers/__init__.py +0 -0
  28. {sarasa-0.0.3 → sarasa-0.0.4}/sarasa/optimizers/utils.py +0 -0
  29. {sarasa-0.0.3 → sarasa-0.0.4}/tests/test_config.py +0 -0
  30. {sarasa-0.0.3 → sarasa-0.0.4}/tests/test_model.py +0 -0
  31. {sarasa-0.0.3 → sarasa-0.0.4}/tests/test_utils.py +0 -0
{sarasa-0.0.3 → sarasa-0.0.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sarasa
- Version: 0.0.3
+ Version: 0.0.4
  Summary: Add your description here
  License-File: LICENSE
  Requires-Python: >=3.13
@@ -46,6 +46,8 @@ uv add sarasa[cpu|cu128|cu130]
  - Async distributed checkpoint saving

  - [ ] Checkpoint loading
+ - [ ] FP8 training
+ - [ ] Profiling

  ## Usage

@@ -100,18 +102,22 @@ if __name__ == "__main__":
      trainer.train()
  ```

+ Thanks to [tyro](https://github.com/brentyi/tyro)'s type support, Sarasa can automatically recognize multiple custom optimizer types.
  From the command line, you can specify which custom optimizer to use:

  ```bash
  python script.py optim:custom_optim --optim.lr 0.001 ...
  ```

+ (As tyro automatically converts config class names from CamelCase to snake_case, config class names are recommended not to include `Config` suffixes.)
+
  ### Config File Example

- It's very simple. IDE autocompletion will help you.
+ It's very simple.
+ IDE autocompletion will help you.

  ```python
- from sarasa.config import Config, Data, LRScheduler, Model, Train, LRScheduler
+ from sarasa import Config, Data, LRScheduler, Model, Train, LRScheduler
  from custom_optim import CustomOptim

  # only one Config instance should be defined in each config file
@@ -135,4 +141,4 @@ config = Config.create(

  ## Acknowledgements

- This project is heavily inspired by and borrows code from `torchtitan`.
+ This project is heavily inspired by and borrows code from [torchtitan](https://github.com/pytorch/torchtitan).
{sarasa-0.0.3 → sarasa-0.0.4}/README.md
@@ -23,6 +23,8 @@ uv add sarasa[cpu|cu128|cu130]
  - Async distributed checkpoint saving

  - [ ] Checkpoint loading
+ - [ ] FP8 training
+ - [ ] Profiling

  ## Usage

@@ -77,18 +79,22 @@ if __name__ == "__main__":
      trainer.train()
  ```

+ Thanks to [tyro](https://github.com/brentyi/tyro)'s type support, Sarasa can automatically recognize multiple custom optimizer types.
  From the command line, you can specify which custom optimizer to use:

  ```bash
  python script.py optim:custom_optim --optim.lr 0.001 ...
  ```

+ (As tyro automatically converts config class names from CamelCase to snake_case, config class names are recommended not to include `Config` suffixes.)
+
  ### Config File Example

- It's very simple. IDE autocompletion will help you.
+ It's very simple.
+ IDE autocompletion will help you.

  ```python
- from sarasa.config import Config, Data, LRScheduler, Model, Train, LRScheduler
+ from sarasa import Config, Data, LRScheduler, Model, Train, LRScheduler
  from custom_optim import CustomOptim

  # only one Config instance should be defined in each config file
@@ -112,4 +118,4 @@ config = Config.create(

  ## Acknowledgements

- This project is heavily inspired by and borrows code from `torchtitan`.
+ This project is heavily inspired by and borrows code from [torchtitan](https://github.com/pytorch/torchtitan).
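The tyro-based optimizer selection described in the README above boils down to a union of config dataclasses that tyro exposes as subcommands. Below is a minimal, hypothetical sketch of that underlying pattern, not Sarasa's actual `Config.create` machinery; the `AdamW`, `CustomOptim`, and `Args` classes here are illustrative, and the exact subcommand spelling depends on tyro's name-conversion settings.

```python
# Hypothetical sketch of the tyro union-as-subcommand pattern the README relies on.
import dataclasses

import tyro


@dataclasses.dataclass
class AdamW:
    lr: float = 3e-4
    weight_decay: float = 0.1


@dataclasses.dataclass
class CustomOptim:  # stand-in for a user-defined optimizer config
    lr: float = 1e-3
    momentum: float = 0.9


@dataclasses.dataclass
class Args:
    # A union over config dataclasses becomes one subcommand per member type,
    # selected on the command line as `optim:<name derived from the class name>`.
    optim: AdamW | CustomOptim = dataclasses.field(default_factory=AdamW)


if __name__ == "__main__":
    args = tyro.cli(Args)  # e.g. `python script.py optim:custom-optim --optim.lr 0.001`
    print(args.optim)
```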
{sarasa-0.0.3 → sarasa-0.0.4}/configs/example.py
@@ -1,11 +1,14 @@
  from sarasa.config import AdamW, Config, Data, LRScheduler, Model, Train

  config = Config.create(
-     model=Model(num_layers=12),
+     model=Model(
+         name="nanochat_gpt",
+         num_layers=12,
+         qk_norm=True,
+     ),
      train=Train(
          local_batch_size=16,
          global_batch_size=256,
-         dtype="bfloat16",
      ),
      data=Data(tokenizer_path="./tokenizer"),
      lr_scheduler=LRScheduler(
{sarasa-0.0.3 → sarasa-0.0.4}/configs/llama3-1b.py
@@ -2,17 +2,18 @@ from sarasa.config import FSDP, AdamW, Config, Data, LRScheduler, Model, Train

  config = Config.create(
      model=Model(
+         name="llama3",
          hidden_dim=2048,
          num_layers=16,
          num_heads=32,
          num_kv_heads=8,
          head_dim=64,
-         name="llama3",
+         rms_eps=1e-5,
+         rms_learnable=True,
      ),
      train=Train(
          local_batch_size=32,
-         global_batch_size=256,
-         dtype="bfloat16",
+         global_batch_size=1024,
          use_sac=True,
      ),
      data=Data(tokenizer_path="./tokenizer"),
sarasa-0.0.4/sarasa/__init__.py
@@ -0,0 +1,11 @@
+ from .config import DDP as DDP
+ from .config import FSDP as FSDP
+ from .config import AdamW as AdamW
+ from .config import Checkpoint as Checkpoint
+ from .config import Config as Config
+ from .config import Data as Data
+ from .config import LRScheduler as LRScheduler
+ from .config import Metrics as Metrics
+ from .config import Model as Model
+ from .config import Train as Train
+ from .train import Trainer as Trainer
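The `from .config import X as X` spelling above is the explicit re-export idiom: strict type checkers treat the redundant alias as an intentional public re-export rather than an unused import. It is what enables the flatter imports shown in the updated README, e.g.:

```python
# With the new package-level __init__.py, configs can import everything from the root:
from sarasa import Config, Data, LRScheduler, Model, Train, Trainer
```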
{sarasa-0.0.3 → sarasa-0.0.4}/sarasa/checkpoint.py
@@ -110,3 +110,13 @@ class Checkpointer:
      def close(self) -> None:
          if self.stager is not None:
              self.stager.close()
+
+         if self.save_future is not None:
+             self.save_future.result()
+
+         if self.pg is not None:
+             dist.destroy_process_group(self.pg)
+             self.pg = None
+
+     def __del__(self):
+         self.close()
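The `close()`/`__del__` pair added above follows a common teardown pattern: drain the pending async save, release the checkpoint process group, null out the handle so a second call is a no-op, and let `__del__` fall back to `close()` if the caller never called it. A generic sketch of that pattern (hypothetical class, not Sarasa's `Checkpointer`):

```python
from concurrent.futures import Future


class AsyncResource:
    """Generic sketch: idempotent close() plus a __del__ safety net."""

    def __init__(self) -> None:
        self.pending: Future | None = None  # e.g. an in-flight async checkpoint save
        self.handle: object | None = None   # e.g. a checkpoint process group

    def close(self) -> None:
        if self.pending is not None:
            self.pending.result()  # wait for outstanding work before teardown
            self.pending = None
        if self.handle is not None:
            # release the handle (dist.destroy_process_group(self.pg) in the real code)
            self.handle = None

    def __del__(self) -> None:
        # Best-effort cleanup if close() was never called explicitly.
        self.close()
```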
{sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/__init__.py
@@ -21,6 +21,8 @@ class ModelConfig:
      vocab_size: int | None = None  # set later based on tokenizer
      seq_len: int | None = None  # set later based on data config
      qk_norm: bool = False  # whether to use RMSNorm on q/k
+     rms_eps: float | None = None  # epsilon for RMSNorm, default to library default if None
+     rms_learnable: bool = False  # whether RMSNorm has learnable scale parameter

      def __post_init__(self):
          # infer hidden_dim, num_heads, num_kv_heads if not provided using the rules presented in nanochat
{sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/attention.py
@@ -3,7 +3,7 @@ from torch import nn
  from torch.nn import functional as F

  from sarasa.models import ModelConfig
- from sarasa.models.utils import RMSNorm, RoPE
+ from sarasa.models.utils import RoPE


  class SDPAttention(nn.Module):
@@ -57,7 +57,11 @@ class CausalSelfAttention(nn.Module):
          self.c_k = nn.Linear(self.hidden_dim, self.num_kv_heads * self.head_dim, bias=False)
          self.c_v = nn.Linear(self.hidden_dim, self.num_kv_heads * self.head_dim, bias=False)
          self.c_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)
-         self.qk_norm = RMSNorm(self.head_dim) if config.qk_norm else nn.Identity()
+         self.qk_norm = (
+             nn.RMSNorm(self.head_dim, eps=config.rms_eps, elementwise_affine=config.rms_learnable)
+             if config.qk_norm
+             else nn.Identity()
+         )

          # todo: support varlen etc and kv caching
          self.attn = SDPAttention(is_causal=True, enable_gqa=self.num_heads != self.num_kv_heads)
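For reference, the two new config knobs map directly onto `torch.nn.RMSNorm` arguments (available in PyTorch 2.4+): `rms_eps` feeds `eps` (with `None` meaning the library default), and `rms_learnable` feeds `elementwise_affine`. A small standalone sketch of the difference:

```python
import torch
from torch import nn

head_dim = 64

# rms_learnable=False -> no learnable scale, so the norm contributes no parameters
norm_fixed = nn.RMSNorm(head_dim, eps=1e-5, elementwise_affine=False)
# rms_learnable=True -> a per-channel weight initialized to ones
norm_learnable = nn.RMSNorm(head_dim, eps=1e-5, elementwise_affine=True)

x = torch.randn(2, 8, head_dim)
print(norm_fixed(x).shape)                                   # torch.Size([2, 8, 64])
print(sum(p.numel() for p in norm_fixed.parameters()))       # 0
print(sum(p.numel() for p in norm_learnable.parameters()))   # 64
```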
{sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/llama3.py
@@ -4,7 +4,7 @@ from torch.nn import functional as F

  from sarasa.models import BaseModel, ModelConfig
  from sarasa.models.attention import CausalSelfAttention
- from sarasa.models.utils import RMSNorm, RoPE
+ from sarasa.models.utils import RoPE


  class MLP(nn.Module):
@@ -41,15 +41,16 @@ class Block(nn.Module):
          self.layer_idx = layer_idx
          self.attention = CausalSelfAttention(config)
          self.mlp = MLP(config, multiple_of, ffn_dim_multiplier)
-         self.norm = RMSNorm(config.hidden_dim)
+         self.attn_norm = nn.RMSNorm(config.hidden_dim, eps=config.rms_eps)
+         self.mlp_norm = nn.RMSNorm(config.hidden_dim, eps=config.rms_eps)

      def forward(
          self,
          x: torch.Tensor,
          cos_sin: tuple[torch.Tensor, torch.Tensor],
      ) -> torch.Tensor:
-         x = x + self.attention(self.norm(x), cos_sin)
-         x = x + self.mlp(self.norm(x))
+         x = x + self.attention(self.attn_norm(x), cos_sin)
+         x = x + self.mlp(self.mlp_norm(x))
          return x


@@ -71,7 +72,7 @@ class Llama3(BaseModel):
          self.blocks = nn.ModuleList([
              Block(config, layer_idx, multiple_of, ffn_dim_multiplier) for layer_idx in range(config.num_layers)
          ])
-         self.norm = RMSNorm(config.hidden_dim)
+         self.norm = nn.RMSNorm(config.hidden_dim, eps=config.rms_eps)
          self.output = nn.Linear(config.hidden_dim, config.vocab_size, bias=False)

      @torch.no_grad()
@@ -101,16 +102,26 @@ class Llama3(BaseModel):
              b=cutoff_factor * final_out_std,
          )

+         for mod in self.modules():
+             if isinstance(mod, nn.RMSNorm):
+                 mod.reset_parameters()
+
      def param_groups(self) -> dict[str, list[nn.Parameter]]:
-         matrix_params = list(self.blocks.parameters())
+         matrix_params = [param for param in self.blocks.parameters() if param.ndim == 2]
          embedding_params = list(self.token_emb.parameters())
          lm_head_params = list(self.output.parameters())
-         assert len(list(self.parameters())) == (len(matrix_params) + len(embedding_params) + len(lm_head_params))
+         rms_norm_params = [
+             mod.weight for mod in self.modules() if isinstance(mod, nn.RMSNorm) and mod.elementwise_affine
+         ]
+         assert len(list(self.parameters())) == (
+             len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(rms_norm_params)
+         )

          return {
              "matrix": matrix_params,
              "embedding": embedding_params,
              "lm_head": lm_head_params,
+             "rms_norm": rms_norm_params,
          }

      def forward(
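`param_groups()` now returns four named groups, with only 2-D block parameters in `"matrix"` and the learnable RMSNorm scales split into their own `"rms_norm"` group. One way such a dict could be consumed is per-group optimizer hyperparameters; the sketch below is hypothetical and not Sarasa's optimizer wiring (`build_optimizer` and the `lrs` mapping are illustrative):

```python
import torch


def build_optimizer(model, lrs: dict[str, float], weight_decay: float = 0.1):
    # Expects lrs to have one entry per group name: "matrix", "embedding", "lm_head", "rms_norm".
    groups = model.param_groups()
    return torch.optim.AdamW(
        [
            {
                "params": params,
                "lr": lrs[name],
                # norm scales usually train without weight decay
                "weight_decay": 0.0 if name == "rms_norm" else weight_decay,
            }
            for name, params in groups.items()
        ]
    )
```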
{sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/nanochat_gpt.py
@@ -8,7 +8,16 @@ from torch.nn import functional as F

  from sarasa.models import BaseModel, ModelConfig
  from sarasa.models.attention import CausalSelfAttention
- from sarasa.models.utils import RMSNorm, RoPE
+ from sarasa.models.utils import RoPE
+
+
+ class RMSNorm(torch.nn.RMSNorm):
+     # RMSNorm without affine parameters
+     def __init__(
+         self,
+         normalized_shape: int,
+     ):
+         super().__init__(normalized_shape, eps=None, elementwise_affine=False)


  class MLP(nn.Module):
{sarasa-0.0.3 → sarasa-0.0.4}/sarasa/models/utils.py
@@ -1,15 +1,6 @@
  import torch


- class RMSNorm(torch.nn.RMSNorm):
-     # RMSNorm without affine parameters
-     def __init__(
-         self,
-         normalized_shape: int,
-     ):
-         super().__init__(normalized_shape, eps=None, elementwise_affine=False)
-
-
  class RoPE:
      @staticmethod
      def precompute(
sarasa-0.0.3/sarasa/trainer.py → sarasa-0.0.4/sarasa/train.py
@@ -7,6 +7,7 @@ import torch
  import torch.distributed as dist
  from loguru import logger
  from torch.distributed.elastic.multiprocessing.errors import record
+ from torch.nn import functional as F

  from sarasa.activation_checkpoint import apply_op_sac
  from sarasa.checkpoint import Checkpointer
@@ -48,9 +49,6 @@ class Trainer:
          vocab_size = len(self.tokenizer)
          self.config.model.vocab_size = vocab_size

-         # todo: support other loss functions
-         self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX, reduction="sum")
-
          # setup model, optimizer, lr scheduler
          with torch.device("meta"), set_dtype(getattr(torch, config.train.dtype)):
              self.model = self.config.model.create()
@@ -68,9 +66,9 @@
          if config.train.compile:
              logger.info("Compiling the model")
              for block in self.model.blocks:
-                 block.compile(fullgraph=True)
+                 block.compile(fullgraph=True, dynamic=False)
              self.model.compile(dynamic=False)
-             self.loss_fn.compile()
+             self.loss_fn = torch.compile(self.loss_fn, fullgraph=True, dynamic=False)

          if world_size() > 1:
              apply_distributed(
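The compile hunk above mixes the two `torch.compile` idioms: `nn.Module.compile(...)` compiles a module in place (kwargs are forwarded to `torch.compile`), while `torch.compile(fn)` returns a new compiled callable, which is why `self.loss_fn` is reassigned. A standalone sketch with a toy block and loss (not the Trainer's actual modules):

```python
import torch
from torch import nn
from torch.nn import functional as F

# Per-block ("regional") compilation: Module.compile mutates the module in place.
block = nn.Sequential(nn.Linear(64, 64), nn.GELU())
block.compile(fullgraph=True, dynamic=False)


def loss_fn(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    return F.mse_loss(pred, target)


# torch.compile on a function returns a wrapper, so rebind the name to use it.
loss_fn = torch.compile(loss_fn, fullgraph=True, dynamic=False)
```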
@@ -119,14 +117,6 @@
                  f"Failed to activate FA4 flash attention: {e}. Install sarasa with `flash_attn` extra for better performance."
              )

-     def __del__(self) -> None:
-         # cleanup distributed
-         if world_size() > 1:
-             try:
-                 dist.destroy_process_group()
-             except Exception as e:
-                 logger.warning(f"Failed to destroy process group: {e}")
-
      @record
      def train(self):
          try:
@@ -197,7 +187,7 @@

          with self.amp_context:
              pred = self.model(**input_dict)
-             loss = self.loss_fn(pred.flatten(0, 1), target.flatten(0, 1)) / valid_tokens
+             loss = self.loss_fn(pred, target) / valid_tokens

          del pred
          loss.backward()
@@ -241,6 +231,18 @@
              },
          )

+     def loss_fn(
+         self,
+         pred: torch.Tensor,
+         target: torch.Tensor,
+     ) -> torch.Tensor:
+         return F.cross_entropy(
+             pred.flatten(0, 1).float(),
+             target.flatten(0, 1),
+             ignore_index=IGNORE_INDEX,
+             reduction="sum",
+         )
+
      def evaluate(self):
          raise NotImplementedError

@@ -256,3 +258,13 @@

          if self.metrics_processor is not None:
              self.metrics_processor.close()
+
+         # cleanup distributed
+         if world_size() > 1:
+             try:
+                 dist.destroy_process_group()
+             except Exception as e:
+                 logger.warning(f"Failed to destroy process group: {e}")
+
+     def __del__(self):
+         self.close()
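With the hunks above, the loss moves from a `CrossEntropyLoss` module to a plain `F.cross_entropy` call that sums per-token losses in float32; the train step then divides by `valid_tokens` to get a mean over non-ignored tokens. A self-contained sketch of that computation, assuming the conventional `IGNORE_INDEX` value of -100 (the actual constant lives in Sarasa's source and is not shown in this diff):

```python
import torch
from torch.nn import functional as F

IGNORE_INDEX = -100  # assumption: the usual PyTorch ignore value

pred = torch.randn(2, 5, 32)            # (batch, seq, vocab) logits
target = torch.randint(0, 32, (2, 5))   # (batch, seq) token ids
target[0, :2] = IGNORE_INDEX            # mask a few positions

valid_tokens = (target != IGNORE_INDEX).sum()
loss_sum = F.cross_entropy(
    pred.flatten(0, 1).float(),   # upcast logits for a stable reduction
    target.flatten(0, 1),
    ignore_index=IGNORE_INDEX,
    reduction="sum",
)
loss = loss_sum / valid_tokens    # mean over non-ignored tokens
print(loss)
```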
{sarasa-0.0.3 → sarasa-0.0.4}/sarasa/utils.py
@@ -3,6 +3,7 @@ import gc
  import os
  import sys
  import time
+ import typing
  from datetime import timedelta
  from functools import cache

@@ -11,8 +12,11 @@ from loguru import logger
  from torch import distributed as dist
  from torch import nn

+ if typing.TYPE_CHECKING:
+     from sarasa.config import Config, Distributed

- def setup_logger(config) -> None:
+
+ def setup_logger(config: Config) -> None:
      logger.remove()
      if config.debug:
          logger_format = f"<blue>RANK={rank()}</blue> | " + (
@@ -128,7 +132,7 @@ def update_timeout(


  def apply_distributed(
-     config,
+     config: Distributed,
      model: nn.Module,
      device: torch.device,
      compile: bool,
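The `typing.TYPE_CHECKING` guard used above makes `Config` and `Distributed` available to type checkers without importing them at runtime, which avoids import cycles. A minimal sketch of the pattern; such guards are typically paired with `from __future__ import annotations` (or quoted annotations) so the names are never evaluated when the function is defined:

```python
from __future__ import annotations

import typing

if typing.TYPE_CHECKING:
    # Only imported for static analysis; never executed at runtime.
    from sarasa.config import Config


def setup_logger(config: Config) -> None:
    ...
```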
sarasa-0.0.3/sarasa/__init__.py
@@ -1,2 +0,0 @@
- from .config import Config as Config
- from .trainer import Trainer as Trainer