sarasa 0.0.2__tar.gz → 0.0.3__tar.gz

This diff compares the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (30)
  1. {sarasa-0.0.2 → sarasa-0.0.3}/PKG-INFO +1 -1
  2. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/config.py +21 -4
  3. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/trainer.py +39 -25
  4. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/utils.py +2 -2
  5. {sarasa-0.0.2 → sarasa-0.0.3}/tests/test_config.py +7 -0
  6. {sarasa-0.0.2 → sarasa-0.0.3}/.github/workflows/pypi.yaml +0 -0
  7. {sarasa-0.0.2 → sarasa-0.0.3}/.github/workflows/tests_and_lint.yaml +0 -0
  8. {sarasa-0.0.2 → sarasa-0.0.3}/.gitignore +0 -0
  9. {sarasa-0.0.2 → sarasa-0.0.3}/LICENSE +0 -0
  10. {sarasa-0.0.2 → sarasa-0.0.3}/README.md +0 -0
  11. {sarasa-0.0.2 → sarasa-0.0.3}/configs/example.py +0 -0
  12. {sarasa-0.0.2 → sarasa-0.0.3}/configs/llama3-1b.py +0 -0
  13. {sarasa-0.0.2 → sarasa-0.0.3}/main.py +0 -0
  14. {sarasa-0.0.2 → sarasa-0.0.3}/pyproject.toml +0 -0
  15. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/__init__.py +0 -0
  16. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/activation_checkpoint.py +0 -0
  17. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/checkpoint.py +0 -0
  18. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/data/__init__.py +0 -0
  19. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/data/hf_datasets.py +0 -0
  20. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/data/tokenizer.py +0 -0
  21. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/metrics.py +0 -0
  22. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/models/__init__.py +0 -0
  23. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/models/attention.py +0 -0
  24. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/models/llama3.py +0 -0
  25. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/models/nanochat_gpt.py +0 -0
  26. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/models/utils.py +0 -0
  27. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/optimizers/__init__.py +0 -0
  28. {sarasa-0.0.2 → sarasa-0.0.3}/sarasa/optimizers/utils.py +0 -0
  29. {sarasa-0.0.2 → sarasa-0.0.3}/tests/test_model.py +0 -0
  30. {sarasa-0.0.2 → sarasa-0.0.3}/tests/test_utils.py +0 -0
{sarasa-0.0.2 → sarasa-0.0.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sarasa
-Version: 0.0.2
+Version: 0.0.3
 Summary: Add your description here
 License-File: LICENSE
 Requires-Python: >=3.13
{sarasa-0.0.2 → sarasa-0.0.3}/sarasa/config.py

@@ -91,6 +91,10 @@ class Train:
     grad_clip: float | None = None
 
     dtype: Literal["bfloat16", "float32"] = "float32"
+    """Dtype used for model initialization"""
+
+    amp_dtype: Literal["bfloat16", "float16", "float32"] = "bfloat16"
+    """Dtype used for automatic mixed precision training"""
 
     compile: bool = False
 
@@ -154,6 +158,12 @@ class FSDP(Distributed):
     reshard_after_forward: bool = False
     """Whether to reshard model parameters after each forward pass (FSDP only)."""
 
+    dtype: str | None = None
+    """Dtype for FSDP reduce operations. If None, uses train.dtype."""
+
+    amp_dtype: str | None = None
+    """Dtype for FSDP parameter storage. If None, uses train.amp_dtype."""
+
 
 @dataclasses.dataclass
 class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:
@@ -183,11 +193,15 @@ class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:
         if self.output_dir is not None:
             self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        if hasattr(self.model, "seq_len") and self.model.seq_len is None:
-            if self.data.seq_len is not None:
+        if hasattr(self.model, "seq_len"):
+            if self.model.seq_len is None and self.data.seq_len is not None:
                 self.model.seq_len = self.data.seq_len
-            else:
-                raise ValueError("Either model.seq_len or data.seq_len must be set.")
+            if self.model.seq_len is None:
+                raise ValueError("seq_len must be specified in either model or data configuration.")
+
+        if isinstance(self.distributed, FSDP):
+            self.distributed.dtype = self.distributed.dtype or self.train.dtype
+            self.distributed.amp_dtype = self.distributed.amp_dtype or self.train.amp_dtype
 
     @classmethod
     def create(
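The new fallback in __post_init__ means dtypes only need to be set in one place: when distributed.dtype or distributed.amp_dtype is left unset for an FSDP run, it inherits train.dtype or train.amp_dtype. A minimal standalone sketch of that resolution (field names mirror the hunks above; these are not the package's real classes):

import dataclasses


@dataclasses.dataclass
class Train:
    dtype: str = "float32"        # dtype used for model initialization
    amp_dtype: str = "bfloat16"   # dtype used for automatic mixed precision


@dataclasses.dataclass
class FSDP:
    dtype: str | None = None      # reduce dtype; falls back to Train.dtype
    amp_dtype: str | None = None  # param dtype; falls back to Train.amp_dtype


train, fsdp = Train(), FSDP()
# the same "value or fallback" resolution performed in Config.__post_init__
fsdp.dtype = fsdp.dtype or train.dtype              # -> "float32"
fsdp.amp_dtype = fsdp.amp_dtype or train.amp_dtype  # -> "bfloat16"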
@@ -227,6 +241,8 @@ class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:
 
         import tyro
 
+        from sarasa.utils import rank
+
         loaded_config = None
 
         if (under := ("--config_file" in sys.argv)) or ("--config-file" in sys.argv):
@@ -262,6 +278,7 @@ class Config[ModelT, OptimizerT, LRSchedulerT, DataT]:
                 data_type,
             ],
             default=loaded_config,
+            console_outputs=(rank() == 0),
         )
 
 
{sarasa-0.0.2 → sarasa-0.0.3}/sarasa/trainer.py

@@ -102,7 +102,10 @@ class Trainer:
 
         self.amp_context = contextlib.nullcontext()
         if config.distributed.name != "fsdp":
-            self.amp_context = torch.autocast(device_type=self.device.type, dtype=getattr(torch, config.train.dtype))
+            self.amp_context = torch.autocast(
+                device_type=self.device.type,
+                dtype=getattr(torch, config.train.amp_dtype),
+            )
 
         # todo: setup profiler context
         self.profile_context = contextlib.nullcontext()
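The non-FSDP autocast context now takes its dtype from train.amp_dtype rather than train.dtype, so initialization precision and mixed-precision compute can differ. A hedged sketch of how such a context is typically used around a forward/backward pass — generic PyTorch, not sarasa's actual train_step:

import torch

amp_dtype = "bfloat16"  # stands in for config.train.amp_dtype
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
amp_context = torch.autocast(device_type=device.type, dtype=getattr(torch, amp_dtype))

model = torch.nn.Linear(8, 8).to(device)
x = torch.randn(4, 8, device=device)
with amp_context:
    # matmul-heavy ops run in amp_dtype; the loss is upcast before the reduction
    loss = model(x).float().pow(2).mean()
loss.backward()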
@@ -126,30 +129,34 @@ class Trainer:
 
     @record
     def train(self):
-        logger.info("Starting training...")
-
-        self.model.train()
-        with self.profile_context:
-            data_iter = self.batch_generator(self.data_loader)
-            for _ in range(self.config.train.steps):
-                self.step += 1
-                self.gc.collect(self.step)
-                try:
-                    self.train_step(data_iter)
-                except StopIteration:
-                    logger.warning("Data loader exhausted during training.")
-                    break
-
-                if self.checkpointer is not None:
-                    self.checkpointer.save(self.step)
-
-                if self.config.train.val_freq > 0 and self.step % self.config.train.val_freq == 0:
-                    self.evaluate()
-
-                if world_size() > 1 and self.step == 1:
-                    update_timeout(self.config.distributed.train_timeout_seconds, self.device)
-
-        logger.info("Training completed.")
+        try:
+            logger.info("Starting training...")
+
+            self.model.train()
+            with self.profile_context:
+                data_iter = self.batch_generator(self.data_loader)
+                for _ in range(self.config.train.steps):
+                    self.step += 1
+                    self.gc.collect(self.step)
+                    try:
+                        self.train_step(data_iter)
+                    except StopIteration:
+                        logger.warning("Data loader exhausted during training.")
+                        break
+
+                    if self.checkpointer is not None:
+                        self.checkpointer.save(self.step)
+
+                    if self.config.train.val_freq > 0 and self.step % self.config.train.val_freq == 0:
+                        self.evaluate()
+
+                    if world_size() > 1 and self.step == 1:
+                        update_timeout(self.config.distributed.train_timeout_seconds, self.device)
+
+            logger.info("Training completed.")
+        finally:
+            logger.info("Cleaning up trainer...")
+            self.close()
 
     def batch_generator(
         self,
@@ -242,3 +249,10 @@ class Trainer:
         batch_iter: Iterable[tuple[dict[str, torch.Tensor], torch.Tensor]],
     ) -> None:
         raise NotImplementedError
+
+    def close(self) -> None:
+        if self.checkpointer is not None:
+            self.checkpointer.close()
+
+        if self.metrics_processor is not None:
+            self.metrics_processor.close()
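Together with the try/finally added to train(), the new close() method guarantees the checkpointer and metrics processor are shut down even when a training step raises. The guarantee is ordinary try/finally semantics; a trivial standalone illustration (not sarasa code):

def run(steps: int) -> None:
    try:
        for step in range(steps):
            if step == 2:
                raise RuntimeError("boom")
    finally:
        print("cleaning up")  # runs on normal completion and on failure


try:
    run(5)
except RuntimeError:
    pass  # "cleaning up" was printed before the exception propagated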
{sarasa-0.0.2 → sarasa-0.0.3}/sarasa/utils.py

@@ -149,8 +149,8 @@ def apply_distributed(
 
     # todo: make dtypes configurable
     mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32,
+        param_dtype=getattr(torch, config.amp_dtype),
+        reduce_dtype=getattr(torch, config.dtype),
     )
 
     for block in model.blocks:
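The previously hard-coded FSDP mixed-precision policy is now driven by the distributed config: amp_dtype sets the sharded parameter/compute dtype and dtype sets the gradient-reduction dtype. A hedged sketch of constructing the same kind of policy (the import path assumes a recent PyTorch that exposes the FSDP2 fully_shard API; building the policy object itself needs no process group):

import torch
from torch.distributed.fsdp import MixedPrecisionPolicy  # exported in recent PyTorch releases

amp_dtype = "bfloat16"  # stands in for config.amp_dtype (parameter/compute dtype)
dtype = "float32"       # stands in for config.dtype (gradient-reduction dtype)

mp_policy = MixedPrecisionPolicy(
    param_dtype=getattr(torch, amp_dtype),
    reduce_dtype=getattr(torch, dtype),
)
# With a device mesh initialized, each transformer block would then be sharded
# with something like: fully_shard(block, mp_policy=mp_policy)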
{sarasa-0.0.2 → sarasa-0.0.3}/tests/test_config.py

@@ -73,3 +73,10 @@ def test_config_loading_filetype_error(monkeypatch, tmp_path):
     monkeypatch.setattr(sys, "argv", ["program", "--config_file", str(config_file)])
     with pytest.raises(ValueError):
         Config.from_cli()
+
+
+def test_config_post_init(monkeypatch):
+    monkeypatch.setattr(sys, "argv", ["program", "distributed:fsdp"])
+    config = Config.from_cli()  # just check no error is raised
+    assert config.distributed.dtype == config.train.dtype
+    assert config.distributed.amp_dtype == config.train.amp_dtype