rxnn 0.2.29__tar.gz → 0.2.31__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rxnn-0.2.29 → rxnn-0.2.31}/PKG-INFO +1 -1
- {rxnn-0.2.29 → rxnn-0.2.31}/pyproject.toml +1 -1
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/base.py +4 -5
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/bml.py +7 -12
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/callbacks.py +12 -0
- rxnn-0.2.31/src/rxnn/training/ddp.py +26 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/mrl.py +374 -272
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/rl.py +49 -9
- {rxnn-0.2.29 → rxnn-0.2.31}/LICENSE +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/README.md +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/.DS_Store +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/__init__.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/experimental/__init__.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/experimental/attention.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/experimental/models.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/experimental/moe.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/memory/__init__.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/memory/attention.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/memory/norm.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/memory/stm.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/rxt/__init__.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/rxt/models.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/__init__.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/dataset.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/models.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/reward.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/scheduler.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/tokenizer.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/training/utils.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/__init__.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/attention.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/ff.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/layers.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/mask.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/models.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/moe.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/positional.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/transformers/sampler.py +0 -0
- {rxnn-0.2.29 → rxnn-0.2.31}/src/rxnn/utils.py +0 -0
src/rxnn/training/base.py

@@ -8,6 +8,7 @@ import torch.distributed as dist
 from torch.nn.parallel import DistributedDataParallel
 from typing import Callable
 from .callbacks import TrainerCallback
+from .ddp import get_os_ddp_config, distributed_value_mean


 class BaseTrainer(ABC):
@@ -91,8 +92,7 @@ class BaseTrainer(ABC):
         optimizer = self.optimizer

         if self.use_ddp:
-            rank = int(os.environ['RANK'])
-            world_size = int(os.environ['WORLD_SIZE'])
+            rank, world_size = get_os_ddp_config()
             dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
             self.model = DistributedDataParallel(self.model, device_ids=[self.device.index])
             train_sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
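The DDP branch above now delegates environment parsing to the new get_os_ddp_config() helper. As a rough sketch of how this fits together, assuming a torchrun-style launcher that exports RANK and WORLD_SIZE (the init_ddp function below is illustrative, not part of the package):

# Sketch only: mirrors the setup the trainer performs, assuming RANK and
# WORLD_SIZE are exported by the launcher (e.g. torchrun).
import torch
import torch.distributed as dist

from rxnn.training.ddp import get_os_ddp_config

def init_ddp() -> tuple[int, int]:
    rank, world_size = get_os_ddp_config()  # reads os.environ['RANK'] / ['WORLD_SIZE']
    dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
    torch.cuda.set_device(rank % torch.cuda.device_count())
    return rank, world_size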
@@ -218,10 +218,9 @@ class BaseTrainer(ABC):
         if self.validation_dataset:
             self.validation_steps = 0
             val_loss, val_metrics = self.validate(batch_size)
-            val_loss_tensor = torch.tensor(val_loss).to(self.device)
             if self.use_ddp:
-
-
+                val_loss = distributed_value_mean(val_loss, device=self.device)
+
             self.validation_metrics[epoch] = val_metrics

             if self.writer:
src/rxnn/training/bml.py

@@ -7,6 +7,7 @@ import torch.distributed as dist
 from ..transformers.models import ReactiveTransformerDecoder
 from ..training.base import BaseTrainer
 from .models import MLMTrainingModel, JointTrainingModel
+from .ddp import distributed_mean

 class MLMTrainer(BaseTrainer):
     def __init__(
@@ -96,8 +97,7 @@ class MLMTrainer(BaseTrainer):
         acc = (correct / total * 100) if total > 0 else torch.tensor(0.0).to(self.device)
         node_acc = acc.item()
         if self.use_ddp:
-
-            acc = acc / dist.get_world_size()
+            acc = distributed_mean(acc)

         metrics = {
             'accuracy': acc.item(),
@@ -198,8 +198,7 @@ class AutoregressiveTrainer(BaseTrainer):
         acc = (correct / total * 100) if total > 0 else torch.tensor(0.0).to(self.device)
         node_acc = acc.item()
         if self.use_ddp:
-
-            acc = acc / dist.get_world_size()
+            acc = distributed_mean(acc)

         metrics = {
             'accuracy': acc.item(),
@@ -347,14 +346,10 @@ class JointLMTrainer(BaseTrainer):
         node_mlm_acc = mlm_acc.item()
         node_alm_acc = alm_acc.item()
         if self.use_ddp:
-
-
-
-
-            avg_dec_loss = avg_dec_loss / dist.get_world_size()
-            avg_enc_loss = avg_enc_loss / dist.get_world_size()
-            mlm_acc = mlm_acc / dist.get_world_size()
-            alm_acc = alm_acc / dist.get_world_size()
+            avg_dec_loss = distributed_mean(avg_dec_loss)
+            avg_enc_loss = distributed_mean(avg_enc_loss)
+            mlm_acc = distributed_mean(mlm_acc)
+            alm_acc = distributed_mean(alm_acc)

         metrics = {
             'accuracy': {
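The MLM, autoregressive, and joint trainers all shared the same DDP metric-averaging pattern; each occurrence is now a single distributed_mean call, which (per ddp.py below) sum-reduces a clone of the tensor and divides by the world size. A minimal sketch of the pattern, assuming the process group is initialized (average_accuracy is illustrative, not library code):

# Sketch only: averaging a per-rank metric tensor across all DDP ranks.
import torch
import torch.distributed as dist

from rxnn.training.ddp import distributed_mean

def average_accuracy(correct: torch.Tensor, total: int) -> float:
    acc = (correct / total * 100) if total > 0 else torch.tensor(0.0, device=correct.device)
    if dist.is_initialized():
        acc = distributed_mean(acc)  # all_reduce(SUM) on a clone, then divide by world_size
    return acc.item()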
src/rxnn/training/callbacks.py

@@ -536,6 +536,9 @@ class MrlTrainerCallback:
     def on_reward(self, actor: nn.Module, reward: float, generated: str, reference: str, saved_data: str, eval_mode: bool) -> None:
         pass

+    def on_update_epoch_start(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int) -> None:
+        pass
+
     def on_batch_updated(self, actor: nn.Module, epoch: int, step: int, policy_loss: float) -> None:
         pass

@@ -543,6 +546,9 @@ class MrlTrainerCallback:
                          critic_loss: float) -> None:
         pass

+    def on_update_epoch_end(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int, policy_loss: float, critic_loss: float) -> None:
+        pass
+
     def on_training_end(self, actor: nn.Module, critic: nn.Module, curriculum_config: dict) -> None:
         pass

@@ -572,6 +578,9 @@ class MrlPrintCallback(MrlTrainerCallback):
                   reference: dict[str, torch.Tensor], saved_data: dict[str, torch.Tensor], eval_mode: bool) -> None:
         print(f"{'Eval' if eval_mode else 'Train'} | Collected reward {reward}")

+    def on_update_epoch_start(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int) -> None:
+        print(f'Epoch {global_epoch} | Starting update epoch {update_epoch}')
+
     def on_batch_updated(self, actor: nn.Module, epoch: int, step: int, policy_loss: float) -> None:
         print(f'Epoch {epoch} | Step {step} - updated policy loss {policy_loss}')

@@ -579,6 +588,9 @@ class MrlPrintCallback(MrlTrainerCallback):
                            critic_loss: float) -> None:
         print(f'Epoch {epoch} | Step {step} - updated critic loss {critic_loss}')

+    def on_update_epoch_end(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int, policy_loss: float, critic_loss: float) -> None:
+        print(f'Epoch {global_epoch} | Update epoch {update_epoch} - mean policy loss {policy_loss} | mean critic loss {critic_loss}')
+
     def on_training_end(self, actor: nn.Module, critic: nn.Module, curriculum_config: dict) -> None:
         print(f'Finished training for {curriculum_config["steps"]} steps in {curriculum_config["strategy"]} strategy.')

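The new on_update_epoch_start and on_update_epoch_end hooks bracket each update epoch within a global training epoch, with the end hook receiving the mean policy and critic losses for that update epoch, as the print callback above shows. A hedged sketch of a custom callback using them; the class name and history list are illustrative, not part of the package:

# Illustrative custom callback: collects the mean policy/critic losses
# reported at the end of each update epoch.
import torch.nn as nn

from rxnn.training.callbacks import MrlTrainerCallback

class UpdateEpochHistoryCallback(MrlTrainerCallback):
    def __init__(self):
        self.history = []

    def on_update_epoch_start(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int) -> None:
        # Called when an update epoch begins inside global epoch `global_epoch`
        print(f'Starting update epoch {update_epoch} of epoch {global_epoch}')

    def on_update_epoch_end(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int,
                            policy_loss: float, critic_loss: float) -> None:
        # Store the mean losses reported for this update epoch
        self.history.append((global_epoch, update_epoch, policy_loss, critic_loss))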
src/rxnn/training/ddp.py (new file)

@@ -0,0 +1,26 @@
+import torch
+import torch.distributed as dist
+import os
+from ..utils import set_random_seed
+
+def get_os_ddp_config():
+    rank = int(os.environ['RANK'])
+    world_size = int(os.environ['WORLD_SIZE'])
+    return rank, world_size
+
+def distributed_mean(x: torch.Tensor) -> torch.Tensor:
+    """Average tensor across all devices"""
+    x = x.clone()
+    dist.all_reduce(x, op=dist.ReduceOp.SUM)
+    x /= dist.get_world_size()
+    return x
+
+def distributed_value_mean(value: float, device: torch.device = None) -> float:
+    """Average float value across all devices"""
+    tensor = torch.tensor(value, device=device)
+    reduced = distributed_mean(tensor)
+    return reduced.item()
+
+def set_distributed_random_seed(seed: int):
+    rank = dist.get_rank() if dist.is_initialized() else get_os_ddp_config()[0]
+    set_random_seed(seed + rank)