rxnn 0.2.33__tar.gz → 0.2.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rxnn-0.2.33 → rxnn-0.2.35}/PKG-INFO +1 -1
- {rxnn-0.2.33 → rxnn-0.2.35}/pyproject.toml +1 -1
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/models.py +15 -3
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/mrl.py +16 -4
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/rl.py +12 -10
- {rxnn-0.2.33 → rxnn-0.2.35}/LICENSE +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/README.md +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/.DS_Store +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/__init__.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/experimental/__init__.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/experimental/attention.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/experimental/models.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/experimental/moe.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/memory/__init__.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/memory/attention.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/memory/norm.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/memory/stm.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/rxt/__init__.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/rxt/models.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/__init__.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/base.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/bml.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/callbacks.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/dataset.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/ddp.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/reward.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/scheduler.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/tokenizer.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/utils.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/__init__.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/attention.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/ff.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/layers.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/mask.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/models.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/moe.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/positional.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/transformers/sampler.py +0 -0
- {rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/utils.py +0 -0
{rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/models.py +15 -3

@@ -6,6 +6,7 @@ from huggingface_hub import PyTorchModelHubMixin
 from ..transformers.models import ReactiveTransformerEncoder, ReactiveTransformerDecoder
 from ..transformers.ff import GatedLinearUnit, get_activation_layer
 
+
 class MLMHead(nn.Module, PyTorchModelHubMixin, license="apache-2.0"):
     def __init__(self, embed_dim: int, vocab_size: int, *args, **kwargs):
         super(MLMHead, self).__init__(*args, **kwargs)
@@ -38,6 +39,7 @@ class MLMTrainingModel(nn.Module):
         y = self.mlm_head(h)
         return y
 
+
 class JointTrainingModel(nn.Module):
     def __init__(
             self,
@@ -59,10 +61,12 @@ class JointTrainingModel(nn.Module):
         y_d = self.decoder(x_d, attention_mask=attention_mask)
         return y_e, y_d
 
+
 class MrlActorAction(Enum):
     DECODE = 1
     UPDATE = 2
 
+
 class MrlActorModel(nn.Module):
     def __init__(
             self,
@@ -154,15 +158,18 @@ class MrlActorModel(nn.Module):
             list(self.memory_attention.parameters())
         ))
 
-    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None, action: MrlActorAction = MrlActorAction.DECODE) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None,
+                action: MrlActorAction = MrlActorAction.DECODE) -> torch.Tensor:
         if action == MrlActorAction.DECODE:
             return self.decoder(x, attention_mask=attention_mask)
         else:
             _, ed = self.encoder(x, attention_mask=attention_mask)
             return self.memory_attention(ed, attention_mask=attention_mask)
 
+
 class MrlCriticModel(nn.Module, PyTorchModelHubMixin, license="apache-2.0", pipeline_tag="text-classification"):
-    def __init__(self, encoder: nn.Module, embed_dim: int, out_activation: Literal['sigmoid', 'tanh', 'linear'] = 'sigmoid', output_scale: float = 1.0, **kwargs):
+    def __init__(self, encoder: nn.Module, embed_dim: int,
+                 out_activation: Literal['sigmoid', 'tanh', 'linear'] = 'sigmoid', output_scale: float = 1.0, **kwargs):
         super(MrlCriticModel, self).__init__(**kwargs)
         self.encoder = encoder
         self.value_head = nn.Sequential(
@@ -173,6 +180,12 @@ class MrlCriticModel(nn.Module, PyTorchModelHubMixin, license="apache-2.0", pipe
         )
         self.output_scale = output_scale
 
+    def head_parameters(self) -> Iterator[nn.Parameter]:
+        return self.value_head.parameters()
+
+    def encoder_parameters(self) -> Iterator[nn.Parameter]:
+        return self.encoder.parameters()
+
     def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
         x, _ = self.encoder(x, attention_mask=attention_mask)
 
@@ -183,4 +196,3 @@ class MrlCriticModel(nn.Module, PyTorchModelHubMixin, license="apache-2.0", pipe
         x = x.mean(dim=1)
 
         return self.value_head(x) * self.output_scale
-
{rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/mrl.py +16 -4

@@ -15,11 +15,13 @@ from .reward import MrlRewardMode, MrlRewardModel
 from .models import MrlActorAction, MrlActorModel, MrlCriticModel
 from .ddp import get_os_ddp_config, distributed_mean
 
+
 class MrlConfig(TypedDict):
     lr: float
     separate_memory_lr: Optional[bool]
     memory_lr: Optional[float]
     critic_lr: float
+    critic_encoder_lr: float
     max_seq_len: int
     critic_max_len: int
     weight_decay: float
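Note: critic_encoder_lr is a new key in MrlConfig; as the trainer hunks below show, it falls back to critic_lr when it is not provided. An illustrative fragment (only the LR-related keys, with made-up values; the other required fields are omitted):

```python
# Illustrative MrlConfig fragment - not a complete config.
config = {
    'lr': 1e-4,                 # actor learning rate
    'critic_lr': 1e-4,          # critic value-head learning rate
    'critic_encoder_lr': 5e-5,  # new key: learning rate for the critic's encoder parameters
}

# Mirrors the trainer's fallback below: if the key is missing, critic_lr is used instead.
critic_encoder_lr = config.get('critic_encoder_lr', config.get('critic_lr', 1e-4))
```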
@@ -58,6 +60,7 @@ class CurriculumConfig(TypedDict):
     lr: Optional[float]
     memory_lr: Optional[float]
     critic_lr: Optional[float]
+    critic_encoder_lr: Optional[float]
     weight_decay: Optional[float]
     critic_weight_decay: Optional[float]
     update_epochs: Optional[int]
@@ -158,6 +161,7 @@ class MRLTrainer:
                 'critic_lr': config.get('critic_lr', 1e-4),
                 'weight_decay': config.get('weight_decay', 0.01),
                 'critic_weight_decay': config.get('critic_weight_decay', 0.01),
+                'critic_encoder_lr': config.get('critic_encoder_lr', config.get('critic_lr', 1e-4)),
             }
         else:
             self.base_optim_config = {
@@ -165,6 +169,7 @@ class MRLTrainer:
                 'critic_lr': config.get('critic_lr', 1e-4),
                 'weight_decay': config.get('weight_decay', 0.01),
                 'critic_weight_decay': config.get('critic_weight_decay', 0.01),
+                'critic_encoder_lr': config.get('critic_encoder_lr', config.get('critic_lr', 1e-4)),
             }
 
         self.optim_config = self.base_optim_config
@@ -202,6 +207,7 @@ class MRLTrainer:
             critic_lr: float,
             weight_decay: float,
             critic_weight_decay: float,
+            critic_encoder_lr: float,
             memory_lr: Optional[float] = None,
     ) -> tuple[torch.optim.Optimizer, torch.optim.Optimizer]:
         if memory_lr is not None:
@@ -219,8 +225,10 @@ class MRLTrainer:
         )
 
         critic_optimizer = torch.optim.AdamW(
-            self.critic.parameters(),
-            lr=critic_lr,
+            [
+                {'params': self.critic.head_parameters(), 'lr': critic_lr},
+                {'params': self.critic.encoder_parameters(), 'lr': critic_encoder_lr},
+            ],
             weight_decay=critic_weight_decay,
         )
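Note: the change above relies on torch.optim.AdamW accepting a list of parameter groups, each with its own learning rate, while weight_decay is passed once as a shared default. A minimal, self-contained sketch of that pattern (the modules here are stand-ins, not rxnn classes):

```python
import torch
from torch import nn

# Stand-in critic: a small "encoder" plus a value head (illustrative modules only).
encoder = nn.Linear(16, 16)
value_head = nn.Sequential(nn.Linear(16, 1))

# One optimizer, two parameter groups with different learning rates;
# weight_decay is a global default applied to both groups.
optimizer = torch.optim.AdamW(
    [
        {'params': value_head.parameters(), 'lr': 1e-4},
        {'params': encoder.parameters(), 'lr': 5e-5},
    ],
    weight_decay=0.01,
)

# Each group keeps its own lr and is stepped independently.
print([group['lr'] for group in optimizer.param_groups])  # [0.0001, 5e-05]
```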
@@ -633,7 +641,8 @@ class MRLTrainer:
                 for i, t in enumerate(episode['steps'])
             ]
             values = torch.stack([
-                self._critic_values_with_memory(r, *self._move_multiple_batches(*t['state'])) for t, r in flat_trajectories
+                self._critic_values_with_memory(r, *self._move_multiple_batches(*t['state'])) for t, r in
+                flat_trajectories
             ]).to(self.device)
             rewards = torch.stack([torch.tensor(t['reward']) for t, _ in flat_trajectories]).to(self.device)
             dones = torch.stack([torch.tensor(t['done']) for t, _ in flat_trajectories]).to(self.device)
@@ -646,7 +655,8 @@ class MRLTrainer:
            dones = torch.stack([torch.tensor(t['done']) for t in flat_trajectories]).to(self.device)
            return values, rewards, dones
 
-    def _critic_values_with_memory(self, reset_stm: bool, *moved_state: tuple[TokenizedDict, TokenizedDict, TokenizedDict]) -> torch.Tensor:
+    def _critic_values_with_memory(self, reset_stm: bool,
+                                   *moved_state: tuple[TokenizedDict, TokenizedDict, TokenizedDict]) -> torch.Tensor:
         # 1. Calculate critic values in memory aware version - reset/update STM before calculating values
         with torch.no_grad():
             # 2. Reset STM if it was reset in trajectory collection
@@ -933,6 +943,7 @@ class MRLTrainer:
                 'weight_decay': config.get('weight_decay', self.base_optim_config['weight_decay']),
                 'critic_weight_decay': config.get('critic_weight_decay',
                                                   self.base_optim_config['critic_weight_decay']),
+                'critic_encoder_lr': config.get('critic_encoder_lr', self.base_optim_config['critic_encoder_lr']),
                 'memory_lr': config.get('memory_lr', self.base_optim_config['memory_lr']),
             }
         else:
@@ -942,6 +953,7 @@ class MRLTrainer:
                 'weight_decay': config.get('weight_decay', self.base_optim_config['weight_decay']),
                 'critic_weight_decay': config.get('critic_weight_decay',
                                                   self.base_optim_config['critic_weight_decay']),
+                'critic_encoder_lr': config.get('critic_encoder_lr', self.base_optim_config['critic_encoder_lr']),
             }
             self.optimizer, self.critic_optimizer = self._init_optimizers(**self.optim_config)
         elif self.optim_config != self.base_optim_config:
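Note: per-stage curriculum configs can now override critic_encoder_lr as well; any key a stage omits falls back to base_optim_config through config.get, exactly as in the hunks above. A small illustrative example of that fallback (values are made up):

```python
# Illustrative per-stage override: only the keys a stage sets are changed,
# everything else falls back to the base optimizer config.
base_optim_config = {'critic_lr': 1e-4, 'critic_encoder_lr': 1e-4, 'critic_weight_decay': 0.01}
stage_config = {'critic_encoder_lr': 1e-5}  # example override for one curriculum stage

critic_encoder_lr = stage_config.get('critic_encoder_lr', base_optim_config['critic_encoder_lr'])
print(critic_encoder_lr)  # 1e-05
```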
{rxnn-0.2.33 → rxnn-0.2.35}/src/rxnn/training/rl.py +12 -10

@@ -10,7 +10,7 @@ from .ddp import distributed_mean
 class RlAlgorithm(ABC):
     def __init__(self):
         super(RlAlgorithm, self).__init__()
-        self.critic_loss = nn.MSELoss()
+        self.critic_loss_fn = nn.MSELoss()
 
     @abstractmethod
     def policy_loss(self, query: TokenizedDict, answer: TokenizedDict, logits: torch.Tensor,
@@ -22,7 +22,7 @@ class RlAlgorithm(ABC):
         pass
 
     def critic_loss(self, values: torch.Tensor, ref_values: torch.Tensor) -> torch.Tensor:
-        return self.critic_loss(values, ref_values)
+        return self.critic_loss_fn(values, ref_values)
 
 
 class PPOConfig(TypedDict):
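Note: the rename to critic_loss_fn matters because, in a plain Python class, an attribute assigned in __init__ shadows a method of the same name on instance lookup, so an attribute named critic_loss would hide both the base critic_loss method and any subclass override (such as PPOAlgorithm's clipped version below). A minimal illustration of the pitfall, not taken from rxnn:

```python
from torch import nn

class Shadowed:
    def __init__(self):
        # Instance attribute with the same name as the method below.
        self.critic_loss = nn.MSELoss()

    def critic_loss(self, values, ref_values):
        # Never reached via self.critic_loss(...): the instance attribute wins the lookup.
        return self.critic_loss(values, ref_values)

obj = Shadowed()
print(type(obj.critic_loss))  # <class 'torch.nn.modules.loss.MSELoss'>, not a bound method
```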
@@ -55,7 +55,7 @@ class PPOAlgorithm(RlAlgorithm):
         # Critic loss with clipped values
         if self.clip_critic_values:
             values = torch.clamp(values, -self.critic_value_clip, self.critic_value_clip)
-        return self.critic_loss(values, ref_values)
+        return self.critic_loss_fn(values, ref_values)
 
     def policy_loss(self, query: TokenizedDict, answer: TokenizedDict, logits: torch.Tensor,
                     old_log_probs: torch.Tensor, advantages: torch.Tensor) -> torch.Tensor:
@@ -114,15 +114,17 @@ class PPOAlgorithm(RlAlgorithm):
         next_value = last_value
         next_done = torch.zeros(batch_size, device=dones.device)  # Last state is terminal
         dones = dones.float()
-        for t in reversed(range(trajectory_len)):
-            # Check if next state is terminal
-            non_terminal = 1.0 - next_done
 
-
-
-
+        for t in reversed(range(trajectory_len)):
+            if t == trajectory_len - 1:
+                # For the last step, use the provided last_value
+                delta = rewards[t] + self.gae_gamma * next_value * (1 - next_done) - values[t]
+            else:
+                # For other steps, use the next value in the trajectory
+                delta = rewards[t] + self.gae_gamma * values[t + 1] * (1 - dones[t + 1]) - values[t]
+
+            advantages[t] = delta + self.gae_gamma * self.gae_lambda * (1 - dones[t]) * last_advantage
             last_advantage = advantages[t]
-            next_value = values[t]
             next_done = dones[t]
 
         returns = advantages + values
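Note: the rewritten loop is a standard backward pass for Generalized Advantage Estimation (GAE): the final step bootstraps from the externally supplied last_value, earlier steps bootstrap from the stored value of the following step, and both are masked by the corresponding done flag. A self-contained numeric sketch of the same recursion (toy tensors and made-up gamma/lambda, not the rxnn trainer):

```python
import torch

gamma, lam = 0.99, 0.95
rewards = torch.tensor([1.0, 0.0, 2.0])   # per-step rewards for one toy trajectory
values = torch.tensor([0.5, 0.4, 0.3])    # critic estimates V(s_t)
dones = torch.tensor([0.0, 0.0, 0.0])     # no early termination in this example
last_value = torch.tensor(0.2)            # bootstrap value for the state after the last step

T = len(rewards)
advantages = torch.zeros(T)
last_advantage = torch.tensor(0.0)
next_value, next_done = last_value, torch.tensor(0.0)

for t in reversed(range(T)):
    if t == T - 1:
        # Last step: bootstrap from the provided last_value.
        delta = rewards[t] + gamma * next_value * (1 - next_done) - values[t]
    else:
        # Earlier steps: bootstrap from the stored value of the next step.
        delta = rewards[t] + gamma * values[t + 1] * (1 - dones[t + 1]) - values[t]
    advantages[t] = delta + gamma * lam * (1 - dones[t]) * last_advantage
    last_advantage = advantages[t]
    next_done = dones[t]

returns = advantages + values  # GAE returns, used as critic targets
print(advantages, returns)
```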