rxnn 0.2.29__py3-none-any.whl → 0.2.30__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
rxnn/training/callbacks.py CHANGED
@@ -536,6 +536,9 @@ class MrlTrainerCallback:
  def on_reward(self, actor: nn.Module, reward: float, generated: str, reference: str, saved_data: str, eval_mode: bool) -> None:
  pass
 
+ def on_update_epoch_start(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int) -> None:
+ pass
+
  def on_batch_updated(self, actor: nn.Module, epoch: int, step: int, policy_loss: float) -> None:
  pass
 
@@ -543,6 +546,9 @@ class MrlTrainerCallback:
  critic_loss: float) -> None:
  pass
 
+ def on_update_epoch_end(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int, policy_loss: float, critic_loss: float) -> None:
+ pass
+
  def on_training_end(self, actor: nn.Module, critic: nn.Module, curriculum_config: dict) -> None:
  pass
 
@@ -572,6 +578,9 @@ class MrlPrintCallback(MrlTrainerCallback):
  reference: dict[str, torch.Tensor], saved_data: dict[str, torch.Tensor], eval_mode: bool) -> None:
  print(f"{'Eval' if eval_mode else 'Train'} | Collected reward {reward}")
 
+ def on_update_epoch_start(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int) -> None:
+ print(f'Epoch {global_epoch} | Starting update epoch {update_epoch}')
+
  def on_batch_updated(self, actor: nn.Module, epoch: int, step: int, policy_loss: float) -> None:
  print(f'Epoch {epoch} | Step {step} - updated policy loss {policy_loss}')
 
@@ -579,6 +588,9 @@ class MrlPrintCallback(MrlTrainerCallback):
  critic_loss: float) -> None:
  print(f'Epoch {epoch} | Step {step} - updated critic loss {critic_loss}')
 
+ def on_update_epoch_end(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int, policy_loss: float, critic_loss: float) -> None:
+ print(f'Epoch {global_epoch} | Update epoch {update_epoch} - mean policy loss {policy_loss} | mean critic loss {critic_loss}')
+
  def on_training_end(self, actor: nn.Module, critic: nn.Module, curriculum_config: dict) -> None:
  print(f'Finished training for {curriculum_config["steps"]} steps in {curriculum_config["strategy"]} strategy.')
 
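The two hooks added above, on_update_epoch_start and on_update_epoch_end, are invoked by MRLTrainer around each internal PPO update epoch (see the train_epoch changes below). As a rough illustration only, here is a minimal custom callback sketch; the import path is an assumption based on the wheel's file layout, and only the hook signatures come from this diff:

import torch.nn as nn
from rxnn.training.callbacks import MrlTrainerCallback  # assumed import path


class UpdateEpochLossTracker(MrlTrainerCallback):
    # Hypothetical callback that records the mean losses reported for each internal update epoch.
    def __init__(self) -> None:
        super().__init__()
        self.history: list[dict] = []
        self._current: dict = {}

    def on_update_epoch_start(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int) -> None:
        # Called before each internal update epoch of the RL phase.
        self._current = {'epoch': global_epoch, 'update_epoch': update_epoch}

    def on_update_epoch_end(self, actor: nn.Module, critic: nn.Module, global_epoch: int, update_epoch: int,
                            policy_loss: float, critic_loss: float) -> None:
        # Called with the mean policy/critic losses of the finished update epoch.
        self._current.update(policy_loss=policy_loss, critic_loss=critic_loss)
        self.history.append(self._current)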
rxnn/training/mrl.py CHANGED
@@ -24,6 +24,7 @@ class MrlConfig(TypedDict):
  critic_max_len: int
  weight_decay: float
  critic_weight_decay: float
+ update_epochs: int
 
 
  class MrlStrategy(Enum):
@@ -31,9 +32,11 @@ class MrlStrategy(Enum):
  MULTI_STEP_STRATEGY = 2
  LONG_RANGE_STRATEGY = 3
 
+
  UnfreezeItem = Union[int, tuple[int, float]]
  UnfreezeEpochsStrategy: TypeAlias = Union[int, tuple[UnfreezeItem, UnfreezeItem, UnfreezeItem, int]]
 
+
  class CurriculumConfig(TypedDict):
  steps: int
  epochs: int
@@ -52,6 +55,7 @@ class CurriculumConfig(TypedDict):
  critic_lr: Optional[float]
  weight_decay: Optional[float]
  critic_weight_decay: Optional[float]
+ update_epochs: Optional[int]
 
 
  class SamplerConfig(TypedDict):
@@ -66,6 +70,7 @@ class MrlTrajectoryStep(TypedDict):
  log_probs: torch.Tensor
  reward: list[float]
  reference: TokenizedDict
+ done: bool
 
 
  class MrlTrajectoryEpisode(TypedDict):
@@ -107,6 +112,9 @@ class MRLTrainer:
  self.device = device
  self.max_seq_len = config.get('max_seq_len', 256)
  self.critic_max_len = config.get('critic_max_len', 512)
+ # Internal update epochs config
+ self.shared_update_epochs = config.get('update_epochs', 10)
+ self.update_epochs = self.shared_update_epochs
 
  # Move models to device
  if use_amp:
@@ -187,8 +195,8 @@ class MRLTrainer:
  ) -> tuple[torch.optim.Optimizer, torch.optim.Optimizer]:
  if memory_lr is not None:
  optimizer = torch.optim.AdamW([
- { 'params': self.actor.not_memory_parameters(), 'lr': lr },
- { 'params': self.actor.memory_parameters(), 'lr': memory_lr },
+ {'params': self.actor.not_memory_parameters(), 'lr': lr},
+ {'params': self.actor.memory_parameters(), 'lr': memory_lr},
  ],
  weight_decay=weight_decay,
  )
@@ -207,11 +215,9 @@ class MRLTrainer:
 
  return optimizer, critic_optimizer
 
-
  def _init_steps(self):
  return {
  'collect': 0,
- 'critic': 0,
  'rl': 0,
  'eval': 0,
  }
@@ -351,7 +357,7 @@ class MRLTrainer:
  # state from existing one, instead of new random one)
  reset_done = self.reset_stm()
 
- # 4. Reset reward prev data running mean - it's calculated for multi-step retention, we have to reset it before episode
+ # 4. Reset reward prev data running mean - it's calculated for multistep retention, we have to reset it before episode
  self.reward.reset_running_mean()
 
  # 5. Get first batch of interactions (data to save) and follow-up interactions for current episode, based on curriculum step
@@ -406,6 +412,7 @@ class MRLTrainer:
  'log_probs': log_probs.detach().cpu(),
  'reward': reward,
  'reference': interaction['answer'],
+ 'done': is_last_interaction,
  }
  episode_steps.append(trajectory)
  episode_rewards.append(reward)
@@ -432,92 +439,23 @@ class MRLTrainer:
 
  return trajectories
 
- def _critic_loss(self, inputs: TokenizedDict, rewards: torch.Tensor) -> torch.Tensor:
+ def _critic_loss(self, inputs: TokenizedDict, ref_values: torch.Tensor) -> torch.Tensor:
  # 1. Calculate values with critic encoder
  values = self.critic(
  inputs['input_ids'],
  attention_mask=inputs['attention_mask'],
  ).squeeze()
  # 2. Calculate critic loss
- loss = self.rl_algorithm.critic_loss(values, rewards)
+ loss = self.rl_algorithm.critic_loss(values, ref_values)
  return loss
 
  def _critic_writer(self, critic_loss: float, epoch: int):
  if self.writer is not None:
- self.writer.add_scalar('Loss/critic (global)', critic_loss, self.global_step['critic'])
+ self.writer.add_scalar('Loss/critic (global)', critic_loss, self.global_step['rl'])
  self.writer.add_scalar(f'Loss/critic (steps: {self.curriculum_steps}, epoch: {epoch})', critic_loss,
- self.epoch_step['critic'])
+ self.epoch_step['rl'])
  self.writer.add_scalar(f'Loss/critic (steps: {self.curriculum_steps})', critic_loss,
- self.stage_step['critic'])
-
- def update_critic(self, states: list[tuple[TokenizedDict, TokenizedDict, TokenizedDict]],
- rewards: list[torch.Tensor], epoch: int):
- """Update critic network using MSE loss."""
- # 1. Run critic updates for all collected batches
- critic_losses = []
- for step_idx, (state, reward) in enumerate(zip(states, rewards)):
- self._increment_steps('critic')
- # 2. Move state batches to training device (GPU)
- prev_query, prev_answer, next_query = self._move_multiple_batches(*state)
-
- # 3. Reset critic gradients
- self.critic_optimizer.zero_grad()
-
- # 4. Run critic and calculate loss - in autocast on/off mode
- if self.use_amp:
- # Move tensors to training device and calculate loss in autocast mode
- batch_rewards = reward.to(self.device)
- with torch.amp.autocast(device_type=self.device.type, dtype=self.dtype):
- # Concatenate state into single critic input sequence
- inputs = smart_concat_critic_states(
- prev_query, prev_answer, next_query,
- max_length=self.critic_max_len,
- pad_token_id=self.pad_token_id,
- )
- loss = self._critic_loss(inputs, batch_rewards)
- # Run backpropagation with scaler
- self.critic_scaler.scale(loss).backward()
- # Unscale and clip gradients
- self.critic_scaler.unscale_(self.critic_optimizer)
- torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=1.0, error_if_nonfinite=False)
- # Run scaled optimization step
- self.critic_scaler.step(self.critic_optimizer)
- self.critic_scaler.update()
- else:
- # Concatenate state into single critic input sequence
- inputs = smart_concat_critic_states(
- prev_query, prev_answer, next_query,
- max_length=self.critic_max_len,
- pad_token_id=self.pad_token_id,
- )
- # Calculate loss
- loss = self._critic_loss(inputs, reward.to(self.device, dtype=self.dtype))
- # Run backpropagation
- loss.backward()
- # Clip gradients
- torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=1.0, error_if_nonfinite=False)
- # Run optimizer step
- self.critic_optimizer.step()
- critic_loss = loss.item()
- self._critic_writer(critic_loss, epoch)
-
- # 5. Run "on critic updated" callbacks
- for cb in self.callbacks:
- cb.on_critic_updated(self.actor, self.critic, epoch, step_idx, critic_loss)
-
- # 6. Accumulate loss for epoch callbacks
- critic_losses.append(critic_loss)
-
- # 7. Calculate mean loss for epoch callbacks
- critic_mean_loss = torch.tensor(critic_losses).mean().item()
-
- return critic_mean_loss
-
- def _critic_advantages(self, critic_state: TokenizedDict, rewards: torch.Tensor) -> torch.Tensor:
- with torch.no_grad():
- values = self.critic(critic_state['input_ids'],
- attention_mask=critic_state['attention_mask']).squeeze()
- return self.rl_algorithm.calculate_advantages(rewards, values)
+ self.stage_step['rl'])
 
  def _rl_writer(self, policy_loss: float, epoch: int):
  if self.writer is not None:
@@ -526,107 +464,208 @@ class MRLTrainer:
  self.epoch_step['rl'])
  self.writer.add_scalar(f'Loss/policy (steps: {self.curriculum_steps})', policy_loss, self.stage_step['rl'])
 
- def rl_step(self, trajectories: list[MrlTrajectoryEpisode], epoch: int):
+ def _update_critic(self, state: tuple[TokenizedDict, TokenizedDict, TokenizedDict], ref_values: torch.Tensor,
+ epoch: int) -> float:
+ # 1. Reset critic gradients
+ self.critic_optimizer.zero_grad()
+
+ # 2. Update critic - with autocast on/off
+ if self.use_amp:
+ with torch.amp.autocast(device_type=self.device.type, dtype=self.dtype):
+ # 2.1 Concat states and calculate critic loss
+ critic_state = smart_concat_critic_states(*state, max_length=self.critic_max_len,
+ pad_token_id=self.pad_token_id)
+ critic_loss = self._critic_loss(critic_state, ref_values)
+ # 2.2 Run backpropagation with scaler
+ self.critic_scaler.scale(critic_loss).backward()
+ # 2.3 Unscale and clip gradients
+ self.critic_scaler.unscale_(self.critic_optimizer)
+ torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=1.0, error_if_nonfinite=False)
+ # 2.4 Run scaled optimization step
+ self.critic_scaler.step(self.critic_optimizer)
+ self.critic_scaler.update()
+ else:
+ # 2.1 Concat states and calculate critic loss
+ critic_state = smart_concat_critic_states(*state, max_length=self.critic_max_len,
+ pad_token_id=self.pad_token_id)
+ critic_loss = self._critic_loss(critic_state, ref_values)
+ # 2.2 Run backpropagation
+ critic_loss.backward()
+ # 2.3 Clip gradients
+ torch.nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=1.0, error_if_nonfinite=False)
+ # 2.4 Run optimizer step
+ self.critic_optimizer.step()
+ # 3. Get float loss value for callbacks/writer
+ critic_loss_item = critic_loss.item()
+
+ # 4. Write to TensorBoard
+ self._critic_writer(critic_loss_item, epoch)
+
+ # 5. Run "on critic updated" callbacks
+ for cb in self.callbacks:
+ cb.on_critic_updated(self.actor, self.critic, epoch, self.epoch_step['rl'], critic_loss_item)
+ # 6. Return loss item
+ return critic_loss_item
+
+ def _update_actor(self, state: tuple[TokenizedDict, TokenizedDict, TokenizedDict], action: TokenizedDict,
+ advantages: torch.Tensor, old_log_probs: torch.Tensor, epoch: int) -> float:
+ # 1. Reset actor gradients
+ self.optimizer.zero_grad()
+ # 2. Unpack state dicts
+ query, answer, next_query = state
+
+ # 3. Encode and update STM on each step, to include encoder and memory attention gradients in loss
+ self.encode_and_update_stm(query, answer)
+
+ # 4. Update actor - with autocast on/off
+ if self.use_amp:
+ with torch.amp.autocast(device_type=self.device.type, dtype=self.dtype):
+ # 4.1 Concatenate next query and action and get action logits from decoder
+ inputs = smart_concat(next_query, action, max_length=self.max_seq_len,
+ pad_token_id=self.pad_token_id)
+ logits = self.actor(inputs['input_ids'], attention_mask=inputs['attention_mask'],
+ action=MrlActorAction.DECODE)
+ # 4.2 Calculate policy loss with selected algorithm
+ policy_loss = self.rl_algorithm.policy_loss(next_query, action, logits, old_log_probs,
+ advantages)
+ # 4.3 Run backpropagation with scaler
+ self.scaler.scale(policy_loss).backward(retain_graph=True)
+ # 4.4 Unscale and clip gradient norms
+ self.scaler.unscale_(self.optimizer)
+ torch.nn.utils.clip_grad_norm_(self.actor.unique_parameters(), max_norm=1.0,
+ error_if_nonfinite=False)
+ # 4.5 Run scaled optimization step
+ self.scaler.step(self.optimizer)
+ self.scaler.update()
+ else:
+ # 4.1 Concatenate next query and action and get action logits from decoder
+ inputs = smart_concat(next_query, action, max_length=self.max_seq_len,
+ pad_token_id=self.pad_token_id)
+ logits = self.actor(inputs['input_ids'], attention_mask=inputs['attention_mask'],
+ action=MrlActorAction.DECODE)
+ # 4.2 Calculate policy loss with selected algorithm
+ policy_loss = self.rl_algorithm.policy_loss(next_query, action, logits, old_log_probs, advantages)
+ # 4.3 Run backpropagation
+ policy_loss.backward(retain_graph=True)
+ # 4.4 Clip gradient norms
+ torch.nn.utils.clip_grad_norm_(self.actor.unique_parameters(), max_norm=1.0,
+ error_if_nonfinite=False)
+ # 4.5 Run scaled optimization step
+ self.optimizer.step()
+ # 5. Get float loss value for callbacks/writer
+ policy_loss_item = policy_loss.item()
+
+ # 6. Write to TensorBoard
+ self._rl_writer(policy_loss_item, epoch)
+
+ # 7. Run "on batch updated" callback
+ for cb in self.callbacks:
+ cb.on_batch_updated(self.actor, epoch, self.epoch_step['rl'], policy_loss_item)
+
+ # 8. Return loss item
+ return policy_loss_item
+
+ def rl_step(self, trajectories: list[MrlTrajectoryEpisode], advantages: torch.Tensor, ref_values: torch.Tensor,
+ epoch: int, batch_size: int) -> tuple[float, float]:
  """Perform PPO update step using trajectories."""
  # 1. Run update separately for episodes in trajectory - we have to reset memory before each episode, and update
  # memory, based on collected episode data
  all_losses = []
- trajectories_len = len(trajectories)
+ critic_losses = []
  for episode_idx, episode in enumerate(trajectories):
  episode_steps = episode['steps']
  should_reset_stm = episode['reset_stm']
 
- # 2. Reset memory for current batch episode
+ # 2. Get advantages and reference values for current full episode (batch_size * episode_steps)
+ start = episode_idx * episode_steps
+ end = start + episode_steps
+ episode_critic_values = ref_values[start:end]
+ episode_advantages = advantages[start:end]
+
+ # 3. Reset memory for current batch episode
  if should_reset_stm:
  self.reset_stm()
 
- # 3. Run episode steps - each episode has number of steps depending on curriculum stage. Each step is run for all batch
- for step in episode_steps:
+ # 4. Run episode steps - each episode has number of steps depending on curriculum stage. Each step is run for all batch
+ for step_idx, step in enumerate(episode_steps):
  self._increment_steps('rl')
- state, action, reward, log_probs = step['state'], step['action'], step['reward'], step['log_probs']
+ # 5. Get and move to device collected states, action and log probs
+ state, action, _, log_probs = step['state'], step['action'], step['reward'], step['log_probs']
  query, answer, next_query = self._move_multiple_batches(*state)
  action = self._move_batch(action)
  log_probs = log_probs.to(self.device)
- rewards = torch.tensor(reward).to(self.device)
-
- # 4. Compute advantages using critic
- if self.use_amp:
- with torch.amp.autocast(device_type=self.device.type, dtype=self.dtype):
- critic_state = smart_concat_critic_states(query, answer, next_query,
- max_length=self.critic_max_len,
- pad_token_id=self.pad_token_id)
- advantages = self._critic_advantages(critic_state, rewards)
- else:
- critic_state = smart_concat_critic_states(query, answer, next_query, max_length=self.critic_max_len,
- pad_token_id=self.pad_token_id)
- advantages = self._critic_advantages(critic_state, rewards)
-
- # 5. Encode and update STM on each step, to include encoder and memory attention gradients in loss
- self.encode_and_update_stm(query, answer)
- # 6. Concatenate next query and action and get action logits from decoder
- if self.use_amp:
- with torch.amp.autocast(device_type=self.device.type, dtype=self.dtype):
- inputs = smart_concat(next_query, action, max_length=self.max_seq_len,
- pad_token_id=self.pad_token_id)
- logits = self.actor(inputs['input_ids'], attention_mask=inputs['attention_mask'],
- action=MrlActorAction.DECODE)
- else:
- inputs = smart_concat(next_query, action, max_length=self.max_seq_len,
- pad_token_id=self.pad_token_id)
- logits = self.actor(inputs['input_ids'], attention_mask=inputs['attention_mask'],
- action=MrlActorAction.DECODE)
-
- # 7. Calculate RL Algorithm (PPO etc.) loss
- policy_loss = self.rl_algorithm.policy_loss(next_query, action, logits, log_probs, advantages)
-
- # 8. Reset gradients
- self.optimizer.zero_grad()
-
- # 9. Update the model in AMP or regular mode
- if self.use_amp:
- self.scaler.scale(policy_loss).backward(retain_graph=True)
- self.scaler.unscale_(self.optimizer)
- torch.nn.utils.clip_grad_norm_(self.actor.unique_parameters(), max_norm=1.0,
- error_if_nonfinite=False)
- self.scaler.step(self.optimizer)
- self.scaler.update()
- else:
- policy_loss.backward(retain_graph=True)
- torch.nn.utils.clip_grad_norm_(self.actor.unique_parameters(), max_norm=1.0,
- error_if_nonfinite=False)
- self.optimizer.step()
 
- policy_loss_item = policy_loss.item()
- self._rl_writer(policy_loss_item, epoch)
- all_losses.append(policy_loss_item)
+ # 6. Select advantages and reference values for current step (batch_size)
+ step_critic_values = episode_critic_values[step_idx]
+ step_advantages = episode_advantages[step_idx]
 
- # 10. Run "on batch updated" callback
- for cb in self.callbacks:
- cb.on_batch_updated(self.actor, epoch, self.epoch_step['rl'], policy_loss_item)
+ # 7. Update critic
+ critic_loss_item = self._update_critic((query, answer, next_query), step_critic_values, epoch)
 
- return torch.mean(torch.tensor(all_losses)).item()
+ # 8. Accumulate critic loss for epoch callbacks
+ critic_losses.append(critic_loss_item)
 
- def _critic_states_and_rewards(self, trajectories: list[MrlTrajectoryEpisode]):
+ # 9. Update actor
+ policy_loss_item = self._update_actor((query, answer, next_query), action, step_advantages, log_probs,
+ epoch)
+ all_losses.append(policy_loss_item)
+ # 10. Return mean losses for epoch callbacks
+ return torch.mean(torch.tensor(all_losses)).item(), torch.mean(torch.tensor(critic_losses)).item()
+
+ def _critic_values_rewards_and_dones(self, trajectories: list[MrlTrajectoryEpisode]):
  flat_trajectories = [t for episode in trajectories for t in episode['steps']]
- states = [t['state'] for t in flat_trajectories]
- rewards = [torch.tensor(t['reward']) for t in flat_trajectories]
- return states, rewards
+ values = [
+ self._critic_values(
+ smart_concat_critic_states(
+ *self._move_multiple_batches(*t['state']),
+ max_length=self.critic_max_len,
+ pad_token_id=self.pad_token_id,
+ )
+ ) for t in flat_trajectories
+ ]
+ values = torch.stack(values).to(self.device)
+ rewards = torch.stack([torch.tensor(t['reward']) for t in flat_trajectories]).to(self.device)
+ dones = torch.stack([torch.BoolTensor(t['done']) for t in flat_trajectories]).to(self.device)
+ return values, rewards, dones
+
+ def _critic_values(self, inputs: TokenizedDict) -> torch.Tensor:
+ with torch.no_grad():
+ return self.critic(inputs['input_ids'],
+ attention_mask=inputs['attention_mask']).squeeze()
+
+ # return self.rl_algorithm.calculate_advantages(rewards, values)
 
  def train_epoch(self, dataloader: DataLoader, epoch: int, batch_size: int):
  """Train for one epoch."""
  # 1. Collect trajectories for current epoch
  trajectories = self.collect_trajectories(dataloader, epoch, batch_size)
 
- # 2. Flatten trajectories and collect state and rewards for critic update
- states, rewards = self._critic_states_and_rewards(trajectories)
- # 3. Update critic model, based on states and rewards
- critic_loss = self.update_critic(states, rewards, epoch)
+ # 2. Flatten trajectories, call critic and collect values, dones and rewards, and calculate advantages
+ if self.use_amp:
+ with torch.amp.autocast(device_type=self.device.type, dtype=self.dtype):
+ values, rewards, dones = self._critic_values_rewards_and_dones(trajectories)
+ advantages, ref_values = self.rl_algorithm.calculate_advantages(rewards, values, dones)
+ else:
+ values, rewards, dones = self._critic_values_rewards_and_dones(trajectories)
+ advantages, ref_values = self.rl_algorithm.calculate_advantages(rewards, values, dones)
 
- # 4. Run PPO algorithm step
- policy_loss = self.rl_step(trajectories, epoch)
+ # 3. Run internal update epochs
+ critic_loss_sum, policy_loss_sum = 0.0, 0.0
+ for update_epoch in range(self.update_epochs):
+ # 4. Run 'on update epoch start' callbacks
+ for cb in self.callbacks:
+ cb.on_update_epoch_start(self.actor, self.critic, epoch, update_epoch)
+ # 5. Run RL algorithm step
+ policy_loss, critic_loss = self.rl_step(trajectories[:-1], advantages, ref_values, epoch, batch_size)
 
- # 5. Return policy and critic mean losses for epoch callbacks
- return policy_loss, critic_loss
+ for cb in self.callbacks:
+ cb.on_update_epoch_end(self.actor, self.critic, epoch, update_epoch, policy_loss, critic_loss)
+
+ critic_loss_sum += critic_loss
+ policy_loss_sum += policy_loss
+
+ # 6. Return policy and critic mean losses for epoch callbacks
+ return policy_loss_sum / self.update_epochs, critic_loss_sum / self.update_epochs
 
  def _eval_loader(self, batch_size: int):
  if self.use_ddp:
@@ -747,55 +786,6 @@ class MRLTrainer:
 
  return should_stop_stage
 
- def _setup_curriculum_step(self, config: CurriculumConfig) -> tuple[tuple[int, UnfreezeEpochsStrategy], tuple[bool, int, float]]:
- # 1. Set common fields based on config
- self.curriculum_steps = config.get('steps', 1) # number of steps to run in episode
- self.train_dataset = config.get('dataset', None) # training dataset for current curriculum stage
- self.eval_dataset = config.get('eval_dataset', None) # evaluation dataset for current curriculum stage
- self.callbacks = config.get('callbacks',
- self.shared_callbacks) # trainer callbacks for current curriculum stage
- self.strategy = config.get('strategy',
- MrlStrategy.MULTI_STEP_STRATEGY) # MRL strategy for given curriculum stage
- self.reward = config.get('reward_model', self.shared_reward_model) # MRL Reward Model for curriculum stage
- if config['lr'] is not None or config['critic_lr'] is not None or config['weight_decay'] is not None or config['critic_weight_decay'] is not None or (config['separate_memory_lr'] and config['memory_lr'] is not None):
- if config.get('separate_memory_lr', False):
- self.optim_config = {
- 'lr': config.get('lr', self.base_optim_config['lr']),
- 'critic_lr': config.get('critic_lr', self.base_optim_config['critic_lr']),
- 'weight_decay': config.get('weight_decay', self.base_optim_config['weight_decay']),
- 'critic_weight_decay': config.get('critic_weight_decay', self.base_optim_config['critic_weight_decay']),
- 'memory_lr': config.get('memory_lr', self.base_optim_config['memory_lr']),
- }
- else:
- self.optim_config = {
- 'lr': config.get('lr', self.base_optim_config['lr']),
- 'critic_lr': config.get('critic_lr', self.base_optim_config['critic_lr']),
- 'weight_decay': config.get('weight_decay', self.base_optim_config['weight_decay']),
- 'critic_weight_decay': config.get('critic_weight_decay', self.base_optim_config['critic_weight_decay']),
- }
- self.optimizer, self.critic_optimizer = self._init_optimizers(**self.optim_config)
- elif self.optim_config != self.base_optim_config:
- self.optim_config = self.base_optim_config
- self.optimizer, self.critic_optimizer = self._init_optimizers(**self.optim_config)
-
-
-
-
- # 2. Get epochs and random resets configs
- epochs = config.get('epochs', 5) # number of epochs for current stage
- unfreeze_epoch = config.get('unfreeze_epoch',
- 0) # epoch when components (other than memory) are unfrozen (before epoch starts)
- random_resets = config.get('random_resets',
- False) # flag for using random STM resets (recommended, as model should learn transitions between different states)
- random_resets_from = config.get('random_resets_from', None) # epoch from which random STM resets are started
- random_resets_ratio = config.get('random_resets_ratio',
- None) # ratio of random STM resets - 1.0 is "always reset", 0.0 is "no resets"
-
- # 3. Reset stage step counter
- self.stage_step = self._init_steps()
-
- return (epochs, unfreeze_epoch), (random_resets, random_resets_from, random_resets_ratio)
-
  def _apply_unfreeze_strategy(self, epoch: int, unfreeze_epoch: UnfreezeEpochsStrategy):
  is_staged_unfreeze = isinstance(unfreeze_epoch, tuple)
  if is_staged_unfreeze:
@@ -808,28 +798,31 @@ class MRLTrainer:
  self.optimizer = self._init_unfreeze_optimizer('update', cross_att_lr)
  print(f"Activating 'update' unfreeze strategy with custom cross_att_lr: {cross_att_lr}")
  elif epoch == update_epoch:
- self.actor.freeze_components('update')
- print(f"Activating 'update' unfreeze strategy - mem-att trainable / cross-att frozen / rest model frozen")
+ self.actor.freeze_components('update')
+ print(
+ f"Activating 'update' unfreeze strategy - mem-att trainable / cross-att frozen / rest model frozen")
 
  if isinstance(fetch_epoch, tuple):
  switch_epoch, mem_att_lr = fetch_epoch
- if epoch == fetch_epoch:
+ if epoch == switch_epoch:
  self.actor.freeze_components('joint')
  self.optimizer = self._init_unfreeze_optimizer('fetch', mem_att_lr)
  print(f"Activating 'fetch' unfreeze strategy with custom mem_att_lr: {mem_att_lr}")
  elif epoch == fetch_epoch:
  self.actor.freeze_components('fetch')
- print(f"Activating 'fetch' unfreeze strategy - mem-att frozen / cross-att trainable / rest model frozen")
+ print(
+ f"Activating 'fetch' unfreeze strategy - mem-att frozen / cross-att trainable / rest model frozen")
 
  if isinstance(joint_epoch, tuple):
  switch_epoch, model_lr = joint_epoch
- if epoch == joint_epoch:
+ if epoch == switch_epoch:
  self.actor.unfreeze_components()
  self.optimizer = self._init_unfreeze_optimizer('joint', model_lr)
  print(f"Activating 'joint' unfreeze strategy with custom model_lr: {model_lr}")
  elif epoch == joint_epoch:
- self.actor.freeze_components('joint')
- print(f"Activating 'joint' unfreeze strategy - mem-att/cross-att trainable / rest model frozen")
+ self.actor.freeze_components('joint')
+ print(f"Activating 'joint' unfreeze strategy - mem-att/cross-att trainable / rest model frozen")
+
  if epoch == all_epoch:
  self.actor.unfreeze_components()
  self.optimizer = self._init_unfreeze_optimizer('all', 0.)
@@ -871,6 +864,56 @@ class MRLTrainer:
 
  return torch.optim.AdamW(params, weight_decay=self.optim_config['weight_decay'])
 
+ def _setup_curriculum_step(self, config: CurriculumConfig) -> tuple[
+ tuple[int, UnfreezeEpochsStrategy], tuple[bool, int, float]]:
+ # 1. Set common fields based on config
+ self.curriculum_steps = config.get('steps', 1) # number of steps to run in episode
+ self.train_dataset = config.get('dataset', None) # training dataset for current curriculum stage
+ self.eval_dataset = config.get('eval_dataset', None) # evaluation dataset for current curriculum stage
+ self.callbacks = config.get('callbacks',
+ self.shared_callbacks) # trainer callbacks for current curriculum stage
+ self.strategy = config.get('strategy',
+ MrlStrategy.MULTI_STEP_STRATEGY) # MRL strategy for given curriculum stage
+ self.reward = config.get('reward_model', self.shared_reward_model) # MRL Reward Model for curriculum stage
+ self.update_epochs = config.get('update_epochs', self.shared_update_epochs) # Internal update epochs
+ if config['lr'] is not None or config['critic_lr'] is not None or config['weight_decay'] is not None or config[
+ 'critic_weight_decay'] is not None or (config['separate_memory_lr'] and config['memory_lr'] is not None):
+ if config.get('separate_memory_lr', False):
+ self.optim_config = {
+ 'lr': config.get('lr', self.base_optim_config['lr']),
+ 'critic_lr': config.get('critic_lr', self.base_optim_config['critic_lr']),
+ 'weight_decay': config.get('weight_decay', self.base_optim_config['weight_decay']),
+ 'critic_weight_decay': config.get('critic_weight_decay',
+ self.base_optim_config['critic_weight_decay']),
+ 'memory_lr': config.get('memory_lr', self.base_optim_config['memory_lr']),
+ }
+ else:
+ self.optim_config = {
+ 'lr': config.get('lr', self.base_optim_config['lr']),
+ 'critic_lr': config.get('critic_lr', self.base_optim_config['critic_lr']),
+ 'weight_decay': config.get('weight_decay', self.base_optim_config['weight_decay']),
+ 'critic_weight_decay': config.get('critic_weight_decay',
+ self.base_optim_config['critic_weight_decay']),
+ }
+ self.optimizer, self.critic_optimizer = self._init_optimizers(**self.optim_config)
+ elif self.optim_config != self.base_optim_config:
+ self.optim_config = self.base_optim_config
+ self.optimizer, self.critic_optimizer = self._init_optimizers(**self.optim_config)
+
+ # 2. Get epochs and random resets configs
+ epochs = config.get('epochs', 5) # number of epochs for current stage
+ unfreeze_epoch = config.get('unfreeze_epoch',
+ 0) # epoch when components (other than memory) are unfrozen (before epoch starts)
+ random_resets = config.get('random_resets',
+ False) # flag for using random STM resets (recommended, as model should learn transitions between different states)
+ random_resets_from = config.get('random_resets_from', None) # epoch from which random STM resets are started
+ random_resets_ratio = config.get('random_resets_ratio',
+ None) # ratio of random STM resets - 1.0 is "always reset", 0.0 is "no resets"
+
+ # 3. Reset stage step counter
+ self.stage_step = self._init_steps()
+
+ return (epochs, unfreeze_epoch), (random_resets, random_resets_from, random_resets_ratio)
 
  def __call__(self, curriculum_config: list[CurriculumConfig], batch_size: int):
  """Start Memory Reinforcement Learning Curriculum."""
@@ -899,9 +942,11 @@ class MRLTrainer:
  if unfreeze_epoch != 0:
  self.actor.freeze_components('joint')
  if isinstance(unfreeze_epoch, tuple):
- print(f"Starting training with unfreeze strategies - 'warmup' - mem-att/cross-att trainable / rest model frozen")
+ print(
+ f"Starting training with unfreeze strategies - 'warmup' - mem-att/cross-att trainable / rest model frozen")
  else:
- print(f"Starting training with simple unfreeze - 'joint' - mem-att/cross-att trainable / rest model frozen")
+ print(
+ f"Starting training with simple unfreeze - 'joint' - mem-att/cross-att trainable / rest model frozen")
 
  # 5. Setup train DataLoader
  if self.use_ddp:
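The update_epochs setting added in this file controls how many internal PPO update epochs are run over each freshly collected batch of trajectories (train_epoch averages the returned losses over them), and a curriculum stage can override the shared value. A hedged configuration sketch, showing only the keys visible in this diff with illustrative values:

# Only keys that appear in this diff are shown; real MrlConfig/CurriculumConfig dicts need their remaining fields too.
mrl_config = {
    'max_seq_len': 256,
    'critic_max_len': 512,
    'weight_decay': 0.01,          # illustrative value
    'critic_weight_decay': 0.01,   # illustrative value
    'update_epochs': 10,           # shared default used by MRLTrainer when a stage does not override it
}

curriculum_stage = {
    'steps': 4,                    # illustrative value
    'epochs': 5,
    'update_epochs': 6,            # optional per-stage override (CurriculumConfig.update_epochs)
}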
rxnn/training/rl.py CHANGED
@@ -17,7 +17,7 @@ class RlAlgorithm(ABC):
  pass
 
  @abstractmethod
- def calculate_advantages(self, rewards: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
+ def calculate_advantages(self, rewards: torch.Tensor, values: torch.Tensor, dones: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
  pass
 
  def critic_loss(self, rewards: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
@@ -25,6 +25,9 @@ class RlAlgorithm(ABC):
 
  class PPOConfig(TypedDict):
  clip_eps: float
+ gae_lambda: float
+ gae_gamma: float
+ entropy_coef: float
 
  class PPOAlgorithm(RlAlgorithm):
  def __init__(self, config: PPOConfig):
@@ -32,6 +35,9 @@ class PPOAlgorithm(RlAlgorithm):
 
  # PPO Config
  self.clip_eps = config.get('clip_eps', 0.2)
+ self.gae_lambda = config.get('gae_lambda', 0.95)
+ self.gae_gamma = config.get('gae_gamma', 0.99)
+ self.entropy_coef = config.get('entropy_coef', 0.01)
 
  def policy_loss(self, query: TokenizedDict, answer: TokenizedDict, logits: torch.Tensor,
  old_log_probs: torch.Tensor, advantages: torch.Tensor) -> torch.Tensor:
@@ -78,11 +84,32 @@ class PPOAlgorithm(RlAlgorithm):
 
  # d) Entropy bonus
  entropy = -torch.sum(new_probs * new_probs.exp(), dim=-1).mean()
- policy_loss -= 0.01 * entropy
+ policy_loss -= self.entropy_coef * entropy
 
  return policy_loss
 
- def calculate_advantages(self, rewards: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
- advantages = rewards - values
+ def _compute_gae(self, rewards: torch.Tensor, values: torch.Tensor, next_value: torch.Tensor, dones: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ T, B = rewards.shape
+ advantages = torch.zeros_like(rewards, device=values.device)
+ last_advantage = 0
+ last_value = next_value.detach()
+
+ for t in reversed(range(T)):
+ if t == T - 1:
+ next_values = last_value
+ else:
+ next_values = values[t + 1]
+
+ # Mask next values if episode ended
+ next_values = next_values * (1 - dones[t])
+ delta = rewards[t] + self.gae_gamma * next_values - values[t]
+ advantages[t] = delta + self.gae_gamma * self.gae_lambda * last_advantage
+ last_advantage = advantages[t]
+
+ returns = advantages + values
+ return advantages, returns
+
+ def calculate_advantages(self, rewards: torch.Tensor, values: torch.Tensor, dones: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ advantages, ref_values = self._compute_gae(rewards[:-1], values[:-1], values[-1], dones[:-1])
  normalized_advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
- return normalized_advantages
+ return normalized_advantages, ref_values
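calculate_advantages now returns both normalized advantages and reference values (returns) computed with GAE over the step dimension, instead of the old rewards - values baseline. Below is a small standalone re-implementation of the same recurrence for illustration only (plain PyTorch with dummy tensors, not the library call); gamma and lam mirror the gae_gamma/gae_lambda defaults from PPOConfig:

import torch

def compute_gae(rewards, values, next_value, dones, gamma=0.99, lam=0.95):
    # rewards/values/dones: (T, B) tensors over T steps and B batched episodes.
    T, _ = rewards.shape
    advantages = torch.zeros_like(rewards)
    last_advantage = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        next_values = next_value if t == T - 1 else values[t + 1]
        next_values = next_values * (1 - dones[t].float())   # zero the bootstrap after an episode ends
        delta = rewards[t] + gamma * next_values - values[t]
        advantages[t] = delta + gamma * lam * last_advantage
        last_advantage = advantages[t]
    returns = advantages + values   # used as the critic regression target (ref_values)
    return advantages, returns

# Dummy rollout: 4 steps, batch of 2; the extra value row is the bootstrap for the step after the last one.
T, B = 4, 2
rewards, values, dones = torch.rand(T, B), torch.rand(T + 1, B), torch.zeros(T, B)
dones[-1] = 1.0
adv, ref = compute_gae(rewards, values[:-1], values[-1], dones)
norm_adv = (adv - adv.mean()) / (adv.std() + 1e-8)   # same normalization as calculate_advantages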
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rxnn
- Version: 0.2.29
+ Version: 0.2.30
  Summary: RxNN: Reactive Neural Networks Platform
  License: Apache-2.0
  Keywords: deep-learning,ai,machine-learning
@@ -13,12 +13,12 @@ rxnn/rxt/models.py,sha256=r8wZeeNTC2VAhiiNe4y7LrbnB4wjFu_cupKiGkpdgjI,13002
  rxnn/training/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  rxnn/training/base.py,sha256=_xik1GXE4RJ_nxwqLQ1ccXA5pRtBCi-jL-jeRFBdHBU,11851
  rxnn/training/bml.py,sha256=FJszaQXOLx2ZHBa1CQpyMrG8i4Kj14E-gzDAEK_Ei5k,17272
- rxnn/training/callbacks.py,sha256=-N0MQPpZQaUWCINdTOsjul4bDGbGr2JgQBqOIXBLS6o,35053
+ rxnn/training/callbacks.py,sha256=p72lbzFAmFjpcUvyy4aUB3qd53I8C6Sk5w9nQvsKgTk,35852
  rxnn/training/dataset.py,sha256=7hTilFWPpqUEc6zNcMqBPjxFKxCfvTKKF3E8tVlwccQ,51250
  rxnn/training/models.py,sha256=2KhNT7yx0AgUke4nmsFqzQKx_YYp78QvsLWYZjWeUgQ,6812
- rxnn/training/mrl.py,sha256=MnLaYWxblc5cF261R5PNjIvddVQVNxyjAkEYtchBn9E,49299
+ rxnn/training/mrl.py,sha256=DGevQoimkB9qBEkqIw1kkh5DfLqBM-XGvFkraqh-uYk,51545
  rxnn/training/reward.py,sha256=7MTVdNm5HnWmt6zFDi3TAYmnVSL_-24riOoY2F7z4x8,11290
- rxnn/training/rl.py,sha256=j-KNLoZjhaEKasYNOc8DxHtwvknAgAJFwvXKot6otFA,3272
+ rxnn/training/rl.py,sha256=U-mlTK2hF0wZQslzjlvF4S_sMkeTuSqKsCB3IWEsd2A,4558
  rxnn/training/scheduler.py,sha256=LcjU35mEwz2U5x3U6tLfeeYlBqMxbFSxYzJYuXkWbSY,1408
  rxnn/training/tokenizer.py,sha256=umaLByMBx_NMrQElA45HLm9gkuzyKWDTFaKVd-CjXl0,8344
  rxnn/training/utils.py,sha256=Bw8nZLKIt7NQpUVCYkb_79kWKChVFOYgYXwODo4SvNc,5718
@@ -32,7 +32,7 @@ rxnn/transformers/moe.py,sha256=j6jEx6Ip0zttlUZKKn82azxo95lkLZs-H2GLSMD88hY,5859
  rxnn/transformers/positional.py,sha256=1PjcJybUzeQlIKJI4tahAGZcYgCRCL0otxs7mpsNuzM,4410
  rxnn/transformers/sampler.py,sha256=t6iiQTdLQ0TakUWnnhKkb5DKF2F_9-thXHBydDF3fxg,17389
  rxnn/utils.py,sha256=ihb6OTyDtPiocB_lOvnq7eOkjjpCkgs8wxvXUBNQ7mM,996
- rxnn-0.2.29.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
- rxnn-0.2.29.dist-info/METADATA,sha256=WVEyKmyYbMOb5sm7vjjnCN9j8ABz0QfGJCYkQbWvwT8,25960
- rxnn-0.2.29.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- rxnn-0.2.29.dist-info/RECORD,,
+ rxnn-0.2.30.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+ rxnn-0.2.30.dist-info/METADATA,sha256=zRJ_oHLqUD0QDKJoGRJ6FH5MC-y0k8nOn_inZ_iEP8c,25960
+ rxnn-0.2.30.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ rxnn-0.2.30.dist-info/RECORD,,