rxnn 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxnn/training/base.py +5 -3
- rxnn/training/bml.py +30 -21
- rxnn/training/callbacks.py +89 -78
- {rxnn-0.1.8.dist-info → rxnn-0.1.10.dist-info}/METADATA +1 -1
- {rxnn-0.1.8.dist-info → rxnn-0.1.10.dist-info}/RECORD +7 -7
- {rxnn-0.1.8.dist-info → rxnn-0.1.10.dist-info}/LICENSE +0 -0
- {rxnn-0.1.8.dist-info → rxnn-0.1.10.dist-info}/WHEEL +0 -0
rxnn/training/base.py
CHANGED
@@ -141,7 +141,8 @@ class BaseTrainer(ABC):
 callback.on_batch_start(self.model, batch_idx, batch)
 if self.get_batch_size(batch) == batch_size:
 loss = self.train_step(batch, batch_idx)
-
+orig_loss = loss.item()
+self.accumulated_loss += orig_loss
 loss = loss / self.gradient_accumulation_steps

 if self.use_amp:
@@ -192,7 +193,7 @@ class BaseTrainer(ABC):
 epoch * len(dataloader) + batch_idx)

 for callback in self.callbacks:
-should_stop = callback.on_batch_end(self.model, batch_idx,
+should_stop = callback.on_batch_end(self.model, batch_idx, orig_loss, batch)
 if should_stop:
 self.is_running = False

@@ -228,7 +229,8 @@ class BaseTrainer(ABC):
 self.writer.add_scalar('Loss/validation', val_loss, epoch)
 self.writer.add_scalar('Perplexity/validation', math.exp(val_loss), epoch)
 if val_metrics['accuracy']:
-self.writer.add_scalar('Accuracy/validation', val_metrics['
+self.writer.add_scalar('Node Accuracy/validation', val_metrics['node_accuracy'], epoch)
+self.writer.add_scalar('Avg. Accuracy/validation', val_metrics['accuracy'], epoch)

 def valid_step(self, batch: dict[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
 if self.use_amp:
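Note on the rxnn/training/base.py change: the trainer now captures the un-scaled batch loss (orig_loss) before dividing by gradient_accumulation_steps, adds it to accumulated_loss, and passes it to on_batch_end callbacks, so callbacks and logs see the true per-batch loss rather than the accumulation-scaled one. The sketch below illustrates this pattern only; train_epoch, its signature, and the callable callbacks are illustrative stand-ins, not the rxnn API.

    import torch

    def train_epoch(model, dataloader, optimizer, callbacks, accumulation_steps=4):
        """Gradient accumulation that still reports the true per-batch loss."""
        model.train()
        accumulated_loss, num_batches = 0.0, 0
        optimizer.zero_grad()
        for batch_idx, (x, y) in enumerate(dataloader):
            loss = torch.nn.functional.mse_loss(model(x), y)  # stand-in for a train_step()
            orig_loss = loss.item()                 # un-scaled loss, before accumulation scaling
            accumulated_loss += orig_loss
            num_batches += 1
            (loss / accumulation_steps).backward()  # scaled loss drives the accumulated gradients
            if (batch_idx + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            for callback in callbacks:
                callback(batch_idx, orig_loss)      # callbacks receive the un-scaled value
        return accumulated_loss / max(num_batches, 1)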
rxnn/training/bml.py
CHANGED
@@ -90,12 +90,15 @@ class MLMTrainer(BaseTrainer):
 total += valid_indices.sum()

 avg_loss = (val_loss / len(val_dataloader)).item()
+acc = (correct / total * 100) if total > 0 else torch.tensor(0.0).to(self.device)
+node_acc = acc.item()
 if self.use_ddp:
-dist.all_reduce(
-
+dist.all_reduce(acc, op=dist.ReduceOp.SUM)
+acc = acc / dist.get_world_size()

 metrics = {
-'accuracy':
+'accuracy': acc.item(),
+'node_accuracy': node_acc,
 }
 self.model.train()
 return avg_loss, metrics
@@ -154,13 +157,15 @@ class AutoregressiveTrainer(BaseTrainer):
 total += valid_indices.sum()

 avg_loss = (val_loss / len(val_dataloader)).item()
-
+acc = (correct / total * 100) if total > 0 else torch.tensor(0.0).to(self.device)
+node_acc = acc.item()
 if self.use_ddp:
-dist.all_reduce(
-
+dist.all_reduce(acc, op=dist.ReduceOp.SUM)
+acc = acc / dist.get_world_size()

 metrics = {
-'accuracy':
+'accuracy': acc.item(),
+'node_accuracy': node_acc,
 }
 self.model.train()
 return avg_loss, metrics
@@ -260,8 +265,10 @@ class JointLMTrainer(BaseTrainer):
 self.writer.add_scalar('Loss/validation', val_loss, epoch)
 self.writer.add_scalar('Perplexity/validation', math.exp(val_loss), epoch)
 if val_metrics['accuracy']:
-self.writer.add_scalar('Encoder accuracy/validation', val_metrics['accuracy']['
-self.writer.add_scalar('Decoder accuracy/validation', val_metrics['accuracy']['
+self.writer.add_scalar('Encoder node accuracy/validation', val_metrics['accuracy']['node_encoder'], epoch)
+self.writer.add_scalar('Decoder node accuracy/validation', val_metrics['accuracy']['node_decoder'], epoch)
+self.writer.add_scalar('Encoder avg. accuracy/validation', val_metrics['accuracy']['encoder'], epoch)
+self.writer.add_scalar('Decoder avg. accuracy/validation', val_metrics['accuracy']['decoder'], epoch)
 if val_metrics['loss']:
 self.writer.add_scalar('Encoder loss/validation', val_metrics['loss']['encoder'], epoch)
 self.writer.add_scalar('Encoder perplexity/validation', math.exp(val_metrics['loss']['encoder']), epoch)
@@ -317,28 +324,30 @@ class JointLMTrainer(BaseTrainer):
 avg_loss = val_loss / loader_len
 avg_dec_loss = dec_loss / loader_len
 avg_enc_loss = enc_loss / loader_len
-
+mlm_acc = (correct_mlm / total_mlm * 100) if total_mlm > 0 else torch.tensor(0.0).to(self.device)
+alm_acc = (correct_alm / total_alm * 100) if total_alm > 0 else torch.tensor(0.0).to(self.device)
+node_mlm_acc = mlm_acc.item()
+node_alm_acc = alm_acc.item()
 if self.use_ddp:
 dist.all_reduce(avg_dec_loss, op=dist.ReduceOp.SUM)
 dist.all_reduce(avg_enc_loss, op=dist.ReduceOp.SUM)
-dist.all_reduce(
-dist.all_reduce(
-dist.all_reduce(correct_alm, op=dist.ReduceOp.SUM)
-dist.all_reduce(total_alm, op=dist.ReduceOp.SUM)
+dist.all_reduce(mlm_acc, op=dist.ReduceOp.SUM)
+dist.all_reduce(alm_acc, op=dist.ReduceOp.SUM)
 avg_dec_loss = avg_dec_loss / dist.get_world_size()
 avg_enc_loss = avg_enc_loss / dist.get_world_size()
-
-
-alm_acc = (correct_alm / total_alm * 100).item() if total_alm > 0 else 0.0
+mlm_acc = mlm_acc / dist.get_world_size()
+alm_acc = alm_acc / dist.get_world_size()

 metrics = {
 'accuracy': {
-'encoder': mlm_acc,
-'decoder': alm_acc,
+'encoder': mlm_acc.item(),
+'decoder': alm_acc.item(),
+'node_encoder': node_mlm_acc,
+'node_decoder': node_alm_acc,
 },
 'loss': {
-'encoder': avg_enc_loss,
-'decoder': avg_dec_loss,
+'encoder': avg_enc_loss.item(),
+'decoder': avg_dec_loss.item(),
 }
 }
 self.model.train()
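Note on the rxnn/training/bml.py changes: each trainer now computes validation accuracy as a tensor per process, keeps the pre-reduction value as a "node" accuracy via .item(), and, under DDP, averages the per-rank percentages with all_reduce(SUM) followed by division by get_world_size(). The previous code reduced raw correct/total counts; averaging per-rank percentages weights every rank equally rather than every token. Below is a minimal sketch of that reduction pattern only; reduce_accuracy and its arguments are illustrative, and it assumes init_process_group has already been called when use_ddp is True.

    import torch
    import torch.distributed as dist

    def reduce_accuracy(correct: torch.Tensor, total: torch.Tensor, device, use_ddp: bool) -> dict:
        # Local (per-process) accuracy as a tensor, so it can be all-reduced.
        acc = (correct / total * 100) if total > 0 else torch.tensor(0.0).to(device)
        node_acc = acc.item()                          # this rank's accuracy, before reduction
        if use_ddp:
            dist.all_reduce(acc, op=dist.ReduceOp.SUM) # sum the per-rank percentages
            acc = acc / dist.get_world_size()          # turn the sum into a cross-rank mean
        return {'accuracy': acc.item(), 'node_accuracy': node_acc}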
rxnn/training/callbacks.py
CHANGED
@@ -83,9 +83,12 @@ class PrintAccuracyCallback(TrainerCallback):

 def on_validation_end(self, model: nn.Module, epoch: int, val_loss: float, val_metrics: dict) -> None:
 if self.joint_mode:
+print(f"Epoch {epoch} - encoder node accuracy: {val_metrics['accuracy']['node_encoder']:.4f}")
+print(f"Epoch {epoch} - decoder node accuracy: {val_metrics['accuracy']['node_decoder']:.4f}")
 print(f"Epoch {epoch} - encoder accuracy: {val_metrics['accuracy']['encoder']:.4f}")
 print(f"Epoch {epoch} - decoder accuracy: {val_metrics['accuracy']['decoder']:.4f}")
 else:
+print(f"Epoch {epoch} - node accuracy: {val_metrics['node_accuracy']:.4f}")
 print(f"Epoch {epoch} - accuracy: {val_metrics['accuracy']:.4f}")

@@ -130,6 +133,7 @@ class ModelSaveCallback(TrainerCallback):
 save_checkpoint_after_n_batches: int = None,
 push_batch_checkpoint: bool = False,
 display_exc_trace: bool = False,
+use_ddp: bool = False,
 ):
 self.save_dir = save_dir
 self.save_best_only = save_best_only
@@ -146,10 +150,11 @@ class ModelSaveCallback(TrainerCallback):
 self.push_batch_checkpoint = push_batch_checkpoint
 self.finished_epochs = 0
 self.display_exc_trace = display_exc_trace
+self.rank = int(os.environ['RANK']) if use_ddp else 0

 def on_batch_end(self, model: torch.nn.Module, batch_idx: int, loss: int, batch: dict[str, torch.Tensor]) -> Union[
 bool, None]:
-if self.save_checkpoint_after_n_batches is not None and batch_idx != 0 and batch_idx % self.save_checkpoint_after_n_batches == 0:
+if self.rank == 0 and self.save_checkpoint_after_n_batches is not None and batch_idx != 0 and batch_idx % self.save_checkpoint_after_n_batches == 0:
 if isinstance(model, DistributedDataParallel):
 model = next(model.children())
 try:
@@ -195,90 +200,92 @@ class ModelSaveCallback(TrainerCallback):
 val_loss: float,
 val_metrics: dict
 ):
-self.
-
-self.best_loss
+if self.rank == 0:
+self.finished_epochs += 1
+if val_loss < self.best_loss:
+self.best_loss = val_loss
+if isinstance(model, DistributedDataParallel):
+model = next(model.children())
+try:
+if model.save_pretrained is not None:
+ckpt_path = os.path.join(
+self.save_dir,
+f'epoch_{epoch}_val_loss_{val_loss:.4f}'
+)
+path_exists = os.path.exists(ckpt_path)
+if not path_exists:
+os.makedirs(ckpt_path)
+model.save_pretrained(save_directory=ckpt_path)
+else:
+path_exists = os.path.exists(self.save_dir)
+if not path_exists:
+os.makedirs(self.save_dir)
+ckpt_path = os.path.join(
+self.save_dir,
+f'epoch_{epoch}_val_loss_{val_loss:.4f}.pt'
+)
+torch.save(model.state_dict(), ckpt_path)
+self.ckpt_paths.append(ckpt_path)
+
+# Keep only N best checkpoints
+if len(self.ckpt_paths) > self.max_keep:
+oldest_path = self.ckpt_paths.pop(0)
+if model.save_pretrained is not None:
+shutil.rmtree(oldest_path)
+else:
+os.remove(oldest_path)
+except Exception as e:
+print(f"Error saving epoch checkpoint: {str(e)}")
+if self.display_exc_trace:
+traceback.print_exc()
+
+try:
+if self.push_to_hub and self.push_checkpoint_weights and model.push_to_hub is not None and self.hub_model_id:
+model.push_to_hub(
+repo_id=self.hub_model_id,
+commit_message=f'Epoch {epoch} - Val loss {val_loss:.4f}',
+token=self.hf_token,
+private=self.private_repo,
+)
+except Exception as e:
+print(f"Error pushing epoch checkpoint: {str(e)}")
+if self.display_exc_trace:
+traceback.print_exc()
+
+def on_training_end(self, model: Union[torch.nn.Module, PyTorchModelHubMixin]):
+if self.rank == 0:
 if isinstance(model, DistributedDataParallel):
 model = next(model.children())
 try:
+# Save final model
 if model.save_pretrained is not None:
 ckpt_path = os.path.join(
 self.save_dir,
-
+'final_model'
 )
-path_exists = os.path.exists(ckpt_path)
-if not path_exists:
-os.makedirs(ckpt_path)
 model.save_pretrained(save_directory=ckpt_path)
 else:
-
-if not path_exists:
-os.makedirs(self.save_dir)
-ckpt_path = os.path.join(
-self.save_dir,
-f'epoch_{epoch}_val_loss_{val_loss:.4f}.pt'
-)
+ckpt_path = os.path.join(self.save_dir, 'final_model.pt')
 torch.save(model.state_dict(), ckpt_path)
-
-
-# Keep only N best checkpoints
-if len(self.ckpt_paths) > self.max_keep:
-oldest_path = self.ckpt_paths.pop(0)
-if model.save_pretrained is not None:
-shutil.rmtree(oldest_path)
-else:
-os.remove(oldest_path)
+print(f"Final model saved to {ckpt_path}")
 except Exception as e:
-print(f"Error saving
+print(f"Error saving final model: {str(e)}")
 if self.display_exc_trace:
 traceback.print_exc()
-
 try:
-if self.push_to_hub and
+if self.push_to_hub and model.push_to_hub is not None:
 model.push_to_hub(
 repo_id=self.hub_model_id,
-commit_message=f'
+commit_message=self.final_commit_message or f'Final pre-trained model, after {self.finished_epochs} epochs',
 token=self.hf_token,
 private=self.private_repo,
 )
+print(f"Model uploaded to repo: {self.hub_model_id}")
 except Exception as e:
-print(f"Error pushing
+print(f"Error pushing final model: {str(e)}")
 if self.display_exc_trace:
 traceback.print_exc()

-def on_training_end(self, model: Union[torch.nn.Module, PyTorchModelHubMixin]):
-if isinstance(model, DistributedDataParallel):
-model = next(model.children())
-try:
-# Save final model
-if model.save_pretrained is not None:
-ckpt_path = os.path.join(
-self.save_dir,
-'final_model'
-)
-model.save_pretrained(save_directory=ckpt_path)
-else:
-ckpt_path = os.path.join(self.save_dir, 'final_model.pt')
-torch.save(model.state_dict(), ckpt_path)
-print(f"Final model saved to {ckpt_path}")
-except Exception as e:
-print(f"Error saving final model: {str(e)}")
-if self.display_exc_trace:
-traceback.print_exc()
-try:
-if self.push_to_hub and model.push_to_hub is not None:
-model.push_to_hub(
-repo_id=self.hub_model_id,
-commit_message=self.final_commit_message or f'Final pre-trained model, after {self.finished_epochs} epochs',
-token=self.hf_token,
-private=self.private_repo,
-)
-print(f"Model uploaded to repo: {self.hub_model_id}")
-except Exception as e:
-print(f"Error pushing final model: {str(e)}")
-if self.display_exc_trace:
-traceback.print_exc()
-

 class JointModelSaveCallback(TrainerCallback):
 def __init__(
@@ -298,6 +305,7 @@ class JointModelSaveCallback(TrainerCallback):
 push_batch_checkpoint: bool = False,
 mlm_mode: bool = False,
 display_exc_trace: bool = False,
+use_ddp: bool = False,
 ):
 self.save_dir = save_dir
 self.save_best_only = save_best_only
@@ -317,6 +325,7 @@ class JointModelSaveCallback(TrainerCallback):
 self.finished_epochs = 0
 self.mlm_mode = mlm_mode
 self.display_exc_trace = display_exc_trace
+self.rank = int(os.environ['RANK']) if use_ddp else 0

 def _save_batch(self, model: Union[nn.Module, PyTorchModelHubMixin], component: str, hub_id: str = None):
 try:
@@ -362,7 +371,7 @@ class JointModelSaveCallback(TrainerCallback):

 def on_batch_end(self, model: torch.nn.Module, batch_idx: int, loss: int, batch: dict[str, torch.Tensor]) -> Union[
 bool, None]:
-if self.save_checkpoint_after_n_batches is not None and batch_idx != 0 and batch_idx % self.save_checkpoint_after_n_batches == 0:
+if self.rank == 0 and self.save_checkpoint_after_n_batches is not None and batch_idx != 0 and batch_idx % self.save_checkpoint_after_n_batches == 0:
 if isinstance(model, DistributedDataParallel):
 model = next(model.children())
 self._save_batch(model.encoder, 'encoder', hub_id=self.hub_model_encoder)
@@ -430,15 +439,16 @@ class JointModelSaveCallback(TrainerCallback):
 val_loss: float,
 val_metrics: dict
 ):
-self.
-
-self.best_loss
-
-
-
-
-
-
+if self.rank == 0:
+self.finished_epochs += 1
+if val_loss < self.best_loss:
+self.best_loss = val_loss
+if isinstance(model, DistributedDataParallel):
+model = next(model.children())
+self._save_validation(model.encoder, 'encoder', epoch, val_loss, hub_id=self.hub_model_encoder)
+if not self.mlm_mode:
+self._save_validation(model.decoder, 'decoder', epoch, val_loss, hub_id=self.hub_model_decoder)
+self._save_validation(model.mlm_head, 'head', epoch, val_loss, hub_id=self.hub_model_head)

 def _save_final(self, model: Union[nn.Module, PyTorchModelHubMixin], component: str, hub_id: str = None):
 try:
@@ -482,9 +492,10 @@ class JointModelSaveCallback(TrainerCallback):
 traceback.print_exc()

 def on_training_end(self, model: Union[torch.nn.Module, PyTorchModelHubMixin]):
-if
-
-
-
-
-
+if self.rank == 0:
+if isinstance(model, DistributedDataParallel):
+model = next(model.children())
+self._save_final(model.encoder, 'encoder', hub_id=self.hub_model_encoder)
+if not self.mlm_mode:
+self._save_final(model.decoder, 'decoder', hub_id=self.hub_model_decoder)
+self._save_final(model.mlm_head, 'head', hub_id=self.hub_model_head)
{rxnn-0.1.8.dist-info → rxnn-0.1.10.dist-info}/RECORD
CHANGED
@@ -7,9 +7,9 @@ rxnn/memory/stm.py,sha256=EsD8slSP4_9dLuq6aFPDmuFe8PWilxh90so5Z3nm-ig,2057
 rxnn/rxt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/rxt/models.py,sha256=INTFeNcqzAsjyWhNtbBHL4Tx7tYDsaQHgm72tf6u20M,6918
 rxnn/training/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rxnn/training/base.py,sha256=
-rxnn/training/bml.py,sha256=
-rxnn/training/callbacks.py,sha256=
+rxnn/training/base.py,sha256=YOtSLlG6-h0r54OJtyU777k5rNkbSCps3YFfB-Fh35g,11176
+rxnn/training/bml.py,sha256=pEH0_pDy8QThsuYgfcT2lSdfMOnqGhlhu63xMFkUSOs,15246
+rxnn/training/callbacks.py,sha256=_YfMKY_eFdc-tubhO9nYH2PXDZDQwlSI74FVOoCXpQg,22108
 rxnn/training/dataset.py,sha256=vQ5mDF3bA0HXya474n4D4iL8Mn3AEpJukgzFNVkxjGU,5106
 rxnn/training/scheduler.py,sha256=ow6oALzWjWQmHSpcJEjv6tg4g4CDMvr73TypxfcefMc,712
 rxnn/training/tokenizer.py,sha256=4Y41f07uo2KPA_7bp3FCcwGKbXoS2hsckOoXUsXfQxY,8052
@@ -23,7 +23,7 @@ rxnn/transformers/moe.py,sha256=JQ5QSX4FS7S-fqB7-s1ZmJbPpOeD_Injn8o4vo7wGQE,4936
 rxnn/transformers/positional.py,sha256=2l38RS0Dini3f6Z3LUHr3XwWzg1UK7fO2C6wazWDAYU,4292
 rxnn/transformers/sampler.py,sha256=wSz_1wNloqtuiix5w2Mcsj5NhaO9QlY0j__TVG7wJnM,3938
 rxnn/utils.py,sha256=d5U8i5ukovgDyqiycc2AoxObTz_eF_bgo2MKvdtJ98s,467
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
-rxnn-0.1.
+rxnn-0.1.10.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.1.10.dist-info/METADATA,sha256=dbmUcafrjisLl8YzU7Y9bBeSm0cJ2IaWnts8DdqWzMY,14629
+rxnn-0.1.10.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+rxnn-0.1.10.dist-info/RECORD,,
{rxnn-0.1.8.dist-info → rxnn-0.1.10.dist-info}/LICENSE
File without changes
{rxnn-0.1.8.dist-info → rxnn-0.1.10.dist-info}/WHEEL
File without changes