PyPI - rxnn - Versions diffs - 0.1.82__tar.gz → 0.2.0__tar.gz - Mend

rxnn 0.1.82tar.gz → 0.2.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

{rxnn-0.1.82 → rxnn-0.2.0}/PKG-INFO +174 -6
rxnn-0.2.0/README.md +219 -0
{rxnn-0.1.82 → rxnn-0.2.0}/pyproject.toml +1 -1
rxnn-0.2.0/src/rxnn/.DS_Store +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/attention.py +5 -0
rxnn-0.2.0/src/rxnn/memory/attention.py +42 -0
rxnn-0.2.0/src/rxnn/memory/stm.py +94 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/rxt/models.py +71 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/base.py +2 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/bml.py +2 -59
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/callbacks.py +304 -20
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/dataset.py +348 -1
rxnn-0.2.0/src/rxnn/training/models.py +142 -0
rxnn-0.2.0/src/rxnn/training/mrl.py +808 -0
rxnn-0.2.0/src/rxnn/training/reward.py +111 -0
rxnn-0.2.0/src/rxnn/training/rl.py +69 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/scheduler.py +18 -0
rxnn-0.2.0/src/rxnn/training/utils.py +148 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/attention.py +10 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/layers.py +6 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/models.py +16 -4
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/positional.py +7 -0
rxnn-0.2.0/src/rxnn/transformers/sampler.py +443 -0
rxnn-0.2.0/src/rxnn/utils.py +35 -0
rxnn-0.1.82/README.md +0 -52
rxnn-0.1.82/src/rxnn/memory/stm.py +0 -53
rxnn-0.1.82/src/rxnn/transformers/sampler.py +0 -169
rxnn-0.1.82/src/rxnn/utils.py +0 -14
{rxnn-0.1.82 → rxnn-0.2.0}/LICENSE +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/__init__.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/__init__.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/models.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/moe.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/memory/__init__.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/memory/norm.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/rxt/__init__.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/__init__.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/tokenizer.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/__init__.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/ff.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/mask.py +0 -0
{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/moe.py +0 -0

{rxnn-0.1.82 → rxnn-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: rxnn
-Version: 0.1.82
+Version: 0.2.0
 Summary: RxNN: Reactive Neural Networks Platform
 License: Apache-2.0
 Keywords: deep-learning,ai,machine-learning
@@ -23,13 +23,15 @@ Project-URL: Homepage, https://rxai.dev/rxnn
 Project-URL: Repository, https://github.com/RxAI-dev/rxnn/python
 Description-Content-Type: text/markdown
-<img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxai.webp" width="300" />
-<img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxnn.webp" width="300" />
+<span>
+  <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxai_v2.png" width="400" />
+  <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxnn_v2.png" width="400" />
+</span>
 # Reactive AI - RxNN
 ## Reactive Neural Networks Platform
-RxNN is AI/DeepLearning development platform made for Reactive Neural Networks and Event-driven AI, introduced by Reactive AI.
+RxNN is AI/Deep Learning development platform made for Reactive Neural Networks and Event-driven AI, introduced by Reactive AI.
 ## Reactive Neural Networks and Event-driven AI
 Reactive neural networks (RxNN) are a new family of memory-augmented neural networks that combine classical deep learning
@@ -61,8 +63,8 @@ We are working on three new reactive architectures, that progressively advance f
 Each new architecture is based on the previous one and adding new features/abilities. They will be progressively
 released with next versions of **RxNN** framework:
-- 0.1.x: Reactive Transformer base models, Base Model Learning (pre-training/fine-tuning) & Transformers extensions (MoE Attention, Short-Term Memory, etc.)
-- 0.2.x: Memory Reinforcement Learning (MRL) for Short-Term Memory & Reactive Transformer, Attention-based Memory System details
+- 0.1.x (Released): Reactive Transformer base models, Base Model Learning (pre-training/fine-tuning) & Transformers extensions (MoE Attention, Short-Term Memory, etc.)
+- 0.2.x (Released): Memory Reinforcement Learning (MRL) for Short-Term Memory & Reactive Transformer, Attention-based Memory System details
 - 0.3.x: Reinforcement Learning from Human Feedback for Reactive models (RxRLHF), basic Tensor Reactive
   Extensions (TRX/Rust) for full Reactive Transformer, RxT-Alpha release (+following models - RxT-Beta, etc.)
 - 0.4.x: Preactor base models, Tensor Database (TDB/Rust) for Long-Term Memory, mxRAG/revRAG subsystems
@@ -75,6 +77,172 @@ released with next versions of **RxNN** framework:
 - 1.x.x: Multimodal reactive models (could be released earlier, depending on progress)
 - 2.0.0: Real-Time Vision Reactor - Worker class models
 - x.x.x: ...and more!
+## Usage
+**RxNN** is made to train models based on reactive architectures, as well as transformer language models. Current version
+is based on PyTorch and HuggingFace libraries (Transformers/Datasets/Tokenizer/Hub), and is integrated with [HuggingFace Hub](https://huggingface.co)
+and [TensorBoard](https://github.com/tensorflow/tensorboard).
+> We are also planning a version for **TensorFlow**, more info soon
+### Install library and dependencies
+- RxNN and required deps: `pip install rxnn torch transformers tokenizers huggingface_hub`
+- Datasets are required only for training: `pip install datasets`
+- TensorBoard is optional: `pip install tensorboard`
+- [Flash Attention](https://github.com/Dao-AILab/flash-attention) is recommended for faster training/inference (required for models with explicit `use_flash_attention=True`) - check its separate [installation guide](#installing-flash-attention)
+- **NumPy** should be installed too: `pip install numpy`
+> ### Installing Flash Attention
+> Installing `flash-attn` could be very frustrating and may take hours (with standard method), only to result in some incompatibility
+> error. Fortunately, the prebuilt versions could be downloaded from GitHub and installed just in seconds. However, you should choose
+> the compatible version based on:
+> - Python version
+> - CUDA version
+> - PyTorch version (2.7 is currently not supported)
+> - ABI
+>
+> #### Steps
+> 1. Choose your version from [https://github.com/Dao-AILab/flash-attention/releases](https://github.com/Dao-AILab/flash-attention/releases)
+> 2. Download prebuilt release, in example: `wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl`
+> 3. Install it, in example: `pip install --no-dependencies --upgrade flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl`
+> 4. Verify: `flash_attn.__version__` (an incorrect version will cause the error when importing)
+>
+> #### Note on `use_flash_attention` option in models/layers
+> The explicit `use_flash_attention` option enables direct calls to `flash_attn_func` without going through **PyTorch** `scaled_dot_product_attention`. Even
+> if it's set to `False`, when the `flash-attn` library is installed, **PyTorch** will try to use it implicitly through the _SDPA backend_. It's better to set it
+> to `False` and let it be used automatically, because of better compatibility. The explicit option could be used for research.
+### Modules
+**RxNN** framework has multiple modules with models, layers, training and inference tools, made for complete development
+of _reactive models_, and could be also used for regular **Transformers**.
+#### Transformers
+Transformers module includes classes for models and layers. It includes **Reactive Transformers** as well as **Classic Transformers**
+Submodules:
+- `rxnn.transformers.attention` - basic, most common attention layers - `MultiHeadAttention`, `GroupedQueryAttention` and `MultiQueryAttention`
+  - additional attention layers, especially `SparseQueryAttention` could be found in `rxnn.experimental.attention` module
+  - `SparseQueryAttention` will be moved to `rxnn.transformers.attention` in 0.2.x version
+- `rxnn.transformers.positional` - positional encoding layers - `RotaryPositionalEmbedding` and legacy ones - `AbsolutePositionalEmbedding`/`RelativePositionalEmbedding`
+- `rxnn.transformers.ff` - dense feed forward layers, including gated layers (_SwiGLU_, etc.) - `FeedForward` & `GatedFeedForward` (recommended)
+- `rxnn.transformers.moe` - Mixture-of-Experts feed forward layers - `MoeFeedForward` & `GatedMoeFeedForward` (recommended)
+- `rxnn.transformers.layers` - complete reactive/classic transformer layers - `ReactiveTransformerLayer` & `ClassicTransformerLayer`
+- `rxnn.transformers.models` - reactive/classic transformer models - `ReactiveTransformerEncoder`, `ReactiveTransformerDecoder` & `ClassicTransformerEncoder`, `ClassicTransformerDecoder`
+- `rxnn.transformers.sampler` - samplers for reactive models (Sampler is an integral part of reactive architectures) - `Sampler`, `SampleDecoder`, `BatchSampler` & `BatchSampleDecoder`
+In **RxNN**, models are initialized in a declarative style by class composition, but then they are wrapped in imperative classes,
+to be compatible with HuggingFace **JSON** config. For example:
+```python
+from typing import TypedDict
+import torch
+import torch.nn as nn
+from huggingface_hub import PyTorchModelHubMixin
+from rxnn.transformers.attention import GroupedQueryAttention
+from rxnn.transformers.positional import RotaryPositionalEmbedding
+from rxnn.transformers.layers import ReactiveTransformerLayer
+from rxnn.transformers.models import ReactiveTransformerDecoder
+from rxnn.memory.stm import ShortTermMemory
+class YourReactiveTransformerConfig(TypedDict):
+    num_layers: int
+    vocab_size: int
+    embed_dim: int
+    ff_dim: int
+    att_heads: int
+    seq_len: int
+    stm_size: int
+    att_groups: int
+    cross_att_groups: int
+class YourReactiveTransformerDecoder(nn.Module, PyTorchModelHubMixin):
+    def __init__(
+            self,
+            config: YourReactiveTransformerConfig,
+            **kwargs
+    ):
+        super(YourReactiveTransformerDecoder, self).__init__(**kwargs)
+        embedding = nn.Embedding(config['vocab_size'], config['embed_dim'])
+        rope = RotaryPositionalEmbedding(config['embed_dim'] // config['att_heads'], config['seq_len'])
+        stm = ShortTermMemory(config['num_layers'], config['embed_dim'], config['stm_size'])
+        self.model = ReactiveTransformerDecoder(
+            stm=stm,
+            embedding=embedding,
+            own_layers=nn.ModuleList([
+                ReactiveTransformerLayer(
+                    config['embed_dim'],
+                    config['ff_dim'],
+                    use_gated=True,
+                    use_moe=False,
+                    ff_activation=nn.GELU(),
+                    ff_dropout=0.1,
+                    use_rms_norm=True,
+                    self_attention=GroupedQueryAttention(
+                        config['embed_dim'],
+                        config['att_heads'],
+                        config['att_groups'],
+                        rope=rope,
+                        dropout=0.1,
+                        max_seq_len=config['seq_len'],
+                        is_causal=True,
+                    ),
+                    memory_cross_attention=GroupedQueryAttention(
+                        config['embed_dim'],
+                        config['att_heads'],
+                        config['att_groups'],
+                        rope=rope,
+                        dropout=0.1,
+                        max_seq_len=config['seq_len'],
+                        is_causal=True,
+                        rope_only_for_query=True
+                    ),
+                ) for _ in range(config['num_layers'])
+            ])
+        )
+    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None):
+        return self.model(x, attention_mask=attention_mask)
+```
+#### Memory
+The _memory_ module includes **Short-Term Memory** and layers responsible for its update. In future versions it will also
+include **Long-Term Memory**.
+The main `ShortTermMemory` class is located in `rxnn.memory.stm` module - the usage example is in Transformers module description.
+> 0.2.x Memory modules docs in progress - will be released soon
+#### Training
+Training module includes **Trainers** for different training stages of reactive models and shared training utils.
+Submodules:
+- `rxnn.training.tokenizer` - custom Trainer for **HuggingFace** `tokenizers` and utils to load tokenizer from Hub
+  - Tokenizer could be loaded from Hub with `load_tokenizer_from_hf_hub(repo_id)`
+- `rxnn.training.dataset` - datasets for different training stages:
+  - `MaskedLMDataset` & `AutoregressiveLMDataset` are made for base models pre-training
+  - `EncoderSftDataset` & `DecoderSftDataset` are made for Interaction Supervised Fine-Tuning for reactive models
+  - `MrlCurriculumDataset` is the dataset for single MRL Curriculum step
+  - `MrlDatasets` is wrapping MRL datasets for all curriculum steps
+  - each dataset has `from_hf_hub` class method to load dataset from Hub
+  - they have also `concat_from_hf_hub` class method to load multiple Hub datasets into single training dataset
+  - if a dataset has no validation/test split, each dataset has a `get_subset(subset_size, from_start=False)` method - it
+    returns a new subset and modifies the existing one - e.g. `valid_dataset = train_dataset.get_subset(0.1)`
+  - for concatenated datasets, validation/test split could be created with `concat_from_hf_hub_with_subset` - it cuts the
+    same percentage of each loaded dataset
+- `rxnn.training.callbacks` contains Trainer callbacks for different kinds of utilities (more info below)
+- `rxnn.training.scheduler` includes learning rate scheduler for training
+- `rxnn.training.bml` - Base Model Learning module with Trainers for pre-training and fine-tuning
+- `rxnn.training.mrl` - Memory Reinforcement Learning module with Trainers for MRL
+- `rxnn.training.rxrlhf` - Reinforcement Learning from Human Feedback for Reactive Models module (from 0.3.x)
+- `rxnn.training.brl` - Behavioral Reinforcement Learning module (Reactor / from 0.7.x)
+##### Base Model Learning
+Docs in progress
                                  Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/

rxnn-0.2.0/README.md ADDED Viewed

@@ -0,0 +1,219 @@
+<span>
+  <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxai_v2.png" width="400" />
+  <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxnn_v2.png" width="400" />
+</span>
+# Reactive AI - RxNN
+## Reactive Neural Networks Platform
+RxNN is AI/Deep Learning development platform made for Reactive Neural Networks and Event-driven AI, introduced by Reactive AI.
+## Reactive Neural Networks and Event-driven AI
+Reactive neural networks (RxNN) are a new family of memory-augmented neural networks that combine classical deep learning
+algorithms with reactive communication patterns. In Event-driven AI, input data (sequence) is treated as event, and memory
+state has to be kept between events/interactions. Technically, it's a specific kind of RNN that's storing data between
+processed sequences, instead of between sequence elements like in regular RNN. Then, their recurrence is on a higher level.
+In the case of reactive communication patterns, RxRNNs are stateful reactive data sources that you have to connect before
+you can send and receive messages.
+While RxNNs are using some RNN concepts, they are rather made to extend Transformer language/multi-modal models. In our
+opinion, the biggest downside of current LLMs is their stateless nature - conversational models have to process full chat
+history on every interaction! That's not real-time processing, and it's not how human awareness works. In RxNN-based
+transformers, the model processes single messages, while the history of all previous interactions should be saved in and read
+from memory. These features are required by the **Weak** Reactive Neural Networks specification, and it will be the first major
+step in transition from language models to awareness models - in Reactive AI ecosystem, it will be introduced in Reactive
+Transformer architecture.
+Additionally, to achieve awareness, **Strong** Reactive Neural Networks are working in reactive infinite reasoning loop,
+that's generating Infinite Chain-of-Thoughts and is communicating in push-based mode (model decides if and when return output).
+Reactive communication patterns in RxNN models are adapted to handle the asynchronous nature of the model - after it finishes generating
+a sequence, it has to process it and save it in memory, but that could be done in the background.
+## Release plan
+We are working on three new reactive architectures, that progressively advance from language models to awareness models:
+- Reactive Transformer: Reactive Language Model (RLM) with Short-Term Memory
+- Preactor: extending Reactive Transformer with additional Long-Term Memory, providing theoretically infinite context (only
+  single message length is limited) and the ability to learn from interactions (Live Learning)
+- Reactor: AGI awareness model & Strong Reactive Neural Network, that's working in infinite reasoning loop and doesn't require explicit human commands
+Each new architecture is based on the previous one and adding new features/abilities. They will be progressively
+released with next versions of **RxNN** framework:
+- 0.1.x (Released): Reactive Transformer base models, Base Model Learning (pre-training/fine-tuning) & Transformers extensions (MoE Attention, Short-Term Memory, etc.)
+- 0.2.x (Released): Memory Reinforcement Learning (MRL) for Short-Term Memory & Reactive Transformer, Attention-based Memory System details
+- 0.3.x: Reinforcement Learning from Human Feedback for Reactive models (RxRLHF), basic Tensor Reactive
+  Extensions (TRX/Rust) for full Reactive Transformer, RxT-Alpha release (+following models - RxT-Beta, etc.)
+- 0.4.x: Preactor base models, Tensor Database (TDB/Rust) for Long-Term Memory, mxRAG/revRAG subsystems
+- 0.5.x: MRL for Long-Term Memory & Preactor, Live Learning for Preactor, PRx-Alpha release (+following models - PRx-Beta, etc.)
+- 0.6.x: Reactor base models, TRX full implementation, Receptors & Effectors Reactive RNNs
+- 0.7.x: Behavioral Reinforcement Learning (BRL) for Reactor's Infinite Chain-of-Thoughts, Continuous Live Learning for Reactor
+- 0.8.x: Rx-Alpha release
+- 0.9.x: Rx-Beta release
+- 1.0.0: Reactor AGI official release (Expert, Assistant & Utility class models)
+- 1.x.x: Multimodal reactive models (could be released earlier, depending on progress)
+- 2.0.0: Real-Time Vision Reactor - Worker class models
+- x.x.x: ...and more!
+## Usage
+**RxNN** is made to train models based on reactive architectures, as well as transformer language models. Current version
+is based on PyTorch and HuggingFace libraries (Transformers/Datasets/Tokenizer/Hub), and is integrated with [HuggingFace Hub](https://huggingface.co)
+and [TensorBoard](https://github.com/tensorflow/tensorboard).
+> We are also planning a version for **TensorFlow**, more info soon
+### Install library and dependencies
+- RxNN and required deps: `pip install rxnn torch transformers tokenizers huggingface_hub`
+- Datasets are required only for training: `pip install datasets`
+- TensorBoard is optional: `pip install tensorboard`
+- [Flash Attention](https://github.com/Dao-AILab/flash-attention) is recommended for faster training/inference (required for models with explicit `use_flash_attention=True`) - check its separate [installation guide](#installing-flash-attention)
+- **NumPy** should be installed too: `pip install numpy`
+> ### Installing Flash Attention
+> Installing `flash-attn` could be very frustrating and may take hours (with standard method), only to result in some incompatibility
+> error. Fortunately, the prebuilt versions could be downloaded from GitHub and installed just in seconds. However, you should choose
+> the compatible version based on:
+> - Python version
+> - CUDA version
+> - PyTorch version (2.7 is currently not supported)
+> - ABI
+>
+> #### Steps
+> 1. Choose your version from [https://github.com/Dao-AILab/flash-attention/releases](https://github.com/Dao-AILab/flash-attention/releases)
+> 2. Download prebuilt release, in example: `wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl`
+> 3. Install it, in example: `pip install --no-dependencies --upgrade flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl`
+> 4. Verify: `flash_attn.__version__` (an incorrect version will cause the error when importing)
+>
+> #### Note on `use_flash_attention` option in models/layers
+> The explicit `use_flash_attention` option enables direct calls to `flash_attn_func` without going through **PyTorch** `scaled_dot_product_attention`. Even
+> if it's set to `False`, when the `flash-attn` library is installed, **PyTorch** will try to use it implicitly through the _SDPA backend_. It's better to set it
+> to `False` and let it be used automatically, because of better compatibility. The explicit option could be used for research.
+### Modules
+**RxNN** framework has multiple modules with models, layers, training and inference tools, made for complete development
+of _reactive models_, and could be also used for regular **Transformers**.
+#### Transformers
+Transformers module includes classes for models and layers. It includes **Reactive Transformers** as well as **Classic Transformers**
+Submodules:
+- `rxnn.transformers.attention` - basic, most common attention layers - `MultiHeadAttention`, `GroupedQueryAttention` and `MultiQueryAttention`
+  - additional attention layers, especially `SparseQueryAttention` could be found in `rxnn.experimental.attention` module
+  - `SparseQueryAttention` will be moved to `rxnn.transformers.attention` in 0.2.x version
+- `rxnn.transformers.positional` - positional encoding layers - `RotaryPositionalEmbedding` and legacy ones - `AbsolutePositionalEmbedding`/`RelativePositionalEmbedding`
+- `rxnn.transformers.ff` - dense feed forward layers, including gated layers (_SwiGLU_, etc.) - `FeedForward` & `GatedFeedForward` (recommended)
+- `rxnn.transformers.moe` - Mixture-of-Experts feed forward layers - `MoeFeedForward` & `GatedMoeFeedForward` (recommended)
+- `rxnn.transformers.layers` - complete reactive/classic transformer layers - `ReactiveTransformerLayer` & `ClassicTransformerLayer`
+- `rxnn.transformers.models` - reactive/classic transformer models - `ReactiveTransformerEncoder`, `ReactiveTransformerDecoder` & `ClassicTransformerEncoder`, `ClassicTransformerDecoder`
+- `rxnn.transformers.sampler` - samplers for reactive models (Sampler is an integral part of reactive architectures) - `Sampler`, `SampleDecoder`, `BatchSampler` & `BatchSampleDecoder`
+In **RxNN**, models are initialized in a declarative style by class composition, but then they are wrapped in imperative classes,
+to be compatible with HuggingFace **JSON** config. For example:
+```python
+from typing import TypedDict
+import torch
+import torch.nn as nn
+from huggingface_hub import PyTorchModelHubMixin
+from rxnn.transformers.attention import GroupedQueryAttention
+from rxnn.transformers.positional import RotaryPositionalEmbedding
+from rxnn.transformers.layers import ReactiveTransformerLayer
+from rxnn.transformers.models import ReactiveTransformerDecoder
+from rxnn.memory.stm import ShortTermMemory
+class YourReactiveTransformerConfig(TypedDict):
+    num_layers: int
+    vocab_size: int
+    embed_dim: int
+    ff_dim: int
+    att_heads: int
+    seq_len: int
+    stm_size: int
+    att_groups: int
+    cross_att_groups: int
+class YourReactiveTransformerDecoder(nn.Module, PyTorchModelHubMixin):
+    def __init__(
+            self,
+            config: YourReactiveTransformerConfig,
+            **kwargs
+    ):
+        super(YourReactiveTransformerDecoder, self).__init__(**kwargs)
+        embedding = nn.Embedding(config['vocab_size'], config['embed_dim'])
+        rope = RotaryPositionalEmbedding(config['embed_dim'] // config['att_heads'], config['seq_len'])
+        stm = ShortTermMemory(config['num_layers'], config['embed_dim'], config['stm_size'])
+        self.model = ReactiveTransformerDecoder(
+            stm=stm,
+            embedding=embedding,
+            own_layers=nn.ModuleList([
+                ReactiveTransformerLayer(
+                    config['embed_dim'],
+                    config['ff_dim'],
+                    use_gated=True,
+                    use_moe=False,
+                    ff_activation=nn.GELU(),
+                    ff_dropout=0.1,
+                    use_rms_norm=True,
+                    self_attention=GroupedQueryAttention(
+                        config['embed_dim'],
+                        config['att_heads'],
+                        config['att_groups'],
+                        rope=rope,
+                        dropout=0.1,
+                        max_seq_len=config['seq_len'],
+                        is_causal=True,
+                    ),
+                    memory_cross_attention=GroupedQueryAttention(
+                        config['embed_dim'],
+                        config['att_heads'],
+                        config['att_groups'],
+                        rope=rope,
+                        dropout=0.1,
+                        max_seq_len=config['seq_len'],
+                        is_causal=True,
+                        rope_only_for_query=True
+                    ),
+                ) for _ in range(config['num_layers'])
+            ])
+        )
+    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None):
+        return self.model(x, attention_mask=attention_mask)
+```
+#### Memory
+The _memory_ module includes **Short-Term Memory** and layers responsible for its update. In future versions it will also
+include **Long-Term Memory**.
+The main `ShortTermMemory` class is located in `rxnn.memory.stm` module - the usage example is in Transformers module description.
+> 0.2.x Memory modules docs in progress - will be released soon
+#### Training
+Training module includes **Trainers** for different training stages of reactive models and shared training utils.
+Submodules:
+- `rxnn.training.tokenizer` - custom Trainer for **HuggingFace** `tokenizers` and utils to load tokenizer from Hub
+  - Tokenizer could be loaded from Hub with `load_tokenizer_from_hf_hub(repo_id)`
+- `rxnn.training.dataset` - datasets for different training stages:
+  - `MaskedLMDataset` & `AutoregressiveLMDataset` are made for base models pre-training
+  - `EncoderSftDataset` & `DecoderSftDataset` are made for Interaction Supervised Fine-Tuning for reactive models
+  - `MrlCurriculumDataset` is the dataset for single MRL Curriculum step
+  - `MrlDatasets` is wrapping MRL datasets for all curriculum steps
+  - each dataset has `from_hf_hub` class method to load dataset from Hub
+  - they have also `concat_from_hf_hub` class method to load multiple Hub datasets into single training dataset
+  - if a dataset has no validation/test split, each dataset has a `get_subset(subset_size, from_start=False)` method - it
+    returns a new subset and modifies the existing one - e.g. `valid_dataset = train_dataset.get_subset(0.1)`
+  - for concatenated datasets, validation/test split could be created with `concat_from_hf_hub_with_subset` - it cuts the
+    same percentage of each loaded dataset
+- `rxnn.training.callbacks` contains Trainer callbacks for different kinds of utilities (more info below)
+- `rxnn.training.scheduler` includes learning rate scheduler for training
+- `rxnn.training.bml` - Base Model Learning module with Trainers for pre-training and fine-tuning
+- `rxnn.training.mrl` - Memory Reinforcement Learning module with Trainers for MRL
+- `rxnn.training.rxrlhf` - Reinforcement Learning from Human Feedback for Reactive Models module (from 0.3.x)
+- `rxnn.training.brl` - Behavioral Reinforcement Learning module (Reactor / from 0.7.x)
+##### Base Model Learning
+Docs in progress

{rxnn-0.1.82 → rxnn-0.2.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "rxnn"
-version = "0.1.82"
+version = "0.2.0"
 description = "RxNN: Reactive Neural Networks Platform"
 license = "Apache-2.0"

rxnn-0.2.0/src/rxnn/.DS_Store ADDED Viewed

Binary file

{rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/attention.py RENAMED Viewed

@@ -287,6 +287,7 @@ class SparseQueryAttention(MultiHeadAttention):
             k = self.k_proj(key).view(b, -1, self.num_groups, head_dim).transpose(1, 2)
             v = self.v_proj(value).view(b, -1, self.num_groups, head_dim).transpose(1, 2)
         else:
+            # Relative embedding version is not working without this strange mapping - it will be removed in next versions
             group_heads = self.num_heads // self.num_groups
             query_heads = self.num_heads // self.num_query_groups
             # Process Q
@@ -457,6 +458,7 @@ def init_experimental_attention(
         dropout: float = 0.0,
         rope: RotaryPositionalEmbedding = None,
         rope_only_for_query: bool = False,
+        rope_only_for_keys: bool = False,
         use_relative_embeddings: bool = False,
         max_seq_len: int = 1024,
         use_flash_attention: bool = False,
@@ -478,6 +480,7 @@ def init_experimental_attention(
             use_relative_embeddings=use_relative_embeddings,
             max_seq_len=max_seq_len,
             rope_only_for_query=rope_only_for_query,
+            rope_only_for_keys=rope_only_for_keys,
             use_flash_attention=use_flash_attention,
             is_causal=is_causal,
             use_bias=use_bias,
@@ -493,6 +496,7 @@ def init_experimental_attention(
             use_relative_embeddings=use_relative_embeddings,
             max_seq_len=max_seq_len,
             rope_only_for_query=rope_only_for_query,
+            rope_only_for_keys=rope_only_for_keys,
             use_flash_attention=use_flash_attention,
             is_causal=is_causal,
             use_bias=use_bias,
@@ -511,6 +515,7 @@ def init_experimental_attention(
             use_relative_embeddings=use_relative_embeddings,
             max_seq_len=max_seq_len,
             rope_only_for_query=rope_only_for_query,
+            rope_only_for_keys=rope_only_for_keys,
             use_flash_attention=use_flash_attention,
             is_causal=is_causal,
             use_bias=use_bias,

rxnn-0.2.0/src/rxnn/memory/attention.py ADDED Viewed

@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+from .stm import ShortTermMemory
class StmMemoryAttention(nn.Module):
    """Update every short-term memory (STM) layer by attending over encoded data.

    For each layer the current STM state is normalized and passed as the query
    to that layer's attention module, with the layer's encoded data as both key
    and value. The attention output is added to the un-normalized STM state
    (residual connection) and the result is written back to the STM.

    Args:
        stm: Memory module. It must expose a ``memory`` tensor whose first
            dimension is the layer axis, return the state of layer ``i`` when
            called as ``stm(i)``, and provide ``update_all(new_memory)``.
        attention_layers: One attention module per STM layer, called as
            ``layer(query, key, value, mask=mask)``.
        memory_norm_layers: One normalization module per STM layer.
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(
            self,
            stm: "ShortTermMemory",
            attention_layers: nn.ModuleList,
            memory_norm_layers: nn.ModuleList,
            *args,
            **kwargs
    ):
        super(StmMemoryAttention, self).__init__(*args, **kwargs)
        self.stm = stm
        self.attention_layers = attention_layers
        self.memory_norm_layers = memory_norm_layers
        assert len(self.attention_layers) == len(self.memory_norm_layers) == self.stm.memory.size(0), \
            'Number of attention layers, memory norm layers and STM layers must be equal'
        self.num_layers = len(attention_layers)

    def update_max_len(self, max_seq_len: int):
        """Propagate a new maximum sequence length to every layer that has a RoPE module."""
        for attention in self.attention_layers:
            if attention.rope is not None:
                attention.rope.update_max_len(max_seq_len)

    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
        """Compute the new STM state from per-layer encoded data and store it.

        Args:
            x: Encoded data indexed by layer on dimension 0 (``x[i]`` is the
                input for STM layer ``i``).
                NOTE(review): each ``x[i]`` is assumed to be
                (batch, seq_len, embed_dim) - confirm against the encoder.
            attention_mask: Optional mask; two singleton dimensions are
                inserted after dimension 0 and it is cast to bool before being
                handed to the attention layers.

        Returns:
            The updated ``stm.memory`` tensor.
        """
        mask = attention_mask.unsqueeze(1).unsqueeze(1).bool() if attention_mask is not None else None
        new_stm = torch.zeros_like(self.stm.memory)
        for i, (attention, norm) in enumerate(zip(self.attention_layers, self.memory_norm_layers)):
            layer_stm = self.stm(i)
            encoded_layer_data = x[i]
            # Expand layer STM to the batch size if it's not in batch mode. The
            # batch size has to come from the per-layer slice: dimension 0 of
            # ``x`` itself is the layer axis, not the batch axis.
            if layer_stm.size(0) == 1:
                layer_stm = layer_stm.expand(encoded_layer_data.size(0), -1, -1)
            normalized_layer_stm = norm(layer_stm)
            new_layer_stm = attention(normalized_layer_stm, encoded_layer_data, encoded_layer_data, mask=mask)
            new_stm[i] = new_layer_stm + layer_stm  # residual
        self.stm.update_all(new_stm)
        return self.stm.memory

rxnn-0.2.0/src/rxnn/memory/stm.py ADDED Viewed

@@ -0,0 +1,94 @@
+import torch
+import torch.nn as nn
class ShortTermMemory(nn.Module):
    """Short-term memory module for the Attention-based Memory System.

    The state lives in ``self.memory``, which is either a registered buffer
    (default) or an ``nn.Parameter`` (when ``is_trainable`` is true). Its
    shape is ``(num_layers, stm_size, embed_dim)`` in legacy mode and
    ``(num_layers, batch_size, stm_size, embed_dim)`` otherwise.

    Args:
        num_layers: Number of memory layers.
        embed_dim: Size of the last memory dimension.
        stm_size: Number of memory slots per layer.
        init_type: One of "normal", "standard", "uniform", "ones", "zeros".
        is_trainable: Store the memory as a parameter instead of a buffer.
        legacy_init: Temporary option to load old models with not-batched STM
            (they will be loaded, updated and then the option will be removed).
        *args, **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    _INIT_TYPES = ('normal', 'standard', 'uniform', 'ones', 'zeros')
    _INIT_TYPE_ERROR = 'STM init type must be one of "normal", "standard", "uniform", "ones", "zeros"'

    def __init__(self, num_layers: int, embed_dim: int, stm_size: int, init_type: str = 'normal',
                 is_trainable: bool = False, legacy_init: bool = True, *args, **kwargs):
        super(ShortTermMemory, self).__init__(*args, **kwargs)
        self.num_layers = num_layers
        self.embed_dim = embed_dim
        self.stm_size = stm_size
        # 1 is the initial batch size (normal for inference/pre-training);
        # bigger batches are for RL stages.
        self.batch_size = 1
        self.is_trainable = is_trainable
        assert init_type in self._INIT_TYPES, self._INIT_TYPE_ERROR
        self.init_type = init_type
        # Must be assigned before the first _init_tensor() call, which reads it
        # to pick the memory shape.
        self.legacy_init = legacy_init
        self._set_memory(self._init_tensor())

    def _init_tensor(self, init_type: str = None):
        """Create a fresh memory tensor with the current shape settings.

        ``init_type`` falls back to ``self.init_type``; any value outside the
        named distributions yields zeros.
        """
        init_type = init_type or self.init_type
        stm_shape = (self.num_layers, self.stm_size, self.embed_dim) \
            if self.legacy_init else (self.num_layers, self.batch_size, self.stm_size, self.embed_dim)
        if init_type == 'normal':
            return torch.normal(0, 0.02, stm_shape)
        elif init_type == 'standard':
            return torch.normal(0, 1, stm_shape)
        elif init_type == 'uniform':
            return torch.rand(*stm_shape) * 0.02
        elif init_type == 'ones':
            return torch.ones(*stm_shape)
        else:
            return torch.zeros(*stm_shape)

    def _set_memory(self, new_memory: torch.Tensor):
        """Install ``new_memory`` as ``self.memory``, keeping its storage kind.

        The tensor is moved to the device/dtype of the memory it replaces and
        is registered as a parameter or a buffer according to
        ``is_trainable``. A plain attribute assignment would raise for a
        parameter. Note that a trainable memory is replaced by a new
        ``nn.Parameter`` object.
        """
        current = getattr(self, 'memory', None)
        if current is not None:
            new_memory = new_memory.to(device=current.device, dtype=current.dtype)
            del self.memory
        if self.is_trainable:
            self.memory = nn.Parameter(new_memory)
        else:
            self.register_buffer('memory', new_memory)

    def _use_init_type(self, init_type: str = None):
        """Validate and store ``init_type`` as the new default, if one is given."""
        if init_type is not None:
            assert init_type in self._INIT_TYPES, self._INIT_TYPE_ERROR
            self.init_type = init_type

    def reset_legacy_(self):
        """Leave legacy mode and re-initialize the memory in the batched layout."""
        self.legacy_init = False
        self._set_memory(self._init_tensor())

    def forward(self, layer: int) -> torch.Tensor:
        """Return the memory of ``layer``; legacy memory gets a leading dimension of size 1."""
        return self.memory[layer].unsqueeze(0) if self.legacy_init else self.memory[layer]

    def update_layer(self, layer: int, new_stm: torch.Tensor):
        """Overwrite the memory of a single layer in place."""
        self.memory[layer] = new_stm

    def update_all(self, new_stm: torch.Tensor):
        """Overwrite the whole memory in place with ``new_stm``."""
        self.memory.copy_(new_stm)

    def make_trainable(self):
        """Convert the memory buffer to a parameter, keeping its values."""
        if not self.is_trainable:
            self.is_trainable = True
            initial_stm = self.memory.detach().clone()
            del self.memory
            self.memory = nn.Parameter(initial_stm)

    def freeze(self):
        """Convert the memory parameter back to a buffer, keeping its values."""
        if self.is_trainable:
            trained_stm = self.memory.detach().clone()
            del self.memory
            # Clear the flag so that make_trainable() can be used again later.
            self.is_trainable = False
            self.register_buffer('memory', trained_stm)

    def reset(self, init_type: str = None):
        """Re-initialize the memory, optionally with a one-off ``init_type``."""
        self._set_memory(self._init_tensor(init_type))

    def resize(self, new_stm_size: int, init_type: str = None):
        """Change the number of memory slots and re-initialize the memory."""
        self.stm_size = new_stm_size
        self._set_memory(self._init_tensor(init_type))

    def batched_memory(self, batch_size: int, init_type: str = None):
        """Switch to ``batch_size`` memories and re-initialize them.

        A given ``init_type`` also becomes the new default init type.
        """
        self._use_init_type(init_type)
        self.batch_size = batch_size
        self._set_memory(self._init_tensor())

    def single_memory(self, init_type: str = None, use_mean_from_batch: bool = False):
        """Switch back to a single memory (batch size 1).

        Args:
            init_type: Optional new default init type.
            use_mean_from_batch: Keep the mean of the current memories over
                the batch dimension instead of re-initializing.
        """
        self._use_init_type(init_type)
        self.batch_size = 1
        if use_mean_from_batch:
            # Only the batched layout has a batch axis (dim 1). Legacy memory
            # is already a single memory, so its values are kept unchanged.
            if self.memory.dim() == 4:
                self._set_memory(self.memory.mean(dim=1, keepdim=True))
        else:
            self._set_memory(self._init_tensor())

rxnn 0.1.82__tar.gz → 0.2.0__tar.gz

rxnn 0.1.82tar.gz → 0.2.0tar.gz