rxnn 0.1.82__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {rxnn-0.1.82 → rxnn-0.2.0}/PKG-INFO +174 -6
  2. rxnn-0.2.0/README.md +219 -0
  3. {rxnn-0.1.82 → rxnn-0.2.0}/pyproject.toml +1 -1
  4. rxnn-0.2.0/src/rxnn/.DS_Store +0 -0
  5. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/attention.py +5 -0
  6. rxnn-0.2.0/src/rxnn/memory/attention.py +42 -0
  7. rxnn-0.2.0/src/rxnn/memory/stm.py +94 -0
  8. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/rxt/models.py +71 -0
  9. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/base.py +2 -0
  10. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/bml.py +2 -59
  11. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/callbacks.py +304 -20
  12. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/dataset.py +348 -1
  13. rxnn-0.2.0/src/rxnn/training/models.py +142 -0
  14. rxnn-0.2.0/src/rxnn/training/mrl.py +808 -0
  15. rxnn-0.2.0/src/rxnn/training/reward.py +111 -0
  16. rxnn-0.2.0/src/rxnn/training/rl.py +69 -0
  17. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/scheduler.py +18 -0
  18. rxnn-0.2.0/src/rxnn/training/utils.py +148 -0
  19. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/attention.py +10 -0
  20. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/layers.py +6 -0
  21. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/models.py +16 -4
  22. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/positional.py +7 -0
  23. rxnn-0.2.0/src/rxnn/transformers/sampler.py +443 -0
  24. rxnn-0.2.0/src/rxnn/utils.py +35 -0
  25. rxnn-0.1.82/README.md +0 -52
  26. rxnn-0.1.82/src/rxnn/memory/stm.py +0 -53
  27. rxnn-0.1.82/src/rxnn/transformers/sampler.py +0 -169
  28. rxnn-0.1.82/src/rxnn/utils.py +0 -14
  29. {rxnn-0.1.82 → rxnn-0.2.0}/LICENSE +0 -0
  30. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/__init__.py +0 -0
  31. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/__init__.py +0 -0
  32. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/models.py +0 -0
  33. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/experimental/moe.py +0 -0
  34. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/memory/__init__.py +0 -0
  35. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/memory/norm.py +0 -0
  36. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/rxt/__init__.py +0 -0
  37. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/__init__.py +0 -0
  38. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/training/tokenizer.py +0 -0
  39. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/__init__.py +0 -0
  40. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/ff.py +0 -0
  41. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/mask.py +0 -0
  42. {rxnn-0.1.82 → rxnn-0.2.0}/src/rxnn/transformers/moe.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rxnn
3
- Version: 0.1.82
3
+ Version: 0.2.0
4
4
  Summary: RxNN: Reactive Neural Networks Platform
5
5
  License: Apache-2.0
6
6
  Keywords: deep-learning,ai,machine-learning
@@ -23,13 +23,15 @@ Project-URL: Homepage, https://rxai.dev/rxnn
23
23
  Project-URL: Repository, https://github.com/RxAI-dev/rxnn/python
24
24
  Description-Content-Type: text/markdown
25
25
 
26
- <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxai.webp" width="300" />
27
- <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxnn.webp" width="300" />
26
+ <span>
27
+ <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxai_v2.png" width="400" />
28
+ <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxnn_v2.png" width="400" />
29
+ </span>
28
30
 
29
31
  # Reactive AI - RxNN
30
32
  ## Reactive Neural Networks Platform
31
33
 
32
- RxNN is AI/DeepLearning development platform made for Reactive Neural Networks and Event-driven AI, introduced by Reactive AI.
34
+ RxNN is an AI/Deep Learning development platform made for Reactive Neural Networks and Event-driven AI, introduced by Reactive AI.
33
35
 
34
36
  ## Reactive Neural Networks and Event-driven AI
35
37
  Reactive neural networks (RxNN) are a new family of memory-augmented neural networks that combine classical deep learning
@@ -61,8 +63,8 @@ We are working on three new reactive architectures, that progressively advance f
61
63
 
62
64
  Each new architecture is based on the previous one and adding new features/abilities. They will be progressively
63
65
  released with next versions of **RxNN** framework:
64
- - 0.1.x: Reactive Transformer base models, Base Model Learning (pre-training/fine-tuning) & Transformers extensions (MoE Attention, Short-Term Memory, etc.)
65
- - 0.2.x: Memory Reinforcement Learning (MRL) for Short-Term Memory & Reactive Transformer, Attention-based Memory System details
66
+ - 0.1.x (Released): Reactive Transformer base models, Base Model Learning (pre-training/fine-tuning) & Transformers extensions (MoE Attention, Short-Term Memory, etc.)
67
+ - 0.2.x (Released): Memory Reinforcement Learning (MRL) for Short-Term Memory & Reactive Transformer, Attention-based Memory System details
66
68
  - 0.3.x: Reinforcement Learning from Human Feedback for Reactive models (RxRLHF), basic Tensor Reactive
67
69
  Extensions (TRX/Rust) for full Reactive Transformer, RxT-Alpha release (+following models - RxT-Beta, etc.)
68
70
  - 0.4.x: Preactor base models, Tensor Database (TDB/Rust) for Long-Term Memory, mxRAG/revRAG subsystems
@@ -75,6 +77,172 @@ released with next versions of **RxNN** framework:
75
77
  - 1.x.x: Multimodal reactive models (could be released earlier, depending on progress)
76
78
  - 2.0.0: Real-Time Vision Reactor - Worker class models
77
79
  - x.x.x: ...and more!
80
+
81
+ ## Usage
82
+ **RxNN** is made to train models based on reactive architectures, as well as transformer language models. Current version
83
+ is based on PyTorch and HuggingFace libraries (Transformers/Datasets/Tokenizer/Hub), and is integrated with [HuggingFace Hub](https://huggingface.co)
84
+ and [TensorBoard](https://github.com/tensorflow/tensorboard).
85
+
86
+ > We are also planning a version for **TensorFlow**, more info soon
87
+
88
+ ### Install library and dependencies
89
+ - RxNN and required deps: `pip install rxnn torch transformers tokenizers huggingface_hub`
90
+ - Datasets are required only for training: `pip install datasets`
91
+ - TensorBoard is optional: `pip install tensorboard`
92
+ - [Flash Attention](https://github.com/Dao-AILab/flash-attention) is recommended for faster training/inference (required for models with explicit `use_flash_attention=True`) - check its separate [installation guide](#installing-flash-attention)
93
+ - **NumPy** should be installed too: `pip install numpy`
94
+
95
+ > ### Installing Flash Attention
96
+ > Installing `flash-attn` could be very frustrating and may take hours (with standard method), only to result in some incompatibility
97
+ > error. Fortunately, the prebuilt versions could be downloaded from GitHub and installed just in seconds. However, you should choose
98
+ > the compatible version based on:
99
+ > - Python version
100
+ > - CUDA version
101
+ > - PyTorch version (2.7 is currently not supported)
102
+ > - ABI
103
+ >
104
+ > #### Steps
105
+ > 1. Choose your version from [https://github.com/Dao-AILab/flash-attention/releases](https://github.com/Dao-AILab/flash-attention/releases)
106
+ > 2. Download prebuilt release, in example: `wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl`
107
+ > 3. Install it, in example: `pip install --no-dependencies --upgrade flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl`
108
+ > 4. Verify: `flash_attn.__version__` (an incorrect version will cause the error when importing)
109
+ >
110
+ > #### Note on `use_flash_attention` option in models/layers
111
+ > Explicit `use_flash_attention` option is made to enable direct calls to `flash_attn_func` without using **PyTorch** `scaled_dot_product_attention`. Even
112
+ > if it's set to `False`, when `flash-attn` library is installed, **PyTorch** will try to use it implicitly through _SDPA backend_. It's better to set it
113
+ > to `False` and use it automatically, because of better compatibility. Explicit options could be used for research
114
+
115
+ ### Modules
116
+ **RxNN** framework has multiple modules with models, layers, training and inference tools, made for complete development
117
+ of _reactive models_, and could be also used for regular **Transformers**.
118
+
119
+ #### Transformers
120
+ Transformers module includes classes for models and layers. It includes **Reactive Transformers** as well as **Classic Transformers**
121
+
122
+ Submodules:
123
+ - `rxnn.transformers.attention` - basic, most common attention layers - `MultiHeadAttention`, `GroupedQueryAttention` and `MultiQueryAttention`
124
+ - additional attention layers, especially `SparseQueryAttention` could be found in `rxnn.experimental.attention` module
125
+ - `SparseQueryAttention` will be moved to `rxnn.transformers.attention` in 0.2.x version
126
+ - `rxnn.transformers.positional` - positional encoding layers - `RotaryPositionalEmbedding` and legacy ones - `AbsolutePositionalEmbedding`/`RelativePositionalEmbedding`
127
+ - `rxnn.transformers.ff` - dense feed forward layers, including gated layers (_SwiGLU_, etc.) - `FeedForward` & `GatedFeedForward` (recommended)
128
+ - `rxnn.transformers.moe` - Mixture-of-Experts feed forward layers - `MoeFeedForward` & `GatedMoeFeedForward` (recommended)
129
+ - `rxnn.transformers.layers` - complete reactive/classic transformer layers - `ReactiveTransformerLayer` & `ClassicTransformerLayer`
130
+ - `rxnn.transformers.models` - reactive/classic transformer models - `ReactiveTransformerEncoder`, `ReactiveTransformerDecoder` & `ClassicTransformerEncoder`, `ClassicTransformerDecoder`
131
+ - `rxnn.transformers.sampler` - samplers for reactive models (Sampler is an integral part of reactive architectures) - `Sampler`, `SampleDecoder`, `BatchSampler` & `BatchSampleDecoder`
132
+
133
+ In **RxNN** models are initialized in declarative style by class composition, but then they are wrapped in imperative classes,
134
+ to be compatible with HuggingFace **JSON** config. For example:
135
+
136
+ ```python
137
+ from typing import TypedDict
138
+ import torch
139
+ import torch.nn as nn
140
+ from huggingface_hub import PyTorchModelHubMixin
141
+ from rxnn.transformers.attention import GroupedQueryAttention
142
+ from rxnn.transformers.positional import RotaryPositionalEmbedding
143
+ from rxnn.transformers.layers import ReactiveTransformerLayer
144
+ from rxnn.transformers.models import ReactiveTransformerDecoder
145
+ from rxnn.memory.stm import ShortTermMemory
146
+
147
+ class YourReactiveTransformerConfig(TypedDict):
148
+ num_layers: int
149
+ vocab_size: int
150
+ embed_dim: int
151
+ ff_dim: int
152
+ att_heads: int
153
+ seq_len: int
154
+ stm_size: int
155
+ att_groups: int
156
+ cross_att_groups: int
157
+
158
+
159
+ class YourReactiveTransformerDecoder(nn.Module, PyTorchModelHubMixin):
160
+ def __init__(
161
+ self,
162
+ config: YourReactiveTransformerConfig,
163
+ **kwargs
164
+ ):
165
+ super(YourReactiveTransformerDecoder, self).__init__(**kwargs)
166
+
167
+ embedding = nn.Embedding(config['vocab_size'], config['embed_dim'])
168
+ rope = RotaryPositionalEmbedding(config['embed_dim'] // config['att_heads'], config['seq_len'])
169
+ stm = ShortTermMemory(config['num_layers'], config['embed_dim'], config['stm_size'])
170
+
171
+ self.model = ReactiveTransformerDecoder(
172
+ stm=stm,
173
+ embedding=embedding,
174
+ own_layers=nn.ModuleList([
175
+ ReactiveTransformerLayer(
176
+ config['embed_dim'],
177
+ config['ff_dim'],
178
+ use_gated=True,
179
+ use_moe=False,
180
+ ff_activation=nn.GELU(),
181
+ ff_dropout=0.1,
182
+ use_rms_norm=True,
183
+ self_attention=GroupedQueryAttention(
184
+ config['embed_dim'],
185
+ config['att_heads'],
186
+ config['att_groups'],
187
+ rope=rope,
188
+ dropout=0.1,
189
+ max_seq_len=config['seq_len'],
190
+ is_causal=True,
191
+ ),
192
+ memory_cross_attention=GroupedQueryAttention(
193
+ config['embed_dim'],
194
+ config['att_heads'],
195
+ config['att_groups'],
196
+ rope=rope,
197
+ dropout=0.1,
198
+ max_seq_len=config['seq_len'],
199
+ is_causal=True,
200
+ rope_only_for_query=True
201
+ ),
202
+ ) for _ in range(config['num_layers'])
203
+ ])
204
+ )
205
+
206
+ def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None):
207
+ return self.model(x, attention_mask=attention_mask)
208
+ ```
209
+
210
+ #### Memory
211
+ The _memory_ module includes **Short-Term Memory** and layers responsible for its update. In future versions it will also
212
+ include **Long-Term Memory**.
213
+
214
+ The main `ShortTermMemory` class is located in `rxnn.memory.stm` module - the usage example is in Transformers module description.
215
+
216
+ > 0.2.x Memory modules docs in progress - will be released soon
217
+
218
+ #### Training
219
+ Training module includes **Trainers** for different training stages of reactive models and shared training utils.
220
+
221
+ Submodules:
222
+ - `rxnn.training.tokenizer` - custom Trainer for **HuggingFace** `tokenizers` and utils to load tokenizer from Hub
223
+ - Tokenizer could be loaded from Hub with `load_tokenizer_from_hf_hub(repo_id)`
224
+ - `rxnn.training.dataset` - datasets for different training stages:
225
+ - `MaskedLMDataset` & `AutoregressiveLMDataset` are made for base models pre-training
226
+ - `EncoderSftDataset` & `DecoderSftDataset` are made for Interaction Supervised Fine-Tuning for reactive models
227
+ - `MrlCurriculumDataset` is the dataset for single MRL Curriculum step
228
+ - `MrlDatasets` is wrapping MRL datasets for all curriculum steps
229
+ - each dataset has `from_hf_hub` class method to load dataset from Hub
230
+ - they have also `concat_from_hf_hub` class method to load multiple Hub datasets into single training dataset
231
+ - if dataset has no validation/test split, each dataset has `get_subset(subset_size, from_start=False)` method - it
232
+ returns a new subset and modifies the existing one - i.e. `valid_dataset = train_dataset.get_subset(0.1)`
233
+ - for concatenated datasets, validation/test split could be created with `concat_from_hf_hub_with_subset` - it cuts the
234
+ same percentage of each loaded dataset
235
+ - `rxnn.training.callbacks` contains Trainer callbacks for different kinds of utilities (more info below)
236
+ - `rxnn.training.scheduler` includes learning rate scheduler for training
237
+ - `rxnn.training.bml` - Base Model Learning module with Trainers for pre-training and fine-tuning
238
+ - `rxnn.training.mrl` - Memory Reinforcement Learning module with Trainers for MRL
239
+ - `rxnn.training.rxrlhf` - Reinforcement Learning from Human Feedback for Reactive Models module (from 0.3.x)
240
+ - `rxnn.training.brl` - Behavioral Reinforcement Learning module (Reactor / from 0.7.x)
241
+
242
+ ##### Base Model Learning
243
+ Docs in progress
244
+
245
+
78
246
  Apache License
79
247
  Version 2.0, January 2004
80
248
  http://www.apache.org/licenses/
rxnn-0.2.0/README.md ADDED
@@ -0,0 +1,219 @@
1
+ <span>
2
+ <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxai_v2.png" width="400" />
3
+ <img src="https://raw.githubusercontent.com/RxAI-dev/RxNN/refs/heads/main/assets/logo/logo_rxnn_v2.png" width="400" />
4
+ </span>
5
+
6
+ # Reactive AI - RxNN
7
+ ## Reactive Neural Networks Platform
8
+
9
+ RxNN is an AI/Deep Learning development platform made for Reactive Neural Networks and Event-driven AI, introduced by Reactive AI.
10
+
11
+ ## Reactive Neural Networks and Event-driven AI
12
+ Reactive neural networks (RxNN) are a new family of memory-augmented neural networks that combine classical deep learning
13
+ algorithms with reactive communication patterns. In Event-driven AI, input data (sequence) is treated as event, and memory
14
+ state has to be kept between events/interactions. Technically, it's a specific kind of RNN that's storing data between
15
+ processed sequences, instead of between sequence elements like in regular RNN. Then, their recurrence is on a higher level.
16
+ In the case of reactive communication patterns, RxNNs are stateful reactive data sources that you have to connect before
17
+ you can send and receive messages.
18
+ While RxNNs are using some RNN concepts, they are rather made to extend Transformer language/multi-modal models. In our
19
+ opinion, the biggest downside of current LLMs is their stateless nature - conversational models have to process full chat
20
+ history on every interaction! That's not real-time processing, and it's not how human awareness works. In RxNN-based
21
+ transformers, model is processing single messages, while all the previous interactions history should be saved and read
22
+ from memory. Those features are required for the **Weak** Reactive Neural Networks specification, and it will be the first major
23
+ step in transition from language models to awareness models - in Reactive AI ecosystem, it will be introduced in Reactive
24
+ Transformer architecture.
25
+
26
+ Additionally, to achieve awareness, **Strong** Reactive Neural Networks are working in reactive infinite reasoning loop,
27
+ that's generating Infinite Chain-of-Thoughts and is communicating in push-based mode (the model decides if and when to return output).
28
+
29
+ Reactive communication patterns in RxNN models are adapted to handle the asynchronous nature of the model - after it finishes generating a
30
+ sequence, it has to process it and save it in memory, but it could be done in background.
31
+
32
+ ## Release plan
33
+ We are working on three new reactive architectures, that progressively advance from language models to awareness models:
34
+ - Reactive Transformer: Reactive Language Model (RLM) with Short-Term Memory
35
+ - Preactor: extending Reactive Transformer with additional Long-Term Memory, providing theoretically infinite context (only
36
+ single message length is limited) and the ability to learn from interactions (Live Learning)
37
+ - Reactor: AGI awareness model & Strong Reactive Neural Network, that's working in infinite reasoning loop and doesn't require explicit human commands
38
+
39
+ Each new architecture is based on the previous one and adding new features/abilities. They will be progressively
40
+ released with next versions of **RxNN** framework:
41
+ - 0.1.x (Released): Reactive Transformer base models, Base Model Learning (pre-training/fine-tuning) & Transformers extensions (MoE Attention, Short-Term Memory, etc.)
42
+ - 0.2.x (Released): Memory Reinforcement Learning (MRL) for Short-Term Memory & Reactive Transformer, Attention-based Memory System details
43
+ - 0.3.x: Reinforcement Learning from Human Feedback for Reactive models (RxRLHF), basic Tensor Reactive
44
+ Extensions (TRX/Rust) for full Reactive Transformer, RxT-Alpha release (+following models - RxT-Beta, etc.)
45
+ - 0.4.x: Preactor base models, Tensor Database (TDB/Rust) for Long-Term Memory, mxRAG/revRAG subsystems
46
+ - 0.5.x: MRL for Long-Term Memory & Preactor, Live Learning for Preactor, PRx-Alpha release (+following models - PRx-Beta, etc.)
47
+ - 0.6.x: Reactor base models, TRX full implementation, Receptors & Effectors Reactive RNNs
48
+ - 0.7.x: Behavioral Reinforcement Learning (BRL) for Reactor's Infinite Chain-of-Thoughts, Continuous Live Learning for Reactor
49
+ - 0.8.x: Rx-Alpha release
50
+ - 0.9.x: Rx-Beta release
51
+ - 1.0.0: Reactor AGI official release (Expert, Assistant & Utility class models)
52
+ - 1.x.x: Multimodal reactive models (could be released earlier, depending on progress)
53
+ - 2.0.0: Real-Time Vision Reactor - Worker class models
54
+ - x.x.x: ...and more!
55
+
56
+ ## Usage
57
+ **RxNN** is made to train models based on reactive architectures, as well as transformer language models. Current version
58
+ is based on PyTorch and HuggingFace libraries (Transformers/Datasets/Tokenizer/Hub), and is integrated with [HuggingFace Hub](https://huggingface.co)
59
+ and [TensorBoard](https://github.com/tensorflow/tensorboard).
60
+
61
+ > We are also planning a version for **TensorFlow**, more info soon
62
+
63
+ ### Install library and dependencies
64
+ - RxNN and required deps: `pip install rxnn torch transformers tokenizers huggingface_hub`
65
+ - Datasets are required only for training: `pip install datasets`
66
+ - TensorBoard is optional: `pip install tensorboard`
67
+ - [Flash Attention](https://github.com/Dao-AILab/flash-attention) is recommended for faster training/inference (required for models with explicit `use_flash_attention=True`) - check its separate [installation guide](#installing-flash-attention)
68
+ - **NumPy** should be installed too: `pip install numpy`
69
+
70
+ > ### Installing Flash Attention
71
+ > Installing `flash-attn` could be very frustrating and may take hours (with standard method), only to result in some incompatibility
72
+ > error. Fortunately, the prebuilt versions could be downloaded from GitHub and installed just in seconds. However, you should choose
73
+ > the compatible version based on:
74
+ > - Python version
75
+ > - CUDA version
76
+ > - PyTorch version (2.7 is currently not supported)
77
+ > - ABI
78
+ >
79
+ > #### Steps
80
+ > 1. Choose your version from [https://github.com/Dao-AILab/flash-attention/releases](https://github.com/Dao-AILab/flash-attention/releases)
81
+ > 2. Download prebuilt release, in example: `wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl`
82
+ > 3. Install it, in example: `pip install --no-dependencies --upgrade flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl`
83
+ > 4. Verify: `flash_attn.__version__` (an incorrect version will cause the error when importing)
84
+ >
85
+ > #### Note on `use_flash_attention` option in models/layers
86
+ > Explicit `use_flash_attention` option is made to enable direct calls to `flash_attn_func` without using **PyTorch** `scaled_dot_product_attention`. Even
87
+ > if it's set to `False`, when `flash-attn` library is installed, **PyTorch** will try to use it implicitly through _SDPA backend_. It's better to set it
88
+ > to `False` and use it automatically, because of better compatibility. Explicit options could be used for research
89
+
90
+ ### Modules
91
+ **RxNN** framework has multiple modules with models, layers, training and inference tools, made for complete development
92
+ of _reactive models_, and could be also used for regular **Transformers**.
93
+
94
+ #### Transformers
95
+ Transformers module includes classes for models and layers. It includes **Reactive Transformers** as well as **Classic Transformers**
96
+
97
+ Submodules:
98
+ - `rxnn.transformers.attention` - basic, most common attention layers - `MultiHeadAttention`, `GroupedQueryAttention` and `MultiQueryAttention`
99
+ - additional attention layers, especially `SparseQueryAttention` could be found in `rxnn.experimental.attention` module
100
+ - `SparseQueryAttention` will be moved to `rxnn.transformers.attention` in 0.2.x version
101
+ - `rxnn.transformers.positional` - positional encoding layers - `RotaryPositionalEmbedding` and legacy ones - `AbsolutePositionalEmbedding`/`RelativePositionalEmbedding`
102
+ - `rxnn.transformers.ff` - dense feed forward layers, including gated layers (_SwiGLU_, etc.) - `FeedForward` & `GatedFeedForward` (recommended)
103
+ - `rxnn.transformers.moe` - Mixture-of-Experts feed forward layers - `MoeFeedForward` & `GatedMoeFeedForward` (recommended)
104
+ - `rxnn.transformers.layers` - complete reactive/classic transformer layers - `ReactiveTransformerLayer` & `ClassicTransformerLayer`
105
+ - `rxnn.transformers.models` - reactive/classic transformer models - `ReactiveTransformerEncoder`, `ReactiveTransformerDecoder` & `ClassicTransformerEncoder`, `ClassicTransformerDecoder`
106
+ - `rxnn.transformers.sampler` - samplers for reactive models (Sampler is an integral part of reactive architectures) - `Sampler`, `SampleDecoder`, `BatchSampler` & `BatchSampleDecoder`
107
+
108
+ In **RxNN** models are initialized in declarative style by class composition, but then they are wrapped in imperative classes,
109
+ to be compatible with HuggingFace **JSON** config. For example:
110
+
111
+ ```python
112
+ from typing import TypedDict
113
+ import torch
114
+ import torch.nn as nn
115
+ from huggingface_hub import PyTorchModelHubMixin
116
+ from rxnn.transformers.attention import GroupedQueryAttention
117
+ from rxnn.transformers.positional import RotaryPositionalEmbedding
118
+ from rxnn.transformers.layers import ReactiveTransformerLayer
119
+ from rxnn.transformers.models import ReactiveTransformerDecoder
120
+ from rxnn.memory.stm import ShortTermMemory
121
+
122
+ class YourReactiveTransformerConfig(TypedDict):
123
+ num_layers: int
124
+ vocab_size: int
125
+ embed_dim: int
126
+ ff_dim: int
127
+ att_heads: int
128
+ seq_len: int
129
+ stm_size: int
130
+ att_groups: int
131
+ cross_att_groups: int
132
+
133
+
134
+ class YourReactiveTransformerDecoder(nn.Module, PyTorchModelHubMixin):
135
+ def __init__(
136
+ self,
137
+ config: YourReactiveTransformerConfig,
138
+ **kwargs
139
+ ):
140
+ super(YourReactiveTransformerDecoder, self).__init__(**kwargs)
141
+
142
+ embedding = nn.Embedding(config['vocab_size'], config['embed_dim'])
143
+ rope = RotaryPositionalEmbedding(config['embed_dim'] // config['att_heads'], config['seq_len'])
144
+ stm = ShortTermMemory(config['num_layers'], config['embed_dim'], config['stm_size'])
145
+
146
+ self.model = ReactiveTransformerDecoder(
147
+ stm=stm,
148
+ embedding=embedding,
149
+ own_layers=nn.ModuleList([
150
+ ReactiveTransformerLayer(
151
+ config['embed_dim'],
152
+ config['ff_dim'],
153
+ use_gated=True,
154
+ use_moe=False,
155
+ ff_activation=nn.GELU(),
156
+ ff_dropout=0.1,
157
+ use_rms_norm=True,
158
+ self_attention=GroupedQueryAttention(
159
+ config['embed_dim'],
160
+ config['att_heads'],
161
+ config['att_groups'],
162
+ rope=rope,
163
+ dropout=0.1,
164
+ max_seq_len=config['seq_len'],
165
+ is_causal=True,
166
+ ),
167
+ memory_cross_attention=GroupedQueryAttention(
168
+ config['embed_dim'],
169
+ config['att_heads'],
170
+ config['att_groups'],
171
+ rope=rope,
172
+ dropout=0.1,
173
+ max_seq_len=config['seq_len'],
174
+ is_causal=True,
175
+ rope_only_for_query=True
176
+ ),
177
+ ) for _ in range(config['num_layers'])
178
+ ])
179
+ )
180
+
181
+ def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None):
182
+ return self.model(x, attention_mask=attention_mask)
183
+ ```
184
+
185
+ #### Memory
186
+ The _memory_ module includes **Short-Term Memory** and layers responsible for its update. In future versions it will also
187
+ include **Long-Term Memory**.
188
+
189
+ The main `ShortTermMemory` class is located in `rxnn.memory.stm` module - the usage example is in Transformers module description.
190
+
191
+ > 0.2.x Memory modules docs in progress - will be released soon
192
+
193
+ #### Training
194
+ Training module includes **Trainers** for different training stages of reactive models and shared training utils.
195
+
196
+ Submodules:
197
+ - `rxnn.training.tokenizer` - custom Trainer for **HuggingFace** `tokenizers` and utils to load tokenizer from Hub
198
+ - Tokenizer could be loaded from Hub with `load_tokenizer_from_hf_hub(repo_id)`
199
+ - `rxnn.training.dataset` - datasets for different training stages:
200
+ - `MaskedLMDataset` & `AutoregressiveLMDataset` are made for base models pre-training
201
+ - `EncoderSftDataset` & `DecoderSftDataset` are made for Interaction Supervised Fine-Tuning for reactive models
202
+ - `MrlCurriculumDataset` is the dataset for single MRL Curriculum step
203
+ - `MrlDatasets` is wrapping MRL datasets for all curriculum steps
204
+ - each dataset has `from_hf_hub` class method to load dataset from Hub
205
+ - they have also `concat_from_hf_hub` class method to load multiple Hub datasets into single training dataset
206
+ - if dataset has no validation/test split, each dataset has `get_subset(subset_size, from_start=False)` method - it
207
+ returns a new subset and modifies the existing one - i.e. `valid_dataset = train_dataset.get_subset(0.1)`
208
+ - for concatenated datasets, validation/test split could be created with `concat_from_hf_hub_with_subset` - it cuts the
209
+ same percentage of each loaded dataset
210
+ - `rxnn.training.callbacks` contains Trainer callbacks for different kinds of utilities (more info below)
211
+ - `rxnn.training.scheduler` includes learning rate scheduler for training
212
+ - `rxnn.training.bml` - Base Model Learning module with Trainers for pre-training and fine-tuning
213
+ - `rxnn.training.mrl` - Memory Reinforcement Learning module with Trainers for MRL
214
+ - `rxnn.training.rxrlhf` - Reinforcement Learning from Human Feedback for Reactive Models module (from 0.3.x)
215
+ - `rxnn.training.brl` - Behavioral Reinforcement Learning module (Reactor / from 0.7.x)
216
+
217
+ ##### Base Model Learning
218
+ Docs in progress
219
+
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "rxnn"
7
- version = "0.1.82"
7
+ version = "0.2.0"
8
8
  description = "RxNN: Reactive Neural Networks Platform"
9
9
 
10
10
  license = "Apache-2.0"
Binary file
@@ -287,6 +287,7 @@ class SparseQueryAttention(MultiHeadAttention):
287
287
  k = self.k_proj(key).view(b, -1, self.num_groups, head_dim).transpose(1, 2)
288
288
  v = self.v_proj(value).view(b, -1, self.num_groups, head_dim).transpose(1, 2)
289
289
  else:
290
+ # Relative embedding version is not working without this strange mapping - it will be removed in next versions
290
291
  group_heads = self.num_heads // self.num_groups
291
292
  query_heads = self.num_heads // self.num_query_groups
292
293
  # Process Q
@@ -457,6 +458,7 @@ def init_experimental_attention(
457
458
  dropout: float = 0.0,
458
459
  rope: RotaryPositionalEmbedding = None,
459
460
  rope_only_for_query: bool = False,
461
+ rope_only_for_keys: bool = False,
460
462
  use_relative_embeddings: bool = False,
461
463
  max_seq_len: int = 1024,
462
464
  use_flash_attention: bool = False,
@@ -478,6 +480,7 @@ def init_experimental_attention(
478
480
  use_relative_embeddings=use_relative_embeddings,
479
481
  max_seq_len=max_seq_len,
480
482
  rope_only_for_query=rope_only_for_query,
483
+ rope_only_for_keys=rope_only_for_keys,
481
484
  use_flash_attention=use_flash_attention,
482
485
  is_causal=is_causal,
483
486
  use_bias=use_bias,
@@ -493,6 +496,7 @@ def init_experimental_attention(
493
496
  use_relative_embeddings=use_relative_embeddings,
494
497
  max_seq_len=max_seq_len,
495
498
  rope_only_for_query=rope_only_for_query,
499
+ rope_only_for_keys=rope_only_for_keys,
496
500
  use_flash_attention=use_flash_attention,
497
501
  is_causal=is_causal,
498
502
  use_bias=use_bias,
@@ -511,6 +515,7 @@ def init_experimental_attention(
511
515
  use_relative_embeddings=use_relative_embeddings,
512
516
  max_seq_len=max_seq_len,
513
517
  rope_only_for_query=rope_only_for_query,
518
+ rope_only_for_keys=rope_only_for_keys,
514
519
  use_flash_attention=use_flash_attention,
515
520
  is_causal=is_causal,
516
521
  use_bias=use_bias,
@@ -0,0 +1,42 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from .stm import ShortTermMemory
4
+
5
class StmMemoryAttention(nn.Module):
    """Update every Short-Term Memory layer by attending over encoded data.

    For each layer the current STM slots are normalized and used as the
    attention query, the encoded data of that layer is used as key and value,
    and the attention output is added back to the un-normalized STM (residual
    connection). The combined result is written back into the STM module.
    """

    def __init__(
            self,
            stm: ShortTermMemory,
            attention_layers: nn.ModuleList,
            memory_norm_layers: nn.ModuleList,
            *args,
            **kwargs
    ):
        """
        Args:
            stm: Memory module holding one memory tensor per layer.
            attention_layers: One attention module per STM layer.
            memory_norm_layers: One normalization module per STM layer.
            *args, **kwargs: Forwarded to ``nn.Module.__init__``.

        Raises:
            AssertionError: If the number of attention layers, norm layers and
                STM layers differ.
        """
        super(StmMemoryAttention, self).__init__(*args, **kwargs)
        self.stm = stm
        self.attention_layers = attention_layers
        self.memory_norm_layers = memory_norm_layers
        assert len(self.attention_layers) == len(self.memory_norm_layers) == self.stm.memory.size(0)
        self.num_layers = len(attention_layers)

    def update_max_len(self, max_seq_len: int):
        """Propagate a new maximum sequence length to every layer's RoPE, if it has one."""
        for attention in self.attention_layers:
            if attention.rope is not None:
                attention.rope.update_max_len(max_seq_len)

    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
        """Compute the new STM state from the encoded data and store it.

        Args:
            x: Encoded data indexed by layer on axis 0 (``x[i]`` is the data for
                layer ``i``). Assumed to be (num_layers, batch, seq_len,
                embed_dim) - TODO confirm against the encoder output.
            attention_mask: Optional mask; it gets two singleton axes inserted
                at position 1 and is cast to bool before being passed to the
                attention layers.

        Returns:
            The updated memory tensor of the STM module.
        """
        mask = attention_mask.unsqueeze(1).unsqueeze(1).bool() if attention_mask is not None else None

        new_stm = torch.zeros_like(self.stm.memory)
        for i in range(self.num_layers):
            layer_stm = self.stm(i)
            encoded_layer_data = x[i]
            # Expand a single-batch STM layer to the batch size of the encoded
            # data. The batch size is read from the per-layer slice, because
            # axis 0 of `x` is the layer axis (it is indexed with `i` above).
            if layer_stm.size(0) == 1:
                layer_stm = layer_stm.expand(encoded_layer_data.size(0), -1, -1)
            normalized_layer_stm = self.memory_norm_layers[i](layer_stm)
            new_layer_stm = self.attention_layers[i](
                normalized_layer_stm, encoded_layer_data, encoded_layer_data, mask=mask
            )
            new_stm[i] = new_layer_stm + layer_stm  # residual
        self.stm.update_all(new_stm)
        return self.stm.memory
42
+
@@ -0,0 +1,94 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
class ShortTermMemory(nn.Module):
    """Short-term memory module for the Attention-based Memory System.

    The memory is one tensor stored under ``self.memory``: a buffer by
    default, or an ``nn.Parameter`` when the memory is trainable. Its shape is
    (num_layers, stm_size, embed_dim) in legacy mode and
    (num_layers, batch_size, stm_size, embed_dim) otherwise.
    """

    def __init__(self, num_layers: int, embed_dim: int, stm_size: int, init_type: str = 'normal',
                 is_trainable: bool = False, legacy_init: bool = True, *args, **kwargs):
        """
        Args:
            num_layers: Number of memory layers.
            embed_dim: Size of the last memory axis.
            stm_size: Number of memory slots per layer.
            init_type: One of "normal", "standard", "uniform", "ones", "zeros".
            is_trainable: Store the memory as ``nn.Parameter`` instead of a buffer.
            legacy_init: Temporary option to load old models with not-batched
                STM (no batch axis in the memory tensor).
            *args, **kwargs: Forwarded to ``nn.Module.__init__``.

        Raises:
            AssertionError: If ``init_type`` is not a supported value.
        """
        super(ShortTermMemory, self).__init__(*args, **kwargs)
        self.num_layers = num_layers
        self.embed_dim = embed_dim
        self.stm_size = stm_size
        # 1 as initial batch size (normally used in inference/pre-training;
        # bigger batches are for RL stages).
        self.batch_size = 1
        self.is_trainable = is_trainable
        self._set_init_type(init_type)
        # Must be assigned before the first `_init_tensor()` call, because the
        # memory shape depends on it.
        self.legacy_init = legacy_init
        stm = self._init_tensor()
        if self.is_trainable:
            self.memory = nn.Parameter(stm)
        else:
            self.register_buffer('memory', stm)

    def _set_init_type(self, init_type: str):
        """Validate ``init_type`` and store it as the default initialization."""
        assert init_type in ['normal', 'standard', 'uniform', 'ones', 'zeros'], \
            'STM init type must be one of "normal", "standard", "uniform", "ones", "zeros"'
        self.init_type = init_type

    def _init_tensor(self, init_type: str = None):
        """Create a fresh memory tensor for the current configuration.

        Args:
            init_type: Overrides ``self.init_type`` when given. Any value other
                than "normal", "standard", "uniform" or "ones" yields zeros.

        Returns:
            A new tensor; it is placed on the device and dtype of the existing
            memory when there is one.
        """
        init_type = init_type or self.init_type
        stm_shape = (self.num_layers, self.stm_size, self.embed_dim) \
            if self.legacy_init else (self.num_layers, self.batch_size, self.stm_size, self.embed_dim)
        if init_type == 'normal':
            stm = torch.normal(0, 0.02, stm_shape)
        elif init_type == 'standard':
            stm = torch.normal(0, 1, stm_shape)
        elif init_type == 'uniform':
            stm = torch.rand(*stm_shape) * 0.02
        elif init_type == 'ones':
            stm = torch.ones(*stm_shape)
        else:
            stm = torch.zeros(*stm_shape)
        # Keep re-initialized memory where the module currently lives, so a
        # reset does not silently move it back to CPU / default dtype.
        current = getattr(self, 'memory', None)
        if current is not None:
            stm = stm.to(device=current.device, dtype=current.dtype)
        return stm

    def _set_memory(self, stm: torch.Tensor):
        """Replace the stored memory, keeping its parameter/buffer kind.

        A plain tensor cannot be assigned to a name registered as a parameter,
        so trainable memory is re-wrapped in a new ``nn.Parameter``. That is a
        new object: anything holding the old parameter (e.g. an optimizer)
        does not see it.
        """
        current = self.memory
        if isinstance(current, nn.Parameter):
            self.memory = nn.Parameter(stm, requires_grad=current.requires_grad)
        else:
            self.memory = stm

    def reset_legacy_(self):
        """Leave legacy mode and re-initialize the memory with a batch axis."""
        self.legacy_init = False
        self._set_memory(self._init_tensor())

    def forward(self, layer: int) -> torch.Tensor:
        """Return the memory of one layer; legacy memory gets a leading axis of size 1."""
        return self.memory[layer].unsqueeze(0) if self.legacy_init else self.memory[layer]

    def update_layer(self, layer: int, new_stm: torch.Tensor):
        """Overwrite the memory of a single layer in place."""
        self.memory[layer] = new_stm

    def update_all(self, new_stm: torch.Tensor):
        """Overwrite the whole memory in place with ``new_stm``."""
        self.memory.copy_(new_stm)

    def make_trainable(self):
        """Convert the memory buffer into a trainable parameter (no-op if already trainable)."""
        if not self.is_trainable:
            self.is_trainable = True
            initial_stm = self.memory.clone()
            del self.memory
            self.memory = nn.Parameter(initial_stm)

    def freeze(self):
        """Convert trainable memory back into a buffer (no-op if not trainable)."""
        if self.is_trainable:
            # Clear the flag so a later `make_trainable()` is not skipped.
            self.is_trainable = False
            self.requires_grad_(False)
            trained_stm = self.memory.clone()
            del self.memory
            self.register_buffer('memory', trained_stm)

    def reset(self, init_type: str = None):
        """Re-initialize the memory, optionally with a one-off ``init_type``."""
        self._set_memory(self._init_tensor(init_type))

    def resize(self, new_stm_size: int, init_type: str = None):
        """Change the number of memory slots and re-initialize the memory."""
        self.stm_size = new_stm_size
        self._set_memory(self._init_tensor(init_type))

    def batched_memory(self, batch_size: int, init_type: str = None):
        """Switch to ``batch_size`` and re-initialize the memory.

        Args:
            batch_size: New batch size (only affects the shape outside legacy mode).
            init_type: When given, validated and stored as the new default init type.
        """
        if init_type is not None:
            self._set_init_type(init_type)
        self.batch_size = batch_size
        self._set_memory(self._init_tensor())

    def single_memory(self, init_type: str = None, use_mean_from_batch: bool = False):
        """Switch back to batch size 1.

        Args:
            init_type: When given, validated and stored as the new default init type.
            use_mean_from_batch: Use the mean of the current memory over the
                batch axis as the new memory instead of a fresh initialization.
                Memory without a batch axis (legacy mode) is kept unchanged in
                that case, since there is nothing to average.
        """
        if init_type is not None:
            self._set_init_type(init_type)
        self.batch_size = 1
        if use_mean_from_batch:
            if self.memory.dim() == 4:
                # Average over the batch axis only, so every slot keeps its own values.
                batch_mean = self.memory.mean(dim=1, keepdim=True)
                single_stm = self._init_tensor()
                single_stm.copy_(batch_mean)
                self._set_memory(single_stm)
        else:
            self._set_memory(self._init_tensor())