rxnn 0.2.69__py3-none-any.whl → 0.2.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxnn/memory/attention.py +8 -0
- rxnn/training/dataset.py +14 -5
- rxnn/training/models.py +2 -2
- rxnn/training/mrl.py +32 -10
- rxnn/transformers/layers.py +17 -2
- rxnn/transformers/models.py +21 -20
- {rxnn-0.2.69.dist-info → rxnn-0.2.71.dist-info}/METADATA +1 -1
- {rxnn-0.2.69.dist-info → rxnn-0.2.71.dist-info}/RECORD +10 -10
- {rxnn-0.2.69.dist-info → rxnn-0.2.71.dist-info}/LICENSE +0 -0
- {rxnn-0.2.69.dist-info → rxnn-0.2.71.dist-info}/WHEEL +0 -0
rxnn/memory/attention.py
CHANGED
@@ -64,6 +64,8 @@ class StmMemoryAttention(nn.Module):
                 layer_stm = layer_stm.expand(x.size(0), -1, -1)
             encoded_layer_data = x[i]
             normalized_layer_stm = self.memory_norm_layers[i](layer_stm)
+            if torch.isnan(normalized_layer_stm).any():
+                print(f"NaN detected in {i} layer memory norm output")

             if self.debug_mode and self.training:
                 if self.debug_step != 0 and self.debug_step % self.debug_interval == 0:
@@ -72,7 +74,13 @@ class StmMemoryAttention(nn.Module):
                 else:
                     self.debug_step += 1

+            if torch.isnan(encoded_layer_data).any():
+                print(f"NaN detected in {i} layer encoded data input")
+
             new_layer_stm = self.attention_layers[i](normalized_layer_stm, encoded_layer_data, encoded_layer_data, mask=attention_mask)
+            if torch.isnan(new_layer_stm).any():
+                print(f"NaN detected in {i} layer memory attention output")
+
             if self.use_gated_residual:
                 new_stm[i] = self._residual_gate(self.gate[i], layer_stm, new_layer_stm)  # gated residual
             else:
rxnn/training/dataset.py
CHANGED
@@ -4,7 +4,7 @@ from datasets import Dataset as HfDataset, load_dataset, concatenate_datasets
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 from .tokenizer import load_tokenizer_from_hf_hub

-from typing import Union, TypedDict, Optional, TypeAlias, Any
+from typing import Union, TypedDict, Optional, TypeAlias, Any, Literal


 class BaseDataset(Dataset):
@@ -854,8 +854,8 @@ class EncoderSftDataset(BaseInteractionDataset):
             'labels': labels
         }

-
-MrlDataItem: TypeAlias = dict[str, Union[dict[
+ItemFields: TypeAlias = Literal['input_ids', 'attention_mask']
+MrlDataItem: TypeAlias = dict[str, Union[dict[ItemFields, torch.Tensor], list[dict[str, dict[ItemFields, torch.Tensor]]]]]


 class MrlCurriculumDataset(Dataset):
@@ -1031,7 +1031,7 @@ class MrlCurriculumDataset(Dataset):
         """Collate function for MRL curriculum dataset with nested interactions"""

         def collate_interaction_batch(interaction_batch: Union[list[dict[str, dict[str, torch.Tensor]]], tuple[Any]]) -> \
-                dict[str, dict[
+                dict[str, dict[ItemFields, torch.Tensor]]:
             """Helper to collate a batch of interactions"""
             return {
                 'query': {
@@ -1047,13 +1047,22 @@ class MrlCurriculumDataset(Dataset):
         batch_interactions = [x['interactions'] for x in batch]
         transposed_interactions = list(zip(*batch_interactions))

-
+        def has_nans(tensor: dict[ItemFields, torch.Tensor]) -> bool:
+            return torch.isnan(tensor['input_ids']).any().item() or torch.isnan(tensor['attention_mask']).any().item()
+
+        results: MrlDataItem = {
             **collate_interaction_batch(batch),  # Collate initial query and answer
             'interactions': [
                 collate_interaction_batch(step_batch) for step_batch in transposed_interactions
             ]
         }

+        assert not has_nans(results['query']), "NaN in query"
+        assert not has_nans(results['answer']), "NaN in answer"
+        assert not any([(has_nans(item['query']) or has_nans(item['answer'])) for item in results['interactions']]), "NaN in interactions"
+
+        return results
+

 class MrlDatasetItem(TypedDict):
     steps: int
rxnn/training/models.py
CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 from enum import Enum
-from typing import Literal, Iterator
+from typing import Literal, Iterator, Optional
 from huggingface_hub import PyTorchModelHubMixin
 from ..transformers.models import ReactiveTransformerEncoder, ReactiveTransformerDecoder
 from ..transformers.ff import GatedLinearUnit, get_activation_layer
@@ -188,7 +188,7 @@ class MrlActorModel(nn.Module):
             list(self.memory_attention_parameters())
         ))

-    def moe_router_loss(self):
+    def moe_router_loss(self) -> Optional[torch.Tensor]:
         if self.encoder.model.use_moe and self.decoder.model.use_moe:
             return (self.encoder.model.moe_router_loss() + self.decoder.model.moe_router_loss()) / 2
         elif self.encoder.model.use_moe:
rxnn/training/mrl.py
CHANGED
@@ -591,6 +591,8 @@ class MRLTrainer:
         actor = next(self.actor.children()) if isinstance(self.actor, DistributedDataParallel) else self.actor

         router_loss = actor.moe_router_loss()
+        if torch.isnan(router_loss).any():
+            print("NaN detected in router loss")
         if router_loss is not None:
             return main_loss + self.moe_aux_loss_scale * router_loss
         else:
@@ -605,18 +607,38 @@ class MRLTrainer:
         print(f"Encoder grad norm - total: {encoder_total:.6f}, mean: {encoder_mean:.6f}")
         print(f"Decoder grad norm - total: {decoder_total:.6f}, mean: {decoder_mean:.6f}")
         print(f"Memory attention grad norm - total: {mem_att_total:.6f}, mean: {mem_att_mean:.6f}")
-        # decoder's cross att
-        dec_x_att_norms = [get_gradient_norms(layer.memory_cross_attention)[1] for layer in self.actor.decoder.model.layers]
-        print(f"Decoder cross-att mean norm: {(sum(dec_x_att_norms) / len(dec_x_att_norms)):.6f}, all: {dec_x_att_norms}")

+        dec_x_att_norms = [get_gradient_norms(layer.memory_cross_attention)[1] for layer in self.actor.decoder.model.layers]
         mem_att_norms = [get_gradient_norms(layer)[1] for layer in self.actor.memory_attention.model.attention_layers]
-        print(f"Memory attention layers mean norm: {(sum(mem_att_norms) / len(mem_att_norms)):.6f}, all: {mem_att_norms}")
-
         enc_ff_norms = [get_gradient_norms(layer.ff)[1] for layer in self.actor.encoder.model.layers]
-
+        enc_self_att_norms = [get_gradient_norms(layer.attention)[1] for layer in self.actor.encoder.model.layers]
+        enc_x_att_norms = [get_gradient_norms(layer.memory_cross_attention)[1] for layer in
+                           self.actor.encoder.model.layers]
+
+        calc_mean = lambda x: sum(x) / len(x)
+
+        dec_x_att_norms_mean = calc_mean(dec_x_att_norms)
+        mem_att_norms_mean = calc_mean(mem_att_norms)
+        enc_ff_norms_mean = calc_mean(enc_ff_norms)
+        enc_self_att_norms_mean = calc_mean(enc_self_att_norms)
+        enc_x_att_norms_mean = calc_mean(enc_x_att_norms)
+
+        print(f"Decoder cross-att mean norm: {dec_x_att_norms_mean:.6f}, all: {dec_x_att_norms}")
+        print(f"Memory attention layers mean norm: {mem_att_norms_mean:.6f}, all: {mem_att_norms}")
+        print(f"Encoder ff mean norm: {enc_ff_norms_mean:.6f}, all: {enc_ff_norms}")
+        print(f"Encoder self-att mean norm: {enc_self_att_norms_mean:.6f}, all: {enc_self_att_norms}")
+        print(f"Encoder cross-att mean norm: {enc_x_att_norms_mean:.6f}, all: {enc_x_att_norms}")
+
+        if self.writer is not None:
+            self.writer.add_scalar('Gradient/encoder', encoder_mean, self.global_step['train'])
+            self.writer.add_scalar('Gradient/decoder', decoder_mean, self.global_step['train'])
+            self.writer.add_scalar('Gradient/mem-att', mem_att_mean, self.global_step['train'])
+            self.writer.add_scalar('Gradient/decoder x-att', dec_x_att_norms_mean, self.global_step['train'])
+            self.writer.add_scalar('Gradient/mem-att layers', mem_att_norms_mean, self.global_step['train'])
+            self.writer.add_scalar('Gradient/encoder ff', enc_ff_norms_mean, self.global_step['train'])
+            self.writer.add_scalar('Gradient/encoder self-att', enc_self_att_norms_mean, self.global_step['train'])
+            self.writer.add_scalar('Gradient/encoder x-att', enc_x_att_norms_mean, self.global_step['train'])

-        enc_ff_norms = [get_gradient_norms(layer.memory_cross_attention)[1] for layer in self.actor.encoder.model.layers]
-        print(f"Encoder cross-att mean norm: {(sum(enc_ff_norms) / len(enc_ff_norms)):.6f}, all: {enc_ff_norms}")

     def update_actor(self, state: tuple[TokenizedDict, TokenizedDict, TokenizedDict], action: TokenizedDict,
                      advantages: torch.Tensor, old_log_probs: torch.Tensor, epoch: int) -> float:
@@ -649,7 +671,7 @@ class MRLTrainer:
             # 4.4 Unscale and clip gradient norms
             self.scaler.unscale_(self.optimizer)
             torch.nn.utils.clip_grad_norm_(self.actor.unique_parameters(), max_norm=1.0,
-                                           error_if_nonfinite=
+                                           error_if_nonfinite=self.debug_mode)
             if self.debug_mode and self.epoch_step['train'] % self.debug_interval == 0:
                 self._log_gradients(logits)
             # 4.5 Run scaled optimization step
@@ -670,7 +692,7 @@ class MRLTrainer:
             policy_loss.backward(retain_graph=True)
             # 4.4 Clip gradient norms
             torch.nn.utils.clip_grad_norm_(self.actor.unique_parameters(), max_norm=1.0,
-                                           error_if_nonfinite=
+                                           error_if_nonfinite=self.debug_mode)
             if self.debug_mode and self.epoch_step['train'] % self.debug_interval == 0:
                 self._log_gradients(logits)
             # 4.5 Run scaled optimization step
rxnn/transformers/layers.py
CHANGED
@@ -102,7 +102,11 @@ class ReactiveTransformerLayer(nn.Module):
         residual = x
         if not self.use_post_norm:
             x = self.norm1(x)
+        if torch.isnan(x).any():
+            print("NaN detected in pre-norm (self-attention) output")
         x = self.attention(x, x, x, mask=mask)
+        if torch.isnan(x).any():
+            print("NaN detected in self-attention output")
         x = residual + x
         if self.use_post_norm:
             x = self.norm1(x)
@@ -110,11 +114,18 @@ class ReactiveTransformerLayer(nn.Module):
         residual = x
         if not self.use_post_norm:
             x = self.norm2(x)
+        if torch.isnan(x).any():
+            print("NaN detected in pre-norm (cross-attention) output")

-
-
+        mem_mask = mask.squeeze(1).unsqueeze(-1).expand(-1, -1, -1, stm.size(1)) \
+            if mask is not None else None
+
+        if torch.isnan(stm).any():
+            print("NaN detected in STM cross-attention input")

         x = self.memory_cross_attention(x, stm, stm, mask=mem_mask)
+        if torch.isnan(x).any():
+            print("NaN detected in cross-attention output")
         x = residual + x
         if self.use_post_norm:
             x = self.norm2(x)
@@ -123,7 +134,11 @@ class ReactiveTransformerLayer(nn.Module):
         residual = x
         if not self.use_post_norm:
             x = self.norm3(x)
+        if torch.isnan(x).any():
+            print("NaN detected in pre-norm (ff) output")
         x = self.ff(x)
+        if torch.isnan(x).any():
+            print("NaN detected in ff output")
         x = residual + x
         if self.use_post_norm:
             x = self.norm3(x)
rxnn/transformers/models.py
CHANGED
@@ -58,6 +58,15 @@ class ReactiveTransformerBase(nn.Module):
         else:
             return None

+    def _handle_layer(self, i: int, x: torch.Tensor, mask: torch.Tensor = None, is_shared: bool = False):
+        stm_layer_idx = i if is_shared else i + self.num_shared_layers
+        layer_stm = self.stm(stm_layer_idx)
+        # expand layer STM to batch size, if it's not in batch mode
+        if layer_stm.size(0) == 1:
+            layer_stm = layer_stm.expand(x.size(0), -1, -1)
+        layer = self.shared_layers[i] if is_shared else self.layers[i]
+        return layer(x, layer_stm, mask=mask)
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # Shared logic for encoders and decoders - apply embeddings and positional encoding
         x = self.embedding(x)
@@ -84,6 +93,8 @@ class ReactiveTransformerDecoder(ReactiveTransformerBase):

     def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
         x = super().forward(x)  # apply embeddings
+        if torch.isnan(x).any():
+            print("NaN detected in decoder embedding output")
         seq_len = x.size(1)
         if not self.use_flash_attention and self.use_relative_embedding:
             mask = create_causal_mask(seq_len, device=x.device)
@@ -96,18 +107,12 @@ class ReactiveTransformerDecoder(ReactiveTransformerBase):
         # Process shared layers
         if self.shared_layers is not None:
             for i in range(self.num_shared_layers):
-
-                # expand layer STM to batch size, if it's not in batch mode
-                if layer_stm.size(0) == 1:
-                    layer_stm = layer_stm.expand(x.size(0), -1, -1)
-                x = self.shared_layers[i](x, layer_stm, mask=mask)
+                x = self._handle_layer(i, x, mask=mask, is_shared=True)
         # Process own layers
         for i in range(self.num_own_layers):
-
-
-
-            layer_stm = layer_stm.expand(x.size(0), -1, -1)
-            x = self.layers[i](x, layer_stm, mask=mask)
+            x = self._handle_layer(i, x, mask=mask)
+            if torch.isnan(x).any():
+                print(f"NaN detected in {i}. decoder layer output")
         return self.head(self.head_norm(x) if self.use_head_norm else x)


@@ -116,6 +121,8 @@ class ReactiveTransformerEncoder(ReactiveTransformerBase):

     def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor]:
         x = super().forward(x)  # apply embeddings
+        if torch.isnan(x).any():
+            print("NaN detected in encoder embedding output")
         if attention_mask is not None:
             attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).bool()

@@ -123,19 +130,13 @@ class ReactiveTransformerEncoder(ReactiveTransformerBase):
         # Process shared layers
         if self.shared_layers is not None:
             for i in range(self.num_shared_layers):
-
-                # expand layer STM to batch size, if it's not in batch mode
-                if layer_stm.size(0) == 1:
-                    layer_stm = layer_stm.expand(x.size(0), -1, -1)
-                x = self.shared_layers[i](x, layer_stm, mask=attention_mask)
+                x = self._handle_layer(i, x, mask=attention_mask, is_shared=True)
                 hidden_states.append(x)
         # Process own layers
         for i in range(self.num_own_layers):
-
-
-
-            layer_stm = layer_stm.expand(x.size(0), -1, -1)
-            x = self.layers[i](x, layer_stm, mask=attention_mask)
+            x = self._handle_layer(i, x, mask=attention_mask)
+            if torch.isnan(x).any():
+                print(f"NaN detected in {i}. encoder layer output")
             hidden_states.append(x)
         return x, torch.stack(hidden_states)

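
The encoder and decoder previously repeated the per-layer STM lookup, batch expansion, and layer call inline; the new `_handle_layer` helper centralizes that in the base class. A condensed, hedged sketch of the same factoring idea with toy classes that are not the package's own:

```python
import torch
import torch.nn as nn


class ToyReactiveBase(nn.Module):
    """Toy base class showing the shared/own layer dispatch of a _handle_layer-style helper."""

    def __init__(self, num_shared: int = 2, num_own: int = 2, dim: int = 16, stm_slots: int = 4):
        super().__init__()
        self.num_shared_layers = num_shared
        self.shared_layers = nn.ModuleList(nn.Linear(2 * dim, dim) for _ in range(num_shared))
        self.layers = nn.ModuleList(nn.Linear(2 * dim, dim) for _ in range(num_own))
        # One STM slot matrix per layer (shared layers first), stored without a batch dim.
        self.stm = nn.ParameterList(
            nn.Parameter(torch.randn(1, stm_slots, dim)) for _ in range(num_shared + num_own)
        )

    def _handle_layer(self, i: int, x: torch.Tensor, is_shared: bool = False) -> torch.Tensor:
        stm_layer_idx = i if is_shared else i + self.num_shared_layers
        layer_stm = self.stm[stm_layer_idx]
        if layer_stm.size(0) == 1:  # expand single STM to the batch size
            layer_stm = layer_stm.expand(x.size(0), -1, -1)
        layer = self.shared_layers[i] if is_shared else self.layers[i]
        # Toy "layer": mix the token states with mean-pooled memory.
        pooled = layer_stm.mean(dim=1, keepdim=True).expand(-1, x.size(1), -1)
        return layer(torch.cat([x, pooled], dim=-1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for i in range(len(self.shared_layers)):
            x = self._handle_layer(i, x, is_shared=True)
        for i in range(len(self.layers)):
            x = self._handle_layer(i, x)
        return x


if __name__ == "__main__":
    model = ToyReactiveBase()
    print(model(torch.randn(3, 10, 16)).shape)  # torch.Size([3, 10, 16])
```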
{rxnn-0.2.69.dist-info → rxnn-0.2.71.dist-info}/RECORD
CHANGED
@@ -5,7 +5,7 @@ rxnn/experimental/attention.py,sha256=jlNS82INjycNEfmk3HtkIacUvT_ELhaCO2g-kZTvhX
 rxnn/experimental/models.py,sha256=KheR1zSNJIaeVvpVAkEJwcuM5nOqQP0ZF08XhrtGJ8E,5387
 rxnn/experimental/moe.py,sha256=jHZ1QhpWiVQOswVpFmuH7b2IUOPf0Uuf-I2Ddwsd7Us,6140
 rxnn/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rxnn/memory/attention.py,sha256=
+rxnn/memory/attention.py,sha256=O4ycW3KKP5hFYadgVh47LvGWJn9zNHz8vh9E9okC0h8,4223
 rxnn/memory/norm.py,sha256=cVjjhCLqR5K6-321SP_ObG17y-ddlcTJeCTXvW4vpk0,6675
 rxnn/memory/stm.py,sha256=jv57gsH9XW19sLbxpRDqsp1yfsii_4Ef4Ncr_ztk-i4,3937
 rxnn/rxt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -14,10 +14,10 @@ rxnn/training/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/training/base.py,sha256=CqaArEZYOdH64nmKfx28U3GI46TzO4oNkjf_hrF23Cw,11835
 rxnn/training/bml.py,sha256=hw6gLpLkGvqLzxIvBg4MvCc5r8cHpEm2RDyh7nH6CtE,16914
 rxnn/training/callbacks.py,sha256=rS8leuVFPVVfE5Zc8DMkUZhRIPN-vpPbUjowXE5TSBw,36779
-rxnn/training/dataset.py,sha256=
+rxnn/training/dataset.py,sha256=ruU6k33pQmpTqhxpjLFNdDJnCjcrBcGeFOzJqFahJDM,51880
 rxnn/training/ddp.py,sha256=VsNBjn3cY-uUj8hbsW7oKvb0_ZKnXnJ2KgObm-Mr9i4,836
-rxnn/training/models.py,sha256=
-rxnn/training/mrl.py,sha256=
+rxnn/training/models.py,sha256=ILkcqBV1MImnULnq-YDSSEf8cUdEbUgQaH0FRTsa4LA,9069
+rxnn/training/mrl.py,sha256=Ntkti6DDKipKa-AwTvo1WDOdIXOL3uXOhT-Xx29wR-w,67369
 rxnn/training/reward.py,sha256=uiSsBXmjMw2yv-1Bssy3RTlpU6zP8ape3490Sl-aT0M,16144
 rxnn/training/rl.py,sha256=hWtExxY-_pAmTOGYxyCNounUbaGWvLDVltC4sRC7MN4,7175
 rxnn/training/scheduler.py,sha256=LcjU35mEwz2U5x3U6tLfeeYlBqMxbFSxYzJYuXkWbSY,1408
@@ -26,14 +26,14 @@ rxnn/training/utils.py,sha256=ngDCm654NL3UsPy190Er4XPc9HI-OyEV6tDLMgEEvQc,6219
 rxnn/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 rxnn/transformers/attention.py,sha256=KRnKT6XUqAXElxV9y72mSpdTeiMgCKCCLqqxCFNTHmA,16372
 rxnn/transformers/ff.py,sha256=WDjO-H9XWInoWnUnxiseIH6Kx5GlHP0zGJygwhcb1gc,2589
-rxnn/transformers/layers.py,sha256=
+rxnn/transformers/layers.py,sha256=bcDP8vZ5dpTWWqMCkzrPG8yQA0D0G5VjnV2Nq9IO8Dc,8816
 rxnn/transformers/mask.py,sha256=J0cfLVLt3SzS2ra3KcY4khrkhI975Dw4CjpUi3Sn25s,419
-rxnn/transformers/models.py,sha256=
+rxnn/transformers/models.py,sha256=r4vNldYqCIpwMpXkFZvYbw0UBK3NE75qH7bc6OZ8YjE,11587
 rxnn/transformers/moe.py,sha256=j6jEx6Ip0zttlUZKKn82azxo95lkLZs-H2GLSMD88hY,5859
 rxnn/transformers/positional.py,sha256=1PjcJybUzeQlIKJI4tahAGZcYgCRCL0otxs7mpsNuzM,4410
 rxnn/transformers/sampler.py,sha256=t6iiQTdLQ0TakUWnnhKkb5DKF2F_9-thXHBydDF3fxg,17389
 rxnn/utils.py,sha256=ihb6OTyDtPiocB_lOvnq7eOkjjpCkgs8wxvXUBNQ7mM,996
-rxnn-0.2.
-rxnn-0.2.
-rxnn-0.2.
-rxnn-0.2.
+rxnn-0.2.71.dist-info/LICENSE,sha256=C8coDFIUYuOcke4JLPwTqahQUCyXyGq6WOaigOkx8tY,11275
+rxnn-0.2.71.dist-info/METADATA,sha256=7BHHcFtImjPB57X2eRLgO4IFOSBNb7GOR5ytMaCttkI,60420
+rxnn-0.2.71.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+rxnn-0.2.71.dist-info/RECORD,,
{rxnn-0.2.69.dist-info → rxnn-0.2.71.dist-info}/LICENSE
File without changes
{rxnn-0.2.69.dist-info → rxnn-0.2.71.dist-info}/WHEEL
File without changes