x-transformers 2.2.7__tar.gz → 2.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {x_transformers-2.2.7 → x_transformers-2.2.8}/PKG-INFO +1 -1
  2. {x_transformers-2.2.7 → x_transformers-2.2.8}/pyproject.toml +1 -1
  3. {x_transformers-2.2.7 → x_transformers-2.2.8}/tests/test_x_transformers.py +22 -1
  4. x_transformers-2.2.8/train_entropy_tokenizer.py +118 -0
  5. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/x_transformers.py +4 -1
  6. {x_transformers-2.2.7 → x_transformers-2.2.8}/.github/FUNDING.yml +0 -0
  7. {x_transformers-2.2.7 → x_transformers-2.2.8}/.github/workflows/python-publish.yml +0 -0
  8. {x_transformers-2.2.7 → x_transformers-2.2.8}/.github/workflows/python-test.yaml +0 -0
  9. {x_transformers-2.2.7 → x_transformers-2.2.8}/.gitignore +0 -0
  10. {x_transformers-2.2.7 → x_transformers-2.2.8}/LICENSE +0 -0
  11. {x_transformers-2.2.7 → x_transformers-2.2.8}/README.md +0 -0
  12. {x_transformers-2.2.7 → x_transformers-2.2.8}/data/README.md +0 -0
  13. {x_transformers-2.2.7 → x_transformers-2.2.8}/data/enwik8.gz +0 -0
  14. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/all-attention.png +0 -0
  15. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/attention-on-attention.png +0 -0
  16. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/cosine-sim-attention.png +0 -0
  17. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/deepnorm.png +0 -0
  18. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/dynamic-pos-bias-linear.png +0 -0
  19. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/dynamic-pos-bias-log.png +0 -0
  20. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/dynamic-pos-bias-sinusoidal.png +0 -0
  21. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/dynamic-pos-bias.png +0 -0
  22. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/enhanced-recurrence.png +0 -0
  23. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/fcm.png +0 -0
  24. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/ffglu.png +0 -0
  25. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/flash-attention.png +0 -0
  26. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/gate_values.png +0 -0
  27. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/gating.png +0 -0
  28. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/length-extrapolation-scale.png +0 -0
  29. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/macaron-1.png +0 -0
  30. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/macaron-2.png +0 -0
  31. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/memory-transformer.png +0 -0
  32. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/normformer.png +0 -0
  33. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/pia.png +0 -0
  34. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/qknorm-analysis.png +0 -0
  35. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/resi_dual.png +0 -0
  36. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/residual_attn.png +0 -0
  37. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/rezero.png +0 -0
  38. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/rotary.png +0 -0
  39. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/sandwich-2.png +0 -0
  40. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/sandwich.png +0 -0
  41. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/sandwich_norm.png +0 -0
  42. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/scalenorm.png +0 -0
  43. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/talking-heads.png +0 -0
  44. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/topk-attention.png +0 -0
  45. {x_transformers-2.2.7 → x_transformers-2.2.8}/images/xval.png +0 -0
  46. {x_transformers-2.2.7 → x_transformers-2.2.8}/train_belief_state.py +0 -0
  47. {x_transformers-2.2.7 → x_transformers-2.2.8}/train_copy.py +0 -0
  48. {x_transformers-2.2.7 → x_transformers-2.2.8}/train_enwik8.py +0 -0
  49. {x_transformers-2.2.7 → x_transformers-2.2.8}/train_length_extrapolate.py +0 -0
  50. {x_transformers-2.2.7 → x_transformers-2.2.8}/train_parity.py +0 -0
  51. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/__init__.py +0 -0
  52. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/attend.py +0 -0
  53. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/autoregressive_wrapper.py +0 -0
  54. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/belief_state_wrapper.py +0 -0
  55. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/continuous.py +0 -0
  56. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/dpo.py +0 -0
  57. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/entropy_based_tokenizer.py +0 -0
  58. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/multi_input.py +0 -0
  59. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/neo_mlp.py +0 -0
  60. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/nonautoregressive_wrapper.py +0 -0
  61. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/xl_autoregressive_wrapper.py +0 -0
  62. {x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/xval.py +0 -0
{x_transformers-2.2.7 → x_transformers-2.2.8}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.2.7
+Version: 2.2.8
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
{x_transformers-2.2.7 → x_transformers-2.2.8}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.2.7"
+version = "2.2.8"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
{x_transformers-2.2.7 → x_transformers-2.2.8}/tests/test_x_transformers.py
@@ -798,4 +798,25 @@ def test_entropy_based_tokenizer(
 
     assert len(segmented_seq) == seq.shape[0]
 
-    tokenizer(seq[0]) # able to handle without batch dim
+    tokenizer(seq[0]) # able to handle without batch dim
+
+def test_custom_ff_activation():
+
+    model = TransformerWrapper(
+        num_tokens = 20000,
+        max_seq_len = 1024,
+        attn_layers = Decoder(
+            dim = 128,
+            depth = 6,
+            heads = 8,
+            attn_dim_head = 64,
+            ff_custom_activation = nn.Sigmoid()
+        )
+    )
+
+    seq = torch.randint(0, 20000, (2, 1024))
+
+    logits = model(seq)
+
+    assert logits.shape == (2, 1024, 20000)
+
x_transformers-2.2.8/train_entropy_tokenizer.py (new file)
@@ -0,0 +1,118 @@
+from x_transformers import TransformerWrapper, Decoder
+from x_transformers.autoregressive_wrapper import AutoregressiveWrapper
+from x_transformers.entropy_based_tokenizer import EntropyBasedTokenizer
+
+import random
+import tqdm
+import gzip
+import numpy as np
+import torch
+import torch.optim as optim
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, Dataset
+
+# constants
+
+NUM_BATCHES = int(1e5)
+BATCH_SIZE = 4
+GRADIENT_ACCUMULATE_EVERY = 4
+LEARNING_RATE = 1e-4
+VALIDATE_EVERY = 100
+GENERATE_EVERY = 100
+GENERATE_LENGTH = 1024
+SEQ_LEN = 1024
+
+# helpers
+
+def cycle(loader):
+    while True:
+        for data in loader:
+            yield data
+
+def decode_token(token):
+    return str(chr(max(32, token)))
+
+def decode_tokens(tokens):
+    return ''.join(list(map(decode_token, tokens)))
+
+# instantiate GPT-like decoder model
+
+model = TransformerWrapper(
+    num_tokens = 256,
+    max_seq_len = SEQ_LEN,
+    attn_layers = Decoder(
+        dim = 512,
+        depth = 6,
+        heads = 8,
+        rotary_pos_emb = True
+    )
+)
+
+tokenizer = EntropyBasedTokenizer(
+    model,
+    entropy_threshold = 2.5
+)
+
+model = AutoregressiveWrapper(model)
+model.cuda()
+
+# prepare enwik8 data
+
+with gzip.open('./data/enwik8.gz') as file:
+    data = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy()
+    train_x, valid_x = np.split(data, [int(90e6)])
+    data_train, data_val = torch.from_numpy(train_x), torch.from_numpy(valid_x)
+
+class TextSamplerDataset(Dataset):
+    def __init__(self, data, seq_len):
+        super().__init__()
+        self.data = data
+        self.seq_len = seq_len
+
+    def __getitem__(self, index):
+        rand_start = torch.randint(0, self.data.size(0) - self.seq_len - 1, (1,))
+        full_seq = self.data[rand_start: rand_start + self.seq_len + 1].long()
+        return full_seq.cuda()
+
+    def __len__(self):
+        return self.data.size(0) // self.seq_len
+
+train_dataset = TextSamplerDataset(data_train, SEQ_LEN)
+val_dataset = TextSamplerDataset(data_val, SEQ_LEN)
+train_loader = cycle(DataLoader(train_dataset, batch_size = BATCH_SIZE, drop_last = True))
+val_loader = cycle(DataLoader(val_dataset, batch_size = BATCH_SIZE, drop_last = True))
+
+# optimizer
+
+optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
+
+# training
+
+for i in tqdm.tqdm(range(NUM_BATCHES), mininterval=10., desc='training'):
+    model.train()
+
+    for __ in range(GRADIENT_ACCUMULATE_EVERY):
+        loss = model(next(train_loader))
+        (loss / GRADIENT_ACCUMULATE_EVERY).backward()
+
+    print(f'training loss: {loss.item()}')
+    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
+    optim.step()
+    optim.zero_grad()
+
+    if i % VALIDATE_EVERY == 0:
+        model.eval()
+        with torch.no_grad():
+            loss = model(next(val_loader))
+            print(f'validation loss: {loss.item()}')
+
+    if i % GENERATE_EVERY == 0:
+        model.eval()
+        inp = random.choice(val_dataset)[:-1]
+
+        tokens = tokenizer(inp, return_segmented_seq = True)
+
+        delimiter = " \u275A "
+        output_str = delimiter.join([decode_tokens(token) for token in tokens])
+
+        print(f"{output_str}\n\n")
{x_transformers-2.2.7 → x_transformers-2.2.8}/x_transformers/x_transformers.py
@@ -1196,6 +1196,7 @@ class FeedForward(Module):
         glu_mult_bias = False,
         swish = False,
         relu_squared = False,
+        custom_activation = None,
         post_act_ln = False,
         dropout = 0.,
         no_bias = False,
@@ -1205,7 +1206,9 @@ class FeedForward(Module):
         inner_dim = int(dim * mult)
         dim_out = default(dim_out, dim)
 
-        if relu_squared:
+        if exists(custom_activation):
+            activation = deepcopy(custom_activation)
+        elif relu_squared:
             activation = ReluSquared()
         elif swish:
             activation = nn.SiLU()