x-transformers 2.9.0__py3-none-any.whl → 2.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- x_transformers/attend.py +15 -1
- x_transformers/x_transformers.py +6 -0
- {x_transformers-2.9.0.dist-info → x_transformers-2.9.2.dist-info}/METADATA +13 -1
- {x_transformers-2.9.0.dist-info → x_transformers-2.9.2.dist-info}/RECORD +6 -6
- {x_transformers-2.9.0.dist-info → x_transformers-2.9.2.dist-info}/WHEEL +0 -0
- {x_transformers-2.9.0.dist-info → x_transformers-2.9.2.dist-info}/licenses/LICENSE +0 -0
x_transformers/attend.py
CHANGED
@@ -67,6 +67,15 @@ def once(fn):
 
 print_once = once(print)
 
+# gumbel softmax attention related
+
+def log_prob_from_hard_attend(intermeds: Intermediates):
+    log_probs = intermeds.pre_softmax_attn.log_softmax(dim = -1)
+
+    one_hot = intermeds.post_softmax_attn.argmax(dim = -1, keepdim = True)
+    log_prob = log_probs.gather(-1, one_hot)
+    return rearrange(log_prob, 'b h i 1 -> b h i')
+
 # selective attention
 # https://arxiv.org/abs/2410.02703 - section 3.3
 # it is a technique to allow each token to prevent itself from being attended to by future tokens
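Not part of the diff, just for orientation: a minimal standalone sketch of what the new `log_prob_from_hard_attend` helper computes, using dummy pre- and post-softmax attention tensors (shapes assumed to be `(batch, heads, queries, keys)`, as elsewhere in `attend.py`).

```python
import torch
from einops import rearrange

# dummy attention intermediates
pre_softmax_attn = torch.randn(2, 4, 8, 8)              # raw attention logits
post_softmax_attn = pre_softmax_attn.softmax(dim = -1)  # normalized attention weights

# same steps as the new helper: log prob of the hard (argmax) attention choice per query
log_probs = pre_softmax_attn.log_softmax(dim = -1)
one_hot = post_softmax_attn.argmax(dim = -1, keepdim = True)
log_prob = log_probs.gather(-1, one_hot)

print(rearrange(log_prob, 'b h i 1 -> b h i').shape)    # torch.Size([2, 4, 8])
```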
@@ -171,6 +180,9 @@ class Attend(Module):
         qk_norm = False,
         l2_distance = False,
         sigmoid = False,
+        gumbel_softmax = False,
+        gumbel_softmax_temp = 1.,
+        gumbel_softmax_hard = True,
         custom_attn_fn: Callable | None = None,
         flash = False,
         softclamp_logits = False,
@@ -203,7 +215,7 @@ class Attend(Module):
         assert not (flash and hard), 'hard attention not available for flash'
         assert not (flash and is_sparse_topk_attn), 'topk attention not available for flash'
 
-        assert at_most_one_of(sigmoid, hard, l2_distance, is_sparse_topk_attn)
+        assert at_most_one_of(sigmoid, hard, l2_distance, gumbel_softmax, is_sparse_topk_attn)
 
         if exists(custom_attn_fn):
             self.attn_fn = custom_attn_fn
@@ -213,6 +225,8 @@ class Attend(Module):
             self.attn_fn = one_hot_straight_through
         elif is_sparse_topk_attn:
             self.attn_fn = partial(sparse_topk_attn, sparse_topk = sparse_topk, straight_through = sparse_topk_straight_through)
+        elif gumbel_softmax:
+            self.attn_fn = partial(F.gumbel_softmax, dim = -1, tau = gumbel_softmax_temp, hard = gumbel_softmax_hard)
         else:
             softmax_fn = partial(F.softmax, dim = -1)
             self.attn_fn = partial(softmax_fn, dtype = torch.float32) if not qk_norm else softmax_fn
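A hedged usage sketch, not taken from the diff: constructing `Attend` directly with the new flag. It assumes `Attend`'s forward still takes `(q, k, v)` shaped `(batch, heads, seq, dim_head)` and returns `(out, intermediates)`, as in prior versions of `attend.py`, and that the gumbel softmax attention function applies on the default (non-flash) path.

```python
import torch
from x_transformers.attend import Attend

# assumed constructor / forward signatures, matching earlier releases
attend = Attend(
    causal = True,
    gumbel_softmax = True,        # new flag in this release
    gumbel_softmax_temp = 1.,     # tau passed to F.gumbel_softmax
    gumbel_softmax_hard = True    # straight-through one-hot attention weights
)

q = torch.randn(1, 8, 16, 64)     # (batch, heads, seq, dim_head)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

out, intermediates = attend(q, k, v)
print(out.shape)                  # torch.Size([1, 8, 16, 64])
```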
x_transformers/x_transformers.py
CHANGED
@@ -1336,6 +1336,9 @@ class Attention(Module):
         value_rmsnorm = False, # used in alphagenome and bytedance's GR3 for further stability
         l2_distance = False,
         sigmoid = False,
+        gumbel_softmax = False,
+        gumbel_softmax_temp = 1.,
+        gumbel_softmax_hard = True,
         selective = False,
         custom_attn_fn: Callable | None = None,
         hybrid_module: Module | None = None,
@@ -1541,6 +1544,9 @@ class Attention(Module):
             scale = qk_norm_scale if qk_norm else self.scale,
             l2_distance = l2_distance,
             sigmoid = sigmoid,
+            gumbel_softmax = gumbel_softmax,
+            gumbel_softmax_temp = gumbel_softmax_temp,
+            gumbel_softmax_hard = gumbel_softmax_hard,
             selective = selective,
             custom_attn_fn = custom_attn_fn,
             add_zero_kv = add_zero_kv,
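For context, and not part of the diff: at the model level these options would most likely be reached through the library's usual `attn_`-prefix routing of `Attention` kwargs, e.g. via `Decoder`. A minimal sketch under that assumption:

```python
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 6,
        heads = 8,
        attn_gumbel_softmax = True,        # assumed to be forwarded to Attention(gumbel_softmax = True)
        attn_gumbel_softmax_temp = 1.,
        attn_gumbel_softmax_hard = True
    )
)

x = torch.randint(0, 256, (1, 1024))       # dummy token ids
logits = model(x)                          # (1, 1024, 256)
print(logits.shape)
```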
{x_transformers-2.9.0.dist-info → x_transformers-2.9.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.9.0
+Version: 2.9.2
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2574,4 +2574,16 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
 
+```bibtex
+@misc{yan2017hierarchicalmultiscaleattentionnetworks,
+    title = {Hierarchical Multi-scale Attention Networks for Action Recognition},
+    author = {Shiyang Yan and Jeremy S. Smith and Wenjin Lu and Bailing Zhang},
+    year = {2017},
+    eprint = {1708.07590},
+    archivePrefix = {arXiv},
+    primaryClass = {cs.CV},
+    url = {https://arxiv.org/abs/1708.07590},
+}
+```
+
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis
{x_transformers-2.9.0.dist-info → x_transformers-2.9.2.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 x_transformers/__init__.py,sha256=aVuhUU0572TJHW88BVc4yA2tla0Zb8l3NH7W4RZ1AEs,1005
-x_transformers/attend.py,sha256=
+x_transformers/attend.py,sha256=RZJT9pPlpqSG3nOUqQHNRR6jOeJ2r-Fvvar2wdu9HLw,18687
 x_transformers/autoregressive_wrapper.py,sha256=BsGO9xfVYkvynqbU1__tu_S_cxl7gss0YwnkhIa2baY,18401
 x_transformers/belief_state_wrapper.py,sha256=YLUMk6t2MhFBEw5lHDDHJHcoCxTIkHvxTNY__GGZEKU,13374
 x_transformers/continuous.py,sha256=WwpQCjyVY4PtuEAOFY68zqgklbF9I7AL5w6874YlDe8,13249
@@ -10,10 +10,10 @@ x_transformers/multi_input.py,sha256=tCh-fTJDj2ib4SMGtsa-AM8MxKzJAQSwqAXOu3HU2mg
 x_transformers/neo_mlp.py,sha256=XCNnnop9WLarcxap1kGuYc1x8GHvwkZiDRnXOxSl3Po,3452
 x_transformers/nonautoregressive_wrapper.py,sha256=hMQqNimGtchNIe13cR5LZule1V7I1qM5LmY8VQfVdnA,11698
 x_transformers/up_wrapper.py,sha256=YC2LN14_7Xx9Wtiek2rtEJ_qHqdfSmKlh3d7Cgxwd80,7073
-x_transformers/x_transformers.py,sha256=
+x_transformers/x_transformers.py,sha256=o6B10urcC7MRUrmoHOgYJgkrVDzHhX-jt6zZY3pZEgA,125700
 x_transformers/xl_autoregressive_wrapper.py,sha256=CvZMJ6A6PA-Y_bQAhnORwjJBSl6Vjq2IdW5KTdk8NI8,4195
 x_transformers/xval.py,sha256=AwwYUm8yDAtKQyKJDIhYMsiLTJ_skh3scUFMjp5sda8,8597
-x_transformers-2.9.
-x_transformers-2.9.
-x_transformers-2.9.
-x_transformers-2.9.
+x_transformers-2.9.2.dist-info/METADATA,sha256=3JsbSIp9fsGpuXopeIaIq4ffjYTJIHyqdRLxM21cfUM,95381
+x_transformers-2.9.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+x_transformers-2.9.2.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-2.9.2.dist-info/RECORD,,
File without changes
|
File without changes
|