x-transformers 1.29.2__py3-none-any.whl → 1.30.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- x_transformers/attend.py +7 -5
- x_transformers/autoregressive_wrapper.py +6 -4
- x_transformers/continuous.py +2 -2
- x_transformers/x_transformers.py +65 -27
- x_transformers/xval.py +2 -2
- {x_transformers-1.29.2.dist-info → x_transformers-1.30.1.dist-info}/METADATA +3 -3
- x_transformers-1.30.1.dist-info/RECORD +14 -0
- x_transformers-1.29.2.dist-info/RECORD +0 -14
- {x_transformers-1.29.2.dist-info → x_transformers-1.30.1.dist-info}/LICENSE +0 -0
- {x_transformers-1.29.2.dist-info → x_transformers-1.30.1.dist-info}/WHEEL +0 -0
- {x_transformers-1.29.2.dist-info → x_transformers-1.30.1.dist-info}/top_level.txt +0 -0
x_transformers/attend.py
CHANGED
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 from functools import partial
-from typing import
+from typing import Tuple

 import torch
 from torch import nn, einsum, Tensor
@@ -16,10 +18,10 @@ from einops import rearrange, repeat

 @dataclass
 class Intermediates:
-    qk_similarities:
-    pre_softmax_attn:
-    post_softmax_attn:
-    cached_kv:
+    qk_similarities: Tensor | None = None
+    pre_softmax_attn: Tensor | None = None
+    post_softmax_attn: Tensor | None = None
+    cached_kv: Tuple[Tensor, Tensor] | None = None

     def to_tuple(self):
         return (self.qk_similarities, self.pre_softmax_attn, self.post_softmax_attn)
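The new `from __future__ import annotations` import makes all annotations lazy strings (PEP 563), so the dataclass fields can use PEP 604 `X | None` unions even on Python versions older than 3.10. A minimal, self-contained sketch of the same pattern (field names mirror `Intermediates`; this is illustrative, not the packaged code):

```python
from __future__ import annotations  # annotations are stored as strings, never evaluated here

from dataclasses import dataclass
from typing import Tuple

from torch import Tensor


@dataclass
class Intermediates:
    # `Tensor | None` is accepted on Python < 3.10 because the dataclass
    # machinery only inspects the annotation strings, it does not evaluate them
    qk_similarities: Tensor | None = None
    pre_softmax_attn: Tensor | None = None
    post_softmax_attn: Tensor | None = None
    cached_kv: Tuple[Tensor, Tensor] | None = None

    def to_tuple(self):
        return (self.qk_similarities, self.pre_softmax_attn, self.post_softmax_attn)
```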
x_transformers/autoregressive_wrapper.py
CHANGED
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 from math import ceil, log
-from typing import
+from typing import Tuple, Callable

 import torch
 from torch import nn, Tensor
@@ -133,12 +135,12 @@ class AutoregressiveWrapper(Module):
         seq_len,
         eos_token = None,
         temperature = 1.,
-        prompt_lens:
+        prompt_lens: Tensor | None = None,
         filter_logits_fn: Callable = top_k,
         restrict_to_max_seq_len = True,
-        amateur_model:
+        amateur_model: Module | Tuple[Module] | None = None,
         filter_kwargs: dict = dict(),
-        contrastive_decode_kwargs:
+        contrastive_decode_kwargs: dict | Tuple[dict] = dict(
             beta = 0.5,
             alpha = 0.1
         ),
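For context, `prompt_lens` lets `generate` handle a batch of prompts with differing true lengths, while `amateur_model` and `contrastive_decode_kwargs` drive contrastive decoding. A hypothetical call, assuming a model built as in the project README; only the argument names come from the signature above, the values are invented:

```python
import torch
from x_transformers import TransformerWrapper, Decoder, AutoregressiveWrapper

model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(dim = 512, depth = 6, heads = 8)
)
wrapper = AutoregressiveWrapper(model)

prompts = torch.randint(0, 20000, (2, 128))   # batch of 2 prompts, padded to length 128
prompt_lens = torch.tensor([100, 128])        # per-sample true prompt lengths (illustrative)

generated = wrapper.generate(
    prompts,
    256,                         # number of tokens to sample
    prompt_lens = prompt_lens,   # variable-length prompt support
    temperature = 0.8,
)
```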
x_transformers/continuous.py
CHANGED
@@ -143,11 +143,11 @@ class ContinuousTransformerWrapper(nn.Module):

         if return_mems:
             hiddens = intermediates.hiddens
-            new_mems =
+            new_mems = tuple(t[..., -self.max_mem_len:, :].detach() for t in hiddens)
             return out, new_mems

         if return_attn:
-            attn_maps =
+            attn_maps = tuple(t.post_softmax_attn for t in intermediates.attn_intermediates)
             return out, attn_maps

         return out
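This hunk (and the identical one in `xval.py` further down) builds the returned memories as a tuple, keeping only the trailing `max_mem_len` positions of each layer's hidden state and detaching them from the autograd graph. A small standalone sketch of what that slice does, with made-up shapes:

```python
import torch

max_mem_len = 4
# pretend these are per-layer hidden states of shape (batch, seq, dim)
hiddens = [torch.randn(2, 10, 8) for _ in range(3)]

# keep only the last `max_mem_len` positions of every layer, detached from the graph
new_mems = tuple(t[..., -max_mem_len:, :].detach() for t in hiddens)

print([m.shape for m in new_mems])   # three tensors of shape (2, 4, 8)
```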
x_transformers/x_transformers.py
CHANGED
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 from random import random
 from packaging import version
@@ -11,7 +13,7 @@ from torch.cuda.amp import autocast
 from functools import partial, wraps
 from collections import namedtuple
 from dataclasses import dataclass
-from typing import List, Dict, Tuple, Callable
+from typing import List, Dict, Tuple, Callable

 from einops import rearrange, repeat, reduce, pack, unpack
 from einops.layers.torch import Rearrange
@@ -25,13 +27,13 @@ DEFAULT_DIM_HEAD = 64

 @dataclass
 class LayerIntermediates:
-    hiddens:
-    last_hidden:
-    attn_intermediates:
-    layer_hiddens:
-    attn_z_loss:
-    mems:
-    memory_tokens:
+    hiddens: List[Tensor] | None = None   # all hiddens, before the final norm (in pre-norm architecture)
+    last_hidden: Tensor | None = None     # very last hidden after all attention layers, after the final norm
+    attn_intermediates: List[Intermediates] | None = None
+    layer_hiddens: List[Tensor] | None = None
+    attn_z_loss: Tensor | None = None
+    mems: Tensor | None = None
+    memory_tokens: Tensor | None = None

 # helpers

@@ -140,7 +142,7 @@ def init_zero_(layer):
 # keyword argument helpers

 def pick_and_pop(keys, d):
-    values =
+    values = tuple(d.pop(key) for key in keys)
     return dict(zip(keys, values))

 def group_dict_by_key(cond, d):
@@ -149,7 +151,7 @@ def group_dict_by_key(cond, d):
         match = bool(cond(key))
         ind = int(not match)
         return_val[ind][key] = d[key]
-    return (
+    return tuple(return_val)

 def string_begins_with(prefix, str):
     return str.startswith(prefix)
@@ -159,7 +161,8 @@ def group_by_key_prefix(prefix, d):

 def groupby_prefix_and_trim(prefix, d):
     kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
-
+    prefix_len = len(prefix)
+    kwargs_without_prefix = {key[prefix_len:]: value for key, value in kwargs_with_prefix.items()}
     return kwargs_without_prefix, kwargs

 # structured dropout, more effective than traditional attention dropouts
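The rewritten keyword helpers now return tuples directly. To make the intent of `groupby_prefix_and_trim` concrete, here is a small illustrative run built from the code in the hunk above (the dictionary contents are invented for the example):

```python
from functools import partial

def string_begins_with(prefix, s):
    return s.startswith(prefix)

def group_dict_by_key(cond, d):
    # split `d` into (matching, non-matching) dicts according to `cond`
    return_val = [dict(), dict()]
    for key in d.keys():
        ind = int(not bool(cond(key)))
        return_val[ind][key] = d[key]
    return tuple(return_val)

def groupby_prefix_and_trim(prefix, d):
    # pull out prefixed kwargs and strip the prefix from their keys
    kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
    prefix_len = len(prefix)
    kwargs_without_prefix = {key[prefix_len:]: value for key, value in kwargs_with_prefix.items()}
    return kwargs_without_prefix, kwargs

attn_kwargs, rest = groupby_prefix_and_trim('attn_', dict(attn_dropout = 0.1, ff_mult = 4))
print(attn_kwargs)   # {'dropout': 0.1}
print(rest)          # {'ff_mult': 4}
```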
@@ -441,25 +444,27 @@ class RotaryEmbedding(Module):

     @autocast(enabled = False)
     def forward(self, t):
-        max_pos = t.max()+1
+        max_pos = t.max() + 1

         freqs = torch.einsum('i , j -> i j', t.type_as(self.inv_freq), self.inv_freq) / self.interpolation_factor
-        freqs = torch.
+        freqs = torch.stack((freqs, freqs), dim = -1)
+        freqs = rearrange(freqs, '... d r -> ... (d r)')

         if not exists(self.scale):
             return freqs, 1.

         power = (t - (max_pos // 2)) / self.scale_base
         scale = self.scale ** rearrange(power, 'n -> n 1')
-        scale = torch.
+        scale = torch.stack((scale, scale), dim = -1)
+        scale = rearrange(scale, '... d r -> ... (d r)')

         return freqs, scale

-
 def rotate_half(x):
-    x = rearrange(x, '... (
-    x1, x2 = x.unbind(dim = -
-
+    x = rearrange(x, '... (d r) -> ... d r', r = 2)
+    x1, x2 = x.unbind(dim = -1)
+    x = torch.stack((-x2, x1), dim = -1)
+    return rearrange(x, '... d r -> ... (d r)')

 @autocast(enabled = False)
 def apply_rotary_pos_emb(t, freqs, scale = 1):
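The rotary helpers now build the frequencies and scales as interleaved `(d r)` pairs (r = 2) via `torch.stack` plus an einops flatten, and `rotate_half` rotates each adjacent pair. A tiny standalone check of what the new `rotate_half` does to a vector (values chosen purely for illustration):

```python
import torch
from einops import rearrange

def rotate_half(x):
    # split the last dim into (d, 2) pairs, then rotate each pair (x1, x2) -> (-x2, x1)
    x = rearrange(x, '... (d r) -> ... d r', r = 2)
    x1, x2 = x.unbind(dim = -1)
    x = torch.stack((-x2, x1), dim = -1)
    return rearrange(x, '... d r -> ... (d r)')

v = torch.tensor([1., 2., 3., 4.])
print(rotate_half(v))   # tensor([-2., 1., -4., 3.])
```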
@@ -572,8 +577,8 @@ class GRUGating(Module):

 def shift(t, amount, mask = None):
     if amount == 0:
         return t
-
-
+
+    amount = min(amount, t.shape[1])

     if exists(mask):
         t = t.masked_fill(~mask[..., None], 0.)
@@ -597,6 +602,23 @@ class ShiftTokens(Module):
         x = torch.cat((*segments_to_shift, *rest), dim = -1)
         return self.fn(x, **kwargs)

+# post branch operator
+
+class LayerScale(Module):
+    def __init__(self, fn: Module, dim, init_value = 0.):
+        super().__init__()
+        self.fn = fn
+        self.gamma = nn.Parameter(torch.ones(dim) * init_value)
+
+    def forward(self, x, **kwargs):
+        out = self.fn(x, **kwargs)
+
+        if isinstance(out, Tensor):
+            return out * self.gamma
+
+        out, *rest = out
+        return out * self.gamma, *rest
+
 # feedforward

 class GLU(Module):
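The new `LayerScale` wrapper implements the LayerScale technique from CaiT (Touvron et al.): each branch's output is multiplied by a learned per-channel `gamma`, initialised near zero so a deep residual stack starts close to identity. A minimal sketch of wrapping an arbitrary branch this way; the `branch` module and all values here are invented for the example:

```python
import torch
from torch import nn, Tensor

class LayerScale(nn.Module):
    def __init__(self, fn: nn.Module, dim, init_value = 0.):
        super().__init__()
        self.fn = fn
        self.gamma = nn.Parameter(torch.ones(dim) * init_value)  # per-channel scale, starts at init_value

    def forward(self, x, **kwargs):
        out = self.fn(x, **kwargs)
        if isinstance(out, Tensor):
            return out * self.gamma
        out, *rest = out                  # the branch may also return intermediates
        return out * self.gamma, *rest

# stand-in feedforward branch
branch = nn.Sequential(nn.Linear(512, 2048), nn.GELU(), nn.Linear(2048, 512))
scaled = LayerScale(branch, dim = 512, init_value = 1e-5)

x = torch.randn(2, 16, 512)
print(scaled(x).shape)   # torch.Size([2, 16, 512]); output is near zero at init since gamma is tiny
```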
@@ -817,7 +839,7 @@ class Attention(Module):
         mem = None,
         mem_mask = None,
         return_intermediates = False,
-        cache:
+        cache: Intermediates | None = None,
     ):
         b, n, h, kv_h, head_scale, num_mem_kv, device, has_context = x.shape[0], x.shape[1], self.heads, self.kv_heads, self.head_scale, self.num_mem_kv, x.device, exists(context)

@@ -1024,11 +1046,11 @@ class AttentionLayers(Module):
         rotary_interpolation_factor = 1.,
         rotary_xpos_scale_base = 512,
         rotary_base_rescale_factor = 1.,
-
+        weight_tie_layers = False,
+        custom_layers: Tuple[str] | None = None,
+        layers_execute_order: Tuple[int] | None = None,
         sandwich_coef = None,
         par_ratio = None,
-        weight_tie_layers = False, # Albert - https://arxiv.org/abs/1909.11942
-        layers_execute_order = None, # generalizes weight tying, can do arbitrary layer execution orders
         residual_attn = False,
         cross_residual_attn = False,
         macaron = False,
@@ -1045,6 +1067,8 @@ class AttentionLayers(Module):
         layer_dropout = 0.,
         cross_attn_tokens_dropout = 0.,
         disable_abs_pos_emb = None,
+        use_layerscale = False,
+        layerscale_init_value = 0.,
         **kwargs
     ):
         super().__init__()
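These two flags expose LayerScale at the `AttentionLayers` level. A hypothetical configuration, assuming the usual pattern of passing `AttentionLayers`-family kwargs through `Decoder`; the hyperparameter values are illustrative:

```python
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 12,
        heads = 8,
        use_layerscale = True,          # wrap every attention / feedforward branch in LayerScale
        layerscale_init_value = 1e-5,   # small initial gamma, in the spirit of the CaiT paper
    )
)

logits = model(torch.randint(0, 20000, (1, 256)))
print(logits.shape)   # torch.Size([1, 256, 20000])
```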
@@ -1108,6 +1132,8 @@ class AttentionLayers(Module):

         self.cross_attend = cross_attend

+        # determine norm
+
         assert (int(use_scalenorm) + int(use_rmsnorm) + int(use_simple_rmsnorm)) <= 1, 'you can only use either scalenorm, rmsnorm, or simple rmsnorm'

         if use_scalenorm:
@@ -1121,6 +1147,8 @@ class AttentionLayers(Module):

         norm_fn = partial(norm_class, dim)

+        # determine default block layer type order
+
         if cross_attend and not only_cross:
             default_block = ('a', 'c', 'f')
         elif cross_attend and only_cross:
@@ -1131,6 +1159,13 @@ class AttentionLayers(Module):
         if macaron:
             default_block = ('f',) + default_block

+        # determine post branch wrapper
+
+        post_branch_fn = None
+
+        if use_layerscale:
+            post_branch_fn = partial(LayerScale, dim = dim, init_value = layerscale_init_value)
+
         # zero init

         if zero_init_branch_output:
@@ -1219,6 +1254,9 @@ class AttentionLayers(Module):
                 shift_range_lower = -layer_shift_tokens if not causal else 0
                 layer = ShiftTokens(range(shift_range_lower, shift_range_upper), layer)

+            if exists(post_branch_fn):
+                layer = post_branch_fn(layer)
+
             residual_fn = GRUGating if gate_residual else Residual
             residual = residual_fn(dim, scale_residual = scale_residual, scale_residual_constant = scale_residual_constant)

@@ -1248,8 +1286,8 @@ class AttentionLayers(Module):
         self_attn_kv_mask = None,
         mems = None,
         mem_masks = None,
-        seq_start_pos:
-        cache:
+        seq_start_pos: Tensor | None = None,
+        cache: LayerIntermediates | None = None,
         cache_age = 1,
         return_hiddens = False,
         rotary_pos_emb = None
@@ -1641,7 +1679,7 @@ class TransformerWrapper(Module):
         return_attn_z_loss = False,
         attn_z_loss_weight = 1e-4,
         seq_start_pos = None,
-        cache:
+        cache: LayerIntermediates | None = None,
         **kwargs
     ):
         b, n, device, num_mems, has_memory_tokens, emb_frac_gradient = x.shape[0], x.shape[1], x.device, self.num_memory_tokens, self.num_memory_tokens > 0, self.emb_frac_gradient
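The `cache: LayerIntermediates | None` annotations formalise the key/value cache that flows from the wrapper down through the attention layers to each `Attention` block, whose `Intermediates.cached_kv` holds the per-layer keys and values. As a rough, library-independent sketch of why such a cache helps during decoding (this assumes nothing about x-transformers' internals beyond that idea):

```python
import torch

def attend_with_cache(q_new, k_new, v_new, cached_kv = None):
    # q_new / k_new / v_new: (batch, heads, 1, dim_head) for the single newest token
    if cached_kv is not None:
        k_old, v_old = cached_kv
        k = torch.cat((k_old, k_new), dim = -2)   # reuse all previously computed keys
        v = torch.cat((v_old, v_new), dim = -2)   # ... and values
    else:
        k, v = k_new, v_new

    attn = (q_new @ k.transpose(-1, -2) * k.shape[-1] ** -0.5).softmax(dim = -1)
    out = attn @ v
    return out, (k, v)   # return the updated cache for the next decoding step

q = k = v = torch.randn(1, 8, 1, 64)
out, cache = attend_with_cache(q, k, v)          # first token: no cache yet
out, cache = attend_with_cache(q, k, v, cache)   # later tokens attend over cached k/v too
print(cache[0].shape)                            # torch.Size([1, 8, 2, 64])
```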
x_transformers/xval.py
CHANGED
@@ -176,11 +176,11 @@ class XValTransformerWrapper(nn.Module):

         if return_mems:
             hiddens = intermediates.hiddens
-            new_mems =
+            new_mems = tuple(t[..., -self.max_mem_len:, :].detach() for t in hiddens)
             return out, new_mems

         if return_attn:
-            attn_maps =
+            attn_maps = tuple(t.post_softmax_attn for t in intermediates.attn_intermediates)
             return out, attn_maps

         return out
{x_transformers-1.29.2.dist-info → x_transformers-1.30.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: x-transformers
-Version: 1.29.2
+Version: 1.30.1
 Summary: X-Transformers - Pytorch
 Home-page: https://github.com/lucidrains/x-transformers
 Author: Phil Wang
@@ -14,6 +14,6 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: torch >=
-Requires-Dist: einops >=0.
+Requires-Dist: torch >=2.0
+Requires-Dist: einops >=0.8.0

x_transformers-1.30.1.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+x_transformers/__init__.py,sha256=8LQl-dNL6vj8VHRx5LMSOlRDTXQvYOuM21PDXz8WdiI,703
+x_transformers/attend.py,sha256=Y9eE26I7BM8rGveabhiRhzw_xq9TY61Sp10QC1hX2O8,10192
+x_transformers/autoregressive_wrapper.py,sha256=uX8Mb0zLsQrZECt_9UGt35g7tC05Rk3nPqO6xp2FFCc,9619
+x_transformers/continuous.py,sha256=WO52n9lFAXv5-SGadi2cApGF8dkouN8QSTEOuC7erj8,6180
+x_transformers/dpo.py,sha256=LjvWgCkqTl-UuehrzQ8nkX5guLr4whYwsmm7SKSwdls,3450
+x_transformers/nonautoregressive_wrapper.py,sha256=ys_p8obc7lTeeodCqvkRKxOXQ1C9T3j5Jwr-JbVgnXk,10432
+x_transformers/x_transformers.py,sha256=EEfqwI-NANzrQf10Tc_bRSdjWOIEJdhxOfzeKY4osyI,66137
+x_transformers/xl_autoregressive_wrapper.py,sha256=DCx4n0_c1tFai4nOqaWVnqx2p9eutsZsDMiMP1ckxNU,4117
+x_transformers/xval.py,sha256=QE1ltYZTR_eGgIHPP2BrMWVWVLqMW-OpDZh87BSmQEg,8563
+x_transformers-1.30.1.dist-info/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-1.30.1.dist-info/METADATA,sha256=gkmRLAvk0l9_vkrTVBIWLnFq_cEtCYrI8oI3B07d9B8,661
+x_transformers-1.30.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+x_transformers-1.30.1.dist-info/top_level.txt,sha256=hO6KGpFuGucRNEtRfme4A_rGcM53AKwGP7RVlRIxS5Q,15
+x_transformers-1.30.1.dist-info/RECORD,,
x_transformers-1.29.2.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
-x_transformers/__init__.py,sha256=8LQl-dNL6vj8VHRx5LMSOlRDTXQvYOuM21PDXz8WdiI,703
-x_transformers/attend.py,sha256=L7vctHJ0PnECohu4cUu8yvY8cUrVyJxHmMFR0RGL0z4,10163
-x_transformers/autoregressive_wrapper.py,sha256=gYKIN5Rm8dMYSTX5yHpg9sPYyZf9rsRTJCNrYRdJ-Ww,9618
-x_transformers/continuous.py,sha256=dpHK4NSMDQAJQ_N3Uj9rip0fYGXyu0QCCO_OfEdbRGs,6192
-x_transformers/dpo.py,sha256=LjvWgCkqTl-UuehrzQ8nkX5guLr4whYwsmm7SKSwdls,3450
-x_transformers/nonautoregressive_wrapper.py,sha256=ys_p8obc7lTeeodCqvkRKxOXQ1C9T3j5Jwr-JbVgnXk,10432
-x_transformers/x_transformers.py,sha256=vPt5x0Pg03xGf8t2rZGW0zPd8xP0uvGLQvROFlmmOao,65200
-x_transformers/xl_autoregressive_wrapper.py,sha256=DCx4n0_c1tFai4nOqaWVnqx2p9eutsZsDMiMP1ckxNU,4117
-x_transformers/xval.py,sha256=EN3hxxleTRGYeAz6i4x3U_PrOm9TjxMF3eDhMKGx59E,8575
-x_transformers-1.29.2.dist-info/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
-x_transformers-1.29.2.dist-info/METADATA,sha256=0_ON52HHs50Dcwp4PMfGhLWhDGKC9Rd4V3QAvmqxGyo,661
-x_transformers-1.29.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-x_transformers-1.29.2.dist-info/top_level.txt,sha256=hO6KGpFuGucRNEtRfme4A_rGcM53AKwGP7RVlRIxS5Q,15
-x_transformers-1.29.2.dist-info/RECORD,,
{x_transformers-1.29.2.dist-info → x_transformers-1.30.1.dist-info}/LICENSE: File without changes
{x_transformers-1.29.2.dist-info → x_transformers-1.30.1.dist-info}/WHEEL: File without changes
{x_transformers-1.29.2.dist-info → x_transformers-1.30.1.dist-info}/top_level.txt: File without changes