x-transformers 1.34.0__py3-none-any.whl → 1.35.0__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- x_transformers/attend.py +21 -3
- x_transformers/x_transformers.py +62 -1
- {x_transformers-1.34.0.dist-info → x_transformers-1.35.0.dist-info}/METADATA +1 -1
- {x_transformers-1.34.0.dist-info → x_transformers-1.35.0.dist-info}/RECORD +7 -7
- {x_transformers-1.34.0.dist-info → x_transformers-1.35.0.dist-info}/WHEEL +1 -1
- {x_transformers-1.34.0.dist-info → x_transformers-1.35.0.dist-info}/LICENSE +0 -0
- {x_transformers-1.34.0.dist-info → x_transformers-1.35.0.dist-info}/top_level.txt +0 -0
x_transformers/attend.py
CHANGED
@@ -138,9 +138,27 @@ class Attend(Module):
         # flash attention

         self.flash = flash
-        assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'

-
+        torch_version = version.parse(torch.__version__)
+        assert not (flash and torch_version < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'
+
+        # torch 2.3 uses new backend and context manager
+
+        if torch_version >= version.parse('2.3'):
+            from torch.nn.attention import SDPBackend
+
+            str_to_backend = dict(
+                enable_flash = SDPBackend.FLASH_ATTENTION,
+                enable_mem_efficient = SDPBackend.EFFICIENT_ATTENTION,
+                enable_math = SDPBackend.MATH,
+                enable_cudnn = SDPBackend.CUDNN_ATTENTION
+            )
+
+            sdpa_backends = [str_to_backend[enable_str] for enable_str, enable in sdp_kwargs.items() if enable]
+
+            self.sdp_context_manager = partial(torch.nn.attention.sdpa_kernel, sdpa_backends)
+        else:
+            self.sdp_context_manager = partial(torch.backends.cuda.sdp_kernel, **sdp_kwargs)

     def flash_attn(
         self,
@@ -231,7 +249,7 @@ class Attend(Module):

         # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale

-        with
+        with self.sdp_context_manager():
             out = F.scaled_dot_product_attention(
                 q, k, v,
                 attn_mask = mask,
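Taken together, the two attend.py hunks choose the scaled-dot-product-attention context manager once at construction: the newer torch.nn.attention.sdpa_kernel (which takes a list of SDPBackend enums) on torch 2.3+, falling back to the keyword-based torch.backends.cuda.sdp_kernel otherwise, and flash_attn then simply enters self.sdp_context_manager(). Below is a minimal standalone sketch of that selection logic; the sdp_kwargs dict of enable flags and the tensor shapes are illustrative assumptions, not the library's actual defaults.

# Hedged sketch of the version-gated SDP backend selection introduced in 1.35.0;
# `sdp_kwargs` here is an assumed dict of enable flags mirroring the names in the diff.
from functools import partial

import torch
import torch.nn.functional as F
from packaging import version

sdp_kwargs = dict(enable_flash = True, enable_math = True, enable_mem_efficient = True)

torch_version = version.parse(torch.__version__)

if torch_version >= version.parse('2.3'):
    # torch >= 2.3: a single context manager that accepts a list of SDPBackend enums
    from torch.nn.attention import SDPBackend, sdpa_kernel

    str_to_backend = dict(
        enable_flash = SDPBackend.FLASH_ATTENTION,
        enable_mem_efficient = SDPBackend.EFFICIENT_ATTENTION,
        enable_math = SDPBackend.MATH,
        enable_cudnn = SDPBackend.CUDNN_ATTENTION
    )

    backends = [str_to_backend[k] for k, enabled in sdp_kwargs.items() if enabled]
    sdp_context_manager = partial(sdpa_kernel, backends)
else:
    # older torch: the keyword-based (now deprecated) context manager
    sdp_context_manager = partial(torch.backends.cuda.sdp_kernel, **sdp_kwargs)

q = k = v = torch.randn(2, 8, 128, 64)  # (batch, heads, seq, dim_head)

with sdp_context_manager():
    out = F.scaled_dot_product_attention(q, k, v, is_causal = True)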
x_transformers/x_transformers.py
CHANGED
@@ -810,6 +810,19 @@ class AdaptiveLayerScale(Module):
         out, *rest = out
         return out * gamma, *rest

+# skip connection combining
+
+class ConcatCombine(Module):
+    def __init__(self, dim, prev_layer_ind):
+        super().__init__()
+        self.prev_layer_ind = prev_layer_ind
+        self.combine = nn.Linear(dim * 2, dim, bias = False)
+
+    def forward(self, x, prev_layers: list[Tensor]):
+        skip = prev_layers[self.prev_layer_ind]
+        concatted_skip = torch.cat((skip, x), dim = -1)
+        return self.combine(concatted_skip)
+
 # feedforward

 class GLU(Module):
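The new ConcatCombine module concatenates a previously recorded hidden state onto the current one along the feature dimension and projects back down to dim with a bias-free linear layer. A small illustrative check of that behaviour on its own; the tensor sizes below are arbitrary.

# Illustrative standalone use of ConcatCombine; shapes are arbitrary.
import torch
from x_transformers.x_transformers import ConcatCombine

combine = ConcatCombine(dim = 512, prev_layer_ind = 0)

x = torch.randn(2, 1024, 512)               # current hidden states
prev_layers = [torch.randn(2, 1024, 512)]   # hiddens saved from earlier layers

out = combine(x, prev_layers)               # cat to feature dim 1024, project back to 512
assert out.shape == (2, 1024, 512)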
@@ -1307,6 +1320,7 @@ class AttentionLayers(Module):
         disable_abs_pos_emb = None,
         use_layerscale = False,
         layerscale_init_value = 0.,
+        unet_skips = False,
         **kwargs
     ):
         super().__init__()

@@ -1468,6 +1482,8 @@ class AttentionLayers(Module):

         # calculate layer block order

+        len_default_block = 1
+
         if exists(custom_layers):
             layer_types = custom_layers
         elif exists(par_ratio):

@@ -1487,6 +1503,7 @@ class AttentionLayers(Module):
         else:
             assert exists(depth), '`depth` must be passed in for `Decoder` or `Encoder`'
             layer_types = default_block * depth
+            len_default_block = len(default_block)

         self.layer_types = layer_types
         self.layers_execute_order = default(layers_execute_order, tuple(range(len(layer_types))))

@@ -1522,11 +1539,31 @@ class AttentionLayers(Module):

         self.final_norm = norm_fn() if pre_norm or resi_dual else nn.Identity()

+        # whether unet or not
+
+        self.unet_skips = unet_skips
+        num_skips = self.depth // len_default_block
+
+        assert not (unet_skips and num_skips == 0), 'must have depth of at least 2 for unet skip connections'
+
+        skip_indices = [i * len_default_block for i in range(num_skips)]
+
+        self.skip_combines = ModuleList([])
+
         # iterate and construct layers

         for ind, (layer_type, layer_shift_tokens) in enumerate(zip(self.layer_types, shift_tokens)):
+
+            # `ind` is the index of each module - attention, feedforward, cross attention
+            # but `block_ind` refers to the typical enumeration of a transformer block (attn + ff + [optional] cross attn)
+
+            block_begin = divisible_by(ind, len_default_block)
+            block_ind = ind // len_default_block
+
             is_last_layer = ind == (len(self.layer_types) - 1)

+            # attention, cross attention, feedforward
+
             if layer_type == 'a':
                 layer = Attention(dim, heads = heads, causal = causal, **attn_kwargs)
             elif layer_type == 'c':

@@ -1548,6 +1585,14 @@ class AttentionLayers(Module):
             residual_fn = GRUGating if gate_residual else Residual
             residual = residual_fn(dim, scale_residual = scale_residual, scale_residual_constant = scale_residual_constant)

+            # handle unet skip connection
+
+            skip_combine = None
+            is_latter_half = block_begin and block_ind >= (self.depth / 2)
+
+            if self.unet_skips and is_latter_half:
+                skip_combine = ConcatCombine(dim, skip_indices.pop())
+
             # all normalizations of the layer

             pre_branch_norm = norm_fn() if pre_norm else None

@@ -1560,6 +1605,8 @@ class AttentionLayers(Module):
                 post_main_norm
             ])

+            self.skip_combines.append(skip_combine)
+
             self.layers.append(ModuleList([
                 norms,
                 layer,
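For intuition about the constructor logic above, here is a tiny pure-Python trace under stated assumptions: a hypothetical decoder of 4 blocks with the default ('a', 'f') block, so len_default_block = 2, and self.depth taken to be the block count. Latter-half blocks pop mirrored entries off skip_indices, which is what produces the U-Net style pairing.

# Toy trace (not library code) of how skip sources get assigned in the constructor.
depth = 4
len_default_block = 2                              # ('a', 'f') per block in a Decoder

num_skips = depth // len_default_block             # 2
skip_indices = [i * len_default_block for i in range(num_skips)]   # [0, 2]

pairs = {}
for ind in range(depth * len_default_block):       # module indices 0..7
    block_begin = (ind % len_default_block) == 0   # same test as divisible_by(ind, len_default_block)
    block_ind = ind // len_default_block

    if block_begin and block_ind >= depth / 2:     # latter half of the blocks
        pairs[block_ind] = skip_indices.pop()

print(pairs)  # {2: 2, 3: 0} -> block 2 reuses the hidden entering block 1, block 3 the one entering block 0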
@@ -1670,6 +1717,7 @@ class AttentionLayers(Module):

         layer_variables = (
             self.layer_types,
+            self.skip_combines,
             self.layers,
             self.layer_dropouts
         )

@@ -1680,11 +1728,24 @@ class AttentionLayers(Module):

         layer_variables = tuple(tuple(layer_variable[i] for i in layers_execute_order) for layer_variable in layer_variables)

+        # store all hiddens for skips
+
+        skip_hiddens = []
+
         # go through the attention and feedforward layers

-        for ind, (layer_type, (norm, block, residual_fn), layer_dropout) in enumerate(zip(*layer_variables)):
+        for ind, (layer_type, skip_combine, (norm, block, residual_fn), layer_dropout) in enumerate(zip(*layer_variables)):
             is_last = ind == (len(self.layers) - 1)

+            # handle skip connections
+
+            skip_hiddens.append(x)
+
+            if exists(skip_combine):
+                x = skip_combine(x, skip_hiddens)
+
+            # layer dropout
+
             if self.training and layer_dropout > 0. and random() < layer_dropout:
                 continue
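Together these hunks give AttentionLayers optional U-Net style skips: the forward pass pushes every layer's input onto skip_hiddens, and any layer that was assigned a ConcatCombine fuses the matching early hidden back in before running its block. A minimal usage sketch follows; the hyperparameters are illustrative and only unet_skips is the flag new to 1.35.0.

# Sketch of enabling the new unet_skips flag on a decoder; sizes are illustrative.
import torch
from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 20000,
    max_seq_len = 1024,
    attn_layers = Decoder(
        dim = 512,
        depth = 8,            # needs a depth of at least 2 for the skip connections
        heads = 8,
        unet_skips = True     # latter-half blocks concat-combine hiddens from the first half
    )
)

x = torch.randint(0, 20000, (1, 1024))
logits = model(x)             # (1, 1024, 20000)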
{x_transformers-1.34.0.dist-info → x_transformers-1.35.0.dist-info}/RECORD
CHANGED
@@ -1,15 +1,15 @@
 x_transformers/__init__.py,sha256=-MkQrSc37cTVDX7AOykxunYnqVtFlQ7lb0Cse5dsGWU,793
-x_transformers/attend.py,sha256=
+x_transformers/attend.py,sha256=7q996VGYHGIsc0FQnN8WNiwHn3xny3i1biRwx7yW5vg,12090
 x_transformers/autoregressive_wrapper.py,sha256=ka_iiej5lEBOcbutWQgGrFVMDilz2PFWzLhBh5_tmmg,10366
 x_transformers/continuous.py,sha256=cIVEdhfei258__ziV7kQBrJMxCel54bExBTDrO9rfCI,6450
 x_transformers/dpo.py,sha256=LjvWgCkqTl-UuehrzQ8nkX5guLr4whYwsmm7SKSwdls,3450
 x_transformers/multi_input.py,sha256=tCh-fTJDj2ib4SMGtsa-AM8MxKzJAQSwqAXOu3HU2mg,9252
 x_transformers/nonautoregressive_wrapper.py,sha256=ys_p8obc7lTeeodCqvkRKxOXQ1C9T3j5Jwr-JbVgnXk,10432
-x_transformers/x_transformers.py,sha256=
+x_transformers/x_transformers.py,sha256=2oQoQs7RMbFrVdMeOddy6yq1MhJxnficjORmMWBjjPo,80593
 x_transformers/xl_autoregressive_wrapper.py,sha256=DCx4n0_c1tFai4nOqaWVnqx2p9eutsZsDMiMP1ckxNU,4117
 x_transformers/xval.py,sha256=QE1ltYZTR_eGgIHPP2BrMWVWVLqMW-OpDZh87BSmQEg,8563
-x_transformers-1.
-x_transformers-1.
-x_transformers-1.
-x_transformers-1.
-x_transformers-1.
+x_transformers-1.35.0.dist-info/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-1.35.0.dist-info/METADATA,sha256=D32aQ96BsP6BXjikkuZUHc77sO6thZVO9cI_xFgLQF0,661
+x_transformers-1.35.0.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
+x_transformers-1.35.0.dist-info/top_level.txt,sha256=hO6KGpFuGucRNEtRfme4A_rGcM53AKwGP7RVlRIxS5Q,15
+x_transformers-1.35.0.dist-info/RECORD,,

{x_transformers-1.34.0.dist-info → x_transformers-1.35.0.dist-info}/LICENSE
File without changes

{x_transformers-1.34.0.dist-info → x_transformers-1.35.0.dist-info}/top_level.txt
File without changes