PyPI - x-transformers - Versions diffs - 2.3.14__py3-none-any.whl → 2.3.16__py3-none-any.whl - Mend

x-transformers 2.3.14__py3-none-any.whl → 2.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

x_transformers/continuous.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
 import torch
-from torch import nn, cat, stack
+from torch import nn, cat, stack, arange
 from torch.nn import Module
 import torch.nn.functional as F
 from torch.distributions import Normal
@@ -64,7 +64,7 @@ class ContinuousTransformerWrapper(Module):
         use_abs_pos_emb = True,
         scaled_sinu_pos_emb = False,
         average_pool_embed = False,
-        probabilistic = False
+        probabilistic = False,
     ):
         super().__init__()
         dim = attn_layers.dim
@@ -130,6 +130,7 @@ class ContinuousTransformerWrapper(Module):
         sum_embeds = None,
         prepend_embeds = None,
         prepend_mask = None,
+        seq_start_pos = None,
         **kwargs
     ):
         batch, seq, orig_mask, device = *x.shape[:2], mask, x.device
@@ -138,14 +139,14 @@ class ContinuousTransformerWrapper(Module):
         if exists(lens):
             assert not exists(mask), 'either `mask` or `lens` passed in, but not both'
-            seq_arange = torch.arange(seq, device = device)
+            seq_arange = arange(seq, device = device)
             mask = einx.less('j, i -> i j', seq_arange, lens)
         # project in + positional embedding
         x = self.project_in(x)
-        x = x + self.pos_emb(x, pos = pos)
+        x = x + self.pos_emb(x, pos = pos, seq_start_pos = seq_start_pos)
         if exists(sum_embeds):
             x = x + sum_embeds
@@ -220,7 +221,8 @@ class ContinuousAutoregressiveWrapper(Module):
         self,
         net: ContinuousTransformerWrapper,
         loss_fn: Module | None = None,
-        equal_loss_weight_batch = False  # setting this to True, if the mask is passed in and sequences are variable in length, each sequence will be weighted the same (as opposed to each token)
+        equal_loss_weight_batch = False,  # setting this to True, if the mask is passed in and sequences are variable in length, each sequence will be weighted the same (as opposed to each token)
+        rollout_steps = 1                 # they used 2 rollout steps in a successful world model paper https://ai.meta.com/vjepa/
     ):
         super().__init__()
         self.net = net
@@ -234,6 +236,14 @@ class ContinuousAutoregressiveWrapper(Module):
         self.loss_fn = loss_fn
         self.equal_loss_weight_batch = equal_loss_weight_batch
+        # num rollout steps - if greater than one, recurrently feedback the output and enforce loss rollout steps - 1 ahead
+        # applied successfully in vjepa2 world model, with rollout steps of 2
+        # rollout steps of 1 would be the same as single step autoregressive
+        assert not (rollout_steps > 1 and probabilistic), f'rollout steps greater than 1 only supported for non-probabilistic'
+        assert 1 <= rollout_steps
+        self.rollout_steps = rollout_steps
     @torch.no_grad()
     def generate(
         self,
@@ -247,12 +257,13 @@ class ContinuousAutoregressiveWrapper(Module):
         device = start_tokens.device
         was_training = self.net.training
-        num_dims = len(start_tokens.shape)
+        num_dims = start_tokens.ndim
         assert num_dims >= 2, 'number of dimensions of your start tokens must be greater or equal to 2'
+        no_batch = num_dims == 2
-        if num_dims == 2:
-            start_tokens = start_tokens[None, :]
+        if no_batch:
+            start_tokens = rearrange(start_tokens, 'n d -> 1 n d')
         b, t, _, device = *start_tokens.shape, start_tokens.device
@@ -281,8 +292,8 @@ class ContinuousAutoregressiveWrapper(Module):
         out = out[:, t:]
-        if num_dims == 2:
-            out = out.squeeze(0)
+        if no_batch:
+            out = rearrange(out, '1 n d -> n d')
         self.net.train(was_training)
         return out
@@ -292,7 +303,37 @@ class ContinuousAutoregressiveWrapper(Module):
         x,
         **kwargs
     ):
-        inp, target = x[:, :-1], x[:, 1:]
+        steps = self.rollout_steps
+        one_step_autoregress = steps == 1
+        # get the input
+        inp = x[:, :-steps]
+        # variables
+        batch, seq_len, device = *inp.shape[:2], inp.device
+        # get target
+        seq_start_pos = None
+        if one_step_autoregress:
+            target = x[:, None, 1:]
+        else:
+            batch_arange = arange(batch, device = device)
+            batch_arange = rearrange(batch_arange, 'b -> b 1 1')
+            seq_arange = arange(seq_len, device = device)
+            steps_arange = arange(steps, device = device) + 1
+            target_indices = einx.add('r, n -> r n', steps_arange, seq_arange)
+            target = x[batch_arange, target_indices] # rollout targets
+            seq_start_pos = torch.zeros(batch, device = device, dtype = torch.long)
+        # assert inputs
         assert 'prepend_embeds' not in kwargs
@@ -303,29 +344,57 @@ class ContinuousAutoregressiveWrapper(Module):
         if exists(lens):
             assert 'mask' not in kwargs, 'either `mask` or `lens` passed in, but not both'
             seq_len, device = inp.shape[1], inp.device
-            seq_arange = torch.arange(seq_len, device = device)
+            seq_arange = arange(seq_len, device = device)
             mask = einx.less('j, i -> i j', seq_arange, lens)
             kwargs['mask'] = mask
-        # mask
+        # handle mask manually
-        mask = kwargs.get('mask', None)
+        mask = kwargs.pop('mask', None)
-        if exists(mask) and mask.shape[1] == x.shape[1]:
-            mask = mask[:, :-1]
-            kwargs['mask'] = mask
+        has_mask = exists(mask)
+        # maybe rollout
+        outputs = []
+        masks = []
+        for step_index in range(steps):
+            step_mask = None
+            if has_mask:
+                step_mask = mask[:, step_index:(step_index + seq_len)]
+                masks.append(step_mask)
+            # forward
+            out = self.net(inp, mask = step_mask, seq_start_pos = seq_start_pos, **kwargs)
+            outputs.append(out)
+            inp = out
+            if not one_step_autoregress:
+                seq_start_pos.sub_(1)
+        # stack masks and predictions from rollouts
+        masks = stack(masks, dim = 1) if exists(mask) else None
+        pred = stack(outputs, dim = 1)
+        # loss
-        out = self.net(inp, **kwargs)
+        loss = self.loss_fn(pred, target)
-        loss = self.loss_fn(out, target)
+        # adjusting loss based on mask
-        if exists(mask):
+        if has_mask:
             assert loss.ndim > 1, 'loss should not be reduced if mask is passed in'
             if self.equal_loss_weight_batch:
-                loss = masked_mean(loss, mask)
+                loss = masked_mean(loss, masks)
             else:
-                loss = loss[mask]
+                loss = loss[masks]
         return loss.mean()

{x_transformers-2.3.14.dist-info → x_transformers-2.3.16.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: x-transformers
-Version: 2.3.14
+Version: 2.3.16
 Summary: X-Transformers
 Project-URL: Homepage, https://pypi.org/project/x-transformers/
 Project-URL: Repository, https://github.com/lucidrains/x-transformers
@@ -2486,4 +2486,13 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17)
 }
 ```
+```bibtex
+@inproceedings{Assran2025VJEPA2S,
+    title   = {V-JEPA 2: Self-Supervised Video Models Enable Understanding, Prediction and Planning},
+    author  = {Mahmoud Assran and Adrien Bardes and David Fan and Quentin Garrido and Russell Howes and Mojtaba Komeili and Matthew Muckley and Ammar Rizvi and Claire Roberts and Koustuv Sinha and Artem Zholus and Sergio Arnaud and Abha Gejji and Ada Martin and Francois Robert Hogan and Daniel Dugas and Piotr Bojanowski and Vasil Khalidov and Patrick Labatut and Francisco Massa and Marc Szafraniec and Kapil Krishnakumar and Yong Li and Xiaodong Ma and Sarath Chandar and Franziska Meier and Yann LeCun and Michael Rabbat and Nicolas Ballas and Fair at Meta and Mila - Qu{\'e}bec and AI Institute and Polytechnique Montr{\'e}al},
+    year    = {2025},
+    url     = {https://api.semanticscholar.org/CorpusID:279306055}
+}
+```
 *solve intelligence... then use that to solve everything else.* - Demis Hassabis

{x_transformers-2.3.14.dist-info → x_transformers-2.3.16.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ x_transformers/__init__.py,sha256=h3I2ejobgEdy8H7NgV-rP8UaBCnd16-MysvDXH9GMEA,98
 x_transformers/attend.py,sha256=xFsBtl7h7_qebPh7kE81BpmCWAjCgFpB9i_IHu_91es,17288
 x_transformers/autoregressive_wrapper.py,sha256=reLCno9Z9pchVU79tBF8OMo21LwSZ67KAeB83jqkyAc,10505
 x_transformers/belief_state_wrapper.py,sha256=YLUMk6t2MhFBEw5lHDDHJHcoCxTIkHvxTNY__GGZEKU,13374
-x_transformers/continuous.py,sha256=KPKi7TKqHYcDWYVhSkSB9y5iZMnhzVZxHhjJRdL7w5I,9521
+x_transformers/continuous.py,sha256=jy2wsQ3sS80Qwm_gnAmdAnzBfzLoWrGPacOTzU1Q6JM,11674
 x_transformers/dpo.py,sha256=xt4OuOWhU8pN3OKN2LZAaC2NC8iiEnchqqcrPWVqf0o,3521
 x_transformers/entropy_based_tokenizer.py,sha256=F2lO8-v3aLIcVDVNhu7RR-UtRdlmaaYJzBK9m7OnLE8,5018
 x_transformers/multi_input.py,sha256=tCh-fTJDj2ib4SMGtsa-AM8MxKzJAQSwqAXOu3HU2mg,9252
@@ -11,7 +11,7 @@ x_transformers/nonautoregressive_wrapper.py,sha256=2NU58hYMgn-4Jzg3mie-mXb0XH_dC
 x_transformers/x_transformers.py,sha256=ZfOXrZSiy2jlZ8wVmDdMTLW4hAY_qfmPQHW9t2ABxbo,114097
 x_transformers/xl_autoregressive_wrapper.py,sha256=CvZMJ6A6PA-Y_bQAhnORwjJBSl6Vjq2IdW5KTdk8NI8,4195
 x_transformers/xval.py,sha256=AwwYUm8yDAtKQyKJDIhYMsiLTJ_skh3scUFMjp5sda8,8597
-x_transformers-2.3.14.dist-info/METADATA,sha256=Tnvnrfnr-eIlUVEH3IePLykynVikAq-t01v4pSh3yPQ,89022
-x_transformers-2.3.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-x_transformers-2.3.14.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
-x_transformers-2.3.14.dist-info/RECORD,,
+x_transformers-2.3.16.dist-info/METADATA,sha256=-lL73g4mG5pszuaU7lPdMVGJ7ZtqBqhaejr5VvWWUiw,89897
+x_transformers-2.3.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+x_transformers-2.3.16.dist-info/licenses/LICENSE,sha256=As9u198X-U-vph5noInuUfqsAG2zX_oXPHDmdjwlPPY,1066
+x_transformers-2.3.16.dist-info/RECORD,,

{x_transformers-2.3.14.dist-info → x_transformers-2.3.16.dist-info}/WHEEL RENAMED Viewed

File without changes

{x_transformers-2.3.14.dist-info → x_transformers-2.3.16.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

x-transformers 2.3.14__py3-none-any.whl → 2.3.16__py3-none-any.whl

x-transformers 2.3.14__py3-none-any.whl → 2.3.16__py3-none-any.whl