PyPI - mimic-video - Versions diffs - 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl - Mend

mimic-video 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mimic-video might be problematic. Click here for more details.

Files changed (6)

mimic_video/mimic_video.py CHANGED Viewed

@@ -1,18 +1,20 @@
 import torch
-from torch import nn
+from torch import nn, cat, stack, is_tensor, tensor
 from torch.nn import Module, ModuleList, Linear
 import torch.nn.functional as F
 import einx
-from einops import einsum, rearrange
+from einops import einsum, rearrange, repeat
 from einops.layers.torch import Rearrange
 from x_mlps_pytorch import create_mlp
 from torch_einops_utils import (
     pad_left_ndim,
-    align_dims_left
+    align_dims_left,
+    pad_at_dim,
+    pack_with_inverse,
 )
 # ein notation
@@ -37,12 +39,23 @@ def divisible_by(num, den):
 # tensor function
+def cast_tensor(val, device = None):
+    return tensor(val, device = device) if not is_tensor(val) else val
 def max_neg_value(t):
     return -torch.finfo(t.dtype).max
 def l2norm(t, eps = 1e-10):
     return F.normalize(t, dim = -1, eps = eps)
+# token shift from Peng et al. of RWKV
+# cheap way to generate relative positions
+def shift_feature_dim(t):
+    x, x_shift = t.chunk(2, dim = -1)
+    x_shift = pad_at_dim(x_shift, (1, -1), dim = 1)
+    return cat((x, x_shift), dim = -1)
 # time
 # they follow p0's research finding with the beta distribution
@@ -196,6 +209,7 @@ class MimicVideo(Module):
         *,
         dim_video_hidden,
         dim_action = 20,
+        dim_joint_state = 32,
         depth = 8,
         dim_head = 64,
         heads = 8,
@@ -215,10 +229,10 @@ class MimicVideo(Module):
         dim_time_cond = default(dim_time_cond, dim * 2)
-        self.to_time_cond = nn.Sequential(
-            RandomFourierEmbed(dim),
-            create_mlp(dim_in = dim, dim = dim_time_cond, depth = 2, activation = nn.SiLU())
-        )
+        self.to_fourier_embed = RandomFourierEmbed(dim) # used by deepmind, its fine
+        self.to_time_cond = create_mlp(dim_in = dim * 2, dim = dim_time_cond, depth = 2, activation = nn.SiLU())
+        self.to_joint_state_token = Linear(dim_joint_state, dim)
         self.video_hidden_norm = nn.RMSNorm(dim_video_hidden)
@@ -262,17 +276,18 @@ class MimicVideo(Module):
         actions,
         video_hiddens, # they use layer 19 of cosmos predict, at first denoising step. that's all
         *,
+        joint_state,
         time = None,
+        time_video_denoise = 0., # 0 is noise in the scheme i prefer - default to their optimal choice, but can be changed
         context_mask = None,
     ):
+        batch, device = actions.shape[0], actions.device
         is_training = not exists(time)
         # handle flow time conditioning
         if is_training:
-            batch, device = actions.shape[0], actions.device
             time = torch.rand((batch,), device = device)
             time = self.sample_time_fn(time)
@@ -285,7 +300,28 @@ class MimicVideo(Module):
         else:
             noised = actions
-        time_cond = self.to_time_cond(time)
+        if time.ndim == 0:
+            time = rearrange(time, '-> b', b = batch)
+        # handle the video denoising times
+        time_video_denoise = cast_tensor(time_video_denoise)
+        if time_video_denoise.ndim == 0:
+            time_video_denoise = rearrange(time_video_denoise, '-> 1')
+        if time_video_denoise.shape[0] != batch:
+            time_video_denoise = repeat(time_video_denoise, '1 -> b', b = batch)
+        times = stack((time, time_video_denoise), dim = -1)
+        # fourier embed and mlp to time condition
+        fourier_embed = self.to_fourier_embed(times)
+        fourier_embed = rearrange(fourier_embed, '... times d -> ... (times d)')
+        time_cond = self.to_time_cond(fourier_embed)
         # handle video hiddens
@@ -295,6 +331,10 @@ class MimicVideo(Module):
         tokens = self.to_action_tokens(noised)
+        joint_state_token = self.to_joint_state_token(joint_state)
+        tokens, inverse_pack = pack_with_inverse((joint_state_token, tokens), 'b * d')
         # transformer layers
         for (
@@ -322,14 +362,28 @@ class MimicVideo(Module):
             tokens = residual + attn(tokens) * gate
-            # feedforward
+            # prepare feedforward
             residual = tokens
             tokens, gate = ff_norm(tokens, time_cond)
+            # shift along time for action tokens for cheap relative positioning, which is better than messing with rope with such short action chunks
+            joint_state_token, tokens = inverse_pack(tokens)
+            tokens = shift_feature_dim(tokens)
+            tokens, _ = pack_with_inverse((joint_state_token, tokens), 'b * d')
+            # feedforward
             tokens = residual + ff(tokens) * gate
+        # remove joint token
+        _, tokens = inverse_pack(tokens)
         # prediction
         pred_flow = self.to_pred_action_flow(tokens)

{mimic_video-0.0.1.dist-info → mimic_video-0.0.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mimic-video
-Version: 0.0.1
+Version: 0.0.3
 Summary: Mimic Video
 Project-URL: Homepage, https://pypi.org/project/mimic-video/
 Project-URL: Repository, https://github.com/lucidrains/mimic-video

mimic_video-0.0.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,6 @@
+mimic_video/__init__.py,sha256=-4HP_pbT4YLhRUwNwuL4qyLHbgDyQ099nHL7eVi0_Ag,48
+mimic_video/mimic_video.py,sha256=-2HVpXAgEG28JFkJeUlypdmOMyYDD2tw0Fisf9-BZ-M,10243
+mimic_video-0.0.3.dist-info/METADATA,sha256=MVJMzysTCCpsgxBKUA9ye-aFSeQAXinyP3ejCtJ8JD8,2960
+mimic_video-0.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+mimic_video-0.0.3.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+mimic_video-0.0.3.dist-info/RECORD,,

mimic_video-0.0.1.dist-info/RECORD DELETED Viewed

@@ -1,6 +0,0 @@
-mimic_video/__init__.py,sha256=-4HP_pbT4YLhRUwNwuL4qyLHbgDyQ099nHL7eVi0_Ag,48
-mimic_video/mimic_video.py,sha256=aejvjr1F3A7pZFikf-kEgeOpi1_53xVddBMpDPoxA90,8272
-mimic_video-0.0.1.dist-info/METADATA,sha256=414y344JcuIKQJss7d9riTrHszIwthHW8DDSSuRntdo,2960
-mimic_video-0.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-mimic_video-0.0.1.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-mimic_video-0.0.1.dist-info/RECORD,,

{mimic_video-0.0.1.dist-info → mimic_video-0.0.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{mimic_video-0.0.1.dist-info → mimic_video-0.0.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

mimic-video 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

Potentially problematic release.

mimic-video 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl