mimic-video 0.0.19__py3-none-any.whl → 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mimic-video might be problematic.

@@ -1,4 +1,5 @@
  from __future__ import annotations
+ from functools import partial

  import torch
  from torch import nn, cat, stack, is_tensor, tensor
@@ -7,7 +8,7 @@ from torch.nn import Module, ModuleList, Linear, GRU
  import torch.nn.functional as F

  import einx
- from einops import einsum, rearrange, repeat
+ from einops import einsum, rearrange, repeat, reduce
  from einops.layers.torch import Rearrange

  from x_mlps_pytorch import create_mlp
@@ -15,12 +16,16 @@ from x_mlps_pytorch import create_mlp
  from tqdm import tqdm

  from torch_einops_utils import (
+     lens_to_mask,
      pad_left_ndim,
      align_dims_left,
      pad_at_dim,
      pack_with_inverse,
+     masked_mean
  )

+ from hyper_connections.mHCv2 import get_init_and_expand_reduce_stream_functions
+
  # ein notation

  # b - batch
@@ -30,6 +35,10 @@ from torch_einops_utils import (
  # i, j - sequence (source, target)
  # d - feature dimension

+ # constants
+
+ LinearNoBias = partial(Linear, bias = False)
+
  # functions

  def exists(v):
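
A note on the new `LinearNoBias` constant: `functools.partial` simply pins `bias = False` on `nn.Linear`, so the many bias-free projections below can be declared tersely. A minimal sketch of the equivalence:

```python
import torch
from functools import partial
from torch import nn

LinearNoBias = partial(nn.Linear, bias = False)

proj = LinearNoBias(512, 256)       # same as nn.Linear(512, 256, bias = False)
assert proj.bias is None

out = proj(torch.randn(2, 512))     # (2, 256)
```
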
@@ -96,8 +105,8 @@ class AdaptiveRMSNorm(Module):
          self.scale = dim ** 0.5
          self.eps = eps

-         self.to_modulation = Linear(dim_time_cond, dim * 3, bias = False)
-         self.split_modulation = Rearrange('b (three d) -> three b 1 d', three = 3)
+         self.to_modulation = LinearNoBias(dim_time_cond, dim * 3)
+         self.split_modulation = Rearrange('... (three d) -> three ... d', three = 3)

          nn.init.zeros_(self.to_modulation.weight)

@@ -108,9 +117,8 @@ class AdaptiveRMSNorm(Module):
          tokens,
          time_cond
      ):
- 
-         if time_cond.ndim == 1:
-             time_cond = pad_left_ndim(time_cond, 1)
+         if time_cond.ndim == 2:
+             time_cond = rearrange(time_cond, 'b d -> b 1 d')

          modulations = self.to_modulation(time_cond)

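The change to `split_modulation` (from `b (three d)` to `... (three d)`), paired with the new `ndim == 2` lift, is what lets the time condition carry an extra per-token axis. A rough shape walk-through, with illustrative sizes (`b`, `n`, `dt`, `d` are placeholders, not names from the package):

```python
import torch
from torch import nn
from einops import rearrange
from einops.layers.torch import Rearrange

b, n, dt, d = 2, 10, 32, 64

to_modulation = nn.Linear(dt, d * 3, bias = False)
split_modulation = Rearrange('... (three d) -> three ... d', three = 3)

# per-sample condition: (b, dt) is lifted to (b, 1, d) modulations that broadcast over tokens
time_cond = rearrange(torch.randn(b, dt), 'b d -> b 1 d')
gamma, beta, gate = split_modulation(to_modulation(time_cond))   # each (b, 1, d)

# a per-token condition now flows through the same two modules unchanged
time_cond = torch.randn(b, n, dt)
gamma, beta, gate = split_modulation(to_modulation(time_cond))   # each (b, n, d)
```
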
@@ -134,7 +142,8 @@ class Attention(Module):
          dim_context = None,
          dim_head = 64,
          heads = 8,
-         kv_heads = 2
+         kv_heads = 2,
+         attn_gate_value = True
      ):
          super().__init__()
          dim_q_inner = dim_head * heads
@@ -143,9 +152,12 @@

          self.scale = dim_head ** -0.5

-         self.to_queries = Linear(dim, dim_q_inner, bias = False)
-         self.to_keys_values = Linear(dim_context, dim_kv_inner * 2, bias = False)
-         self.to_out = Linear(dim_q_inner, dim, bias = False)
+         self.to_queries = LinearNoBias(dim, dim_q_inner)
+         self.to_keys_values = LinearNoBias(dim_context, dim_kv_inner * 2)
+
+         self.attn_gate_value = nn.Sequential(LinearNoBias(dim, heads), Rearrange('b n (g h) -> b g h n 1', h = kv_heads))
+
+         self.to_out = LinearNoBias(dim_q_inner, dim)

          assert divisible_by(heads, kv_heads)
          groups = heads // kv_heads
@@ -185,6 +197,8 @@

          out = einsum(attn, values, 'b g h i j, b h j d -> b g h i d')

+         out = out * self.attn_gate_value(tokens).sigmoid()
+
          out = self.merge_heads(out)

          out = self.to_out(out)
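
The new `attn_gate_value` module is a learned sigmoid gate on the attention output, one gate per head and position, broadcast over the feature dimension and laid out to match the grouped-query head arrangement. A standalone sketch of just the gating path (sizes are illustrative):

```python
import torch
from torch import nn
from einops.layers.torch import Rearrange

b, n, dim = 2, 16, 512
heads, kv_heads, dim_head = 8, 2, 64
groups = heads // kv_heads   # 4 query heads share each kv head

gate = nn.Sequential(
    nn.Linear(dim, heads, bias = False),
    Rearrange('b n (g h) -> b g h n 1', h = kv_heads),
)

tokens = torch.randn(b, n, dim)
out = torch.randn(b, groups, kv_heads, n, dim_head)   # attention output, pre head-merge

out = out * gate(tokens).sigmoid()   # (b, g, h, n, 1) gate broadcasts over dim_head
```
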
@@ -238,7 +252,11 @@ class MimicVideo(Module):
          expansion_factor = 4.,
          ada_ln_zero_bias = -5.,
          dim_time_cond = None,
-         sample_time_fn = None
+         sample_time_fn = None,
+         train_time_rtc = False,
+         train_time_rtc_max_delay = None,
+         num_residual_streams = 1,
+         mhc_kwargs: dict = dict()
      ):
          super().__init__()

@@ -288,6 +306,10 @@

          self.video_hidden_norm = nn.RMSNorm(dim_video_hidden)

+         # manifold constrained hyper connections (mHC) from bytedance + deepseek
+
+         init_hyper_conn, self.expand_stream, self.reduce_stream = get_init_and_expand_reduce_stream_functions(num_residual_streams, dim = dim, add_stream_embed = True, **mhc_kwargs)
+
          # rnn

          self.rnn = GRU(dim, dim)
@@ -309,11 +331,20 @@

              ff = SwiGLUFeedForward(dim = dim, expansion_factor = expansion_factor)

+             # maybe hyper connect
+
+             attn_residual = init_hyper_conn()
+             cross_attn_residual = init_hyper_conn()
+             ff_residual = init_hyper_conn()
+
              layers.append(ModuleList([
-                 attn_adanorm,
-                 attn,
+                 cross_attn_residual,
                  cross_attn_adanorm,
                  cross_attn,
+                 attn_residual,
+                 attn_adanorm,
+                 attn,
+                 ff_residual,
                  ff_adanorm,
                  ff
              ]))
@@ -327,6 +358,17 @@ class MimicVideo(Module):
              Linear(dim, dim_action, bias = False)
          )

+         # inference related
+
+         # train time RTC related - https://arxiv.org/abs/2512.05964
+
+         self.train_time_rtc = train_time_rtc
+
+         assert not train_time_rtc or exists(train_time_rtc_max_delay)
+         self.train_time_rtc_max_delay = train_time_rtc_max_delay
+
+         # aux loss and device
+
          self.register_buffer('zero', tensor(0.), persistent = False)

      @property
@@ -380,6 +422,7 @@ class MimicVideo(Module):
          assert actions.shape[-2:] == self.action_shape

          batch, device = actions.shape[0], actions.device
+         orig_actions = actions

          is_training = not exists(time) and not return_flow

@@ -393,7 +436,8 @@ class MimicVideo(Module):
              assert exists(self.video_predict_wrapper), f'`video_predict_wrapper` must be passed in if raw video is passed into MimicVideo'

              video_hiddens = self.video_predict_wrapper(video, prompts = prompts, prompt_token_ids = prompt_token_ids)
-             video_hiddens = video_hiddens.float() # maybe bfloat to float32
+
+             video_hiddens = video_hiddens.to(self.device).float() # maybe bfloat to float32

          video_hiddens, _ = pack_with_inverse(video_hiddens, 'b * d')

@@ -420,9 +464,24 @@
              actions, left_aligned_time = align_dims_left((actions, time))

              noised = noise.lerp(actions, left_aligned_time)
+
          else:
              noised = actions

+         # maybe train time rtc
+
+         action_loss_mask = None
+
+         if is_training and self.train_time_rtc:
+
+             rand_prefix_len = torch.randint(0, self.train_time_rtc_max_delay, (batch,), device = device)
+             action_prefix_mask = lens_to_mask(rand_prefix_len, self.action_chunk_len)
+
+             actions = einx.where('b na, b na d, b na d', action_prefix_mask, orig_actions, actions)
+             time = einx.where('b na, , b', action_prefix_mask, 1., time)
+
+             action_loss_mask = ~action_prefix_mask
+
          if time.ndim == 0:
              time = repeat(time, '-> b', b = batch)

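The train-time RTC block freezes a random prefix of each action chunk at its clean value with flow time 1, so the model learns to condition on already-executed actions, and the loss mask later drops those positions. A self-contained sketch of the masking logic, using `torch.where` in place of `einx.where` and a local stand-in for `lens_to_mask` (assumed to return True below each per-sample length, as in torch-einops-utils):

```python
import torch

def lens_to_mask(lens, total_len):
    # local stand-in: True where the position index is below each sample's length
    return torch.arange(total_len, device = lens.device) < lens[:, None]

batch, chunk_len, dim_action, max_delay = 4, 8, 6, 3

orig_actions = torch.randn(batch, chunk_len, dim_action)   # clean action targets
noised       = torch.randn(batch, chunk_len, dim_action)   # flow-noised actions
time         = torch.rand(batch)                           # flow time per sample

prefix_len  = torch.randint(0, max_delay, (batch,))
prefix_mask = lens_to_mask(prefix_len, chunk_len)          # (batch, chunk_len)

# prefix positions are shown the clean action at time = 1 (treated as fully denoised)
actions = torch.where(prefix_mask[..., None], orig_actions, noised)
time    = torch.where(prefix_mask, torch.ones(()), time[:, None])   # now (batch, chunk_len)

action_loss_mask = ~prefix_mask   # only the free suffix contributes to the flow loss
```
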
@@ -436,8 +495,14 @@
          if time_video_denoise.shape[0] != batch:
              time_video_denoise = repeat(time_video_denoise, '1 -> b', b = batch)

+         if time.ndim == 2:
+             time_video_denoise = repeat(time_video_denoise, 'b -> b n', n = time.shape[-1])
+
          times = stack((time, time_video_denoise), dim = -1)

+         if times.ndim == 3:
+             times = pad_at_dim(times, (1, 0), dim = 1, value = 1.) # handle joint state token on the action
+
          # fourier embed and mlp to time condition

          fourier_embed = self.to_fourier_embed(times)
@@ -468,41 +533,48 @@

          tokens, inverse_pack = pack_with_inverse((joint_state_token, tokens), 'b * d')

+         # maybe expand streams
+
+         tokens = self.expand_stream(tokens)
+
          # transformer layers

          for ((
-             attn_norm,
-             attn,
+             maybe_cross_attn_mhc,
              cross_attn_norm,
              cross_attn,
+             maybe_attn_mhc,
+             attn_norm,
+             attn,
+             maybe_ff_mhc,
              ff_norm,
              ff
          ), cached_video_kv) in zip(self.layers, prev_cached_video_hiddens_kv):

              # cross attention

-             residual = tokens
+             tokens, add_residual = maybe_cross_attn_mhc(tokens)

              tokens, gate = cross_attn_norm(tokens, time_cond)

              cross_attn_out, video_kv = cross_attn(tokens, context = video_hiddens, context_mask = context_mask, kv = cached_video_kv, return_kv = True)

-             tokens = residual + cross_attn_out * gate
+             tokens = add_residual(cross_attn_out * gate)

              if return_cache:
                  next_cached_video_hiddens_kv.append(video_kv)

              # self attention

-             residual = tokens
+             tokens, add_residual = maybe_attn_mhc(tokens)

              tokens, gate = attn_norm(tokens, time_cond)

-             tokens = residual + attn(tokens) * gate
+             tokens = add_residual(attn(tokens) * gate)

              # prepare feedforward

-             residual = tokens
+             tokens, add_residual = maybe_ff_mhc(tokens)

              tokens, gate = ff_norm(tokens, time_cond)

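The mHC refactor swaps the hand-rolled `residual = tokens ... tokens = residual + out` pattern for the wrapper convention visible above: each hyper-connection returns the branch input plus an `add_residual` closure, with `expand_stream` / `reduce_stream` bracketing the layer stack. A minimal sketch of that control flow, assuming the `hyper_connections` v2 API behaves as these calls suggest (`residual_wrapper` and `branch` are our names; with `num_residual_streams = 1` this degenerates to a plain residual):

```python
import torch
from torch import nn
from hyper_connections.mHCv2 import get_init_and_expand_reduce_stream_functions

dim, num_residual_streams = 512, 4

init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(
    num_residual_streams, dim = dim, add_stream_embed = True
)

branch = nn.Linear(dim, dim)     # stand-in for one attention or feedforward branch
residual_wrapper = init_hyper_conn()

tokens = torch.randn(2, 16, dim)

tokens = expand_stream(tokens)                     # fan out into residual streams

tokens, add_residual = residual_wrapper(tokens)    # read the branch input off the streams
tokens = add_residual(branch(tokens))              # write the branch output back

tokens = reduce_stream(tokens)                     # collapse back to (2, 16, dim)
```
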
@@ -516,7 +588,11 @@

              # feedforward

-             tokens = residual + ff(tokens) * gate
+             tokens = add_residual(ff(tokens) * gate)
+
+         # maybe reduce streams
+
+         tokens = self.reduce_stream(tokens)

          # remove joint token

@@ -533,9 +609,9 @@
          else:
              # mse flow loss

-             flow_loss = F.mse_loss(pred_flow, flow)
+             flow_loss = F.mse_loss(pred_flow, flow, reduction = 'none')

-             out = flow_loss
+             out = masked_mean(flow_loss, action_loss_mask)

          if not return_cache:
              return out
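
With `reduction = 'none'` the MSE keeps its element-wise shape, so the RTC prefix positions can be excluded from the average. Assuming `masked_mean` averages over unmasked positions and falls back to a plain mean when the mask is `None`, a rough local stand-in (not the torch-einops-utils implementation):

```python
import torch
import torch.nn.functional as F

def masked_mean(t, mask = None):
    # rough stand-in for torch_einops_utils.masked_mean
    if mask is None:
        return t.mean()
    mask = mask[..., None]                          # (b, n) -> (b, n, 1), broadcast over features
    t = t.masked_fill(~mask, 0.)
    denom = (mask.sum() * t.shape[-1]).clamp(min = 1)
    return t.sum() / denom

pred_flow = torch.randn(4, 8, 6)
flow      = torch.randn(4, 8, 6)
loss_mask = torch.rand(4, 8) > 0.3                  # True where the position counts toward the loss

flow_loss = F.mse_loss(pred_flow, flow, reduction = 'none')   # (4, 8, 6), unreduced
loss = masked_mean(flow_loss, loss_mask)                      # scalar
```
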
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mimic-video
- Version: 0.0.19
+ Version: 0.0.24
  Summary: Mimic Video
  Project-URL: Homepage, https://pypi.org/project/mimic-video/
  Project-URL: Repository, https://github.com/lucidrains/mimic-video
@@ -36,7 +36,8 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.10
  Requires-Dist: einops>=0.8.1
  Requires-Dist: einx>=0.3.0
- Requires-Dist: torch-einops-utils>=0.0.8
+ Requires-Dist: hyper-connections>=0.4.3
+ Requires-Dist: torch-einops-utils>=0.0.12
  Requires-Dist: torch>=2.5
  Requires-Dist: tqdm
  Requires-Dist: x-mlps-pytorch
@@ -147,3 +148,16 @@ That's it
      url = {https://api.semanticscholar.org/CorpusID:283920528}
  }
  ```
+
+ ```bibtex
+ @misc{black2025trainingtimeactionconditioningefficient,
+     title = {Training-Time Action Conditioning for Efficient Real-Time Chunking},
+     author = {Kevin Black and Allen Z. Ren and Michael Equi and Sergey Levine},
+     year = {2025},
+     eprint = {2512.05964},
+     archivePrefix = {arXiv},
+     primaryClass = {cs.RO},
+     url = {https://arxiv.org/abs/2512.05964},
+ }
+ ```
+
@@ -0,0 +1,7 @@
+ mimic_video/__init__.py,sha256=Rs3QeBBGBKKi1U1ykcyeBrCL2XCbfNvppeeD1Fb1pdY,47
+ mimic_video/cosmos_predict.py,sha256=2XR9cqcUC4gKpjEDBy-GtLtMkLXvs8yKe7w8g6EeS6s,8471
+ mimic_video/mimic_video.py,sha256=Qr0Dc4z-LTRlTt0qXlgcJtdSP1pBsarXeOnJSUxj_yY,17388
+ mimic_video-0.0.24.dist-info/METADATA,sha256=4kXYmqL3XtJbZ35iX42Z85RFV_ZGMM_phKGUZWnfcaw,4581
+ mimic_video-0.0.24.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ mimic_video-0.0.24.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+ mimic_video-0.0.24.dist-info/RECORD,,
@@ -1,7 +0,0 @@
- mimic_video/__init__.py,sha256=Rs3QeBBGBKKi1U1ykcyeBrCL2XCbfNvppeeD1Fb1pdY,47
- mimic_video/cosmos_predict.py,sha256=2XR9cqcUC4gKpjEDBy-GtLtMkLXvs8yKe7w8g6EeS6s,8471
- mimic_video/mimic_video.py,sha256=wQNfdV2PGlR_-S-4Mm3cyswtRQ3nBQGJiHptya3ckKU,14761
- mimic_video-0.0.19.dist-info/METADATA,sha256=67a7iVIkf557qMos_yvpcqL8dK9yL0Jf1DPQuhb2bwo,4142
- mimic_video-0.0.19.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- mimic_video-0.0.19.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
- mimic_video-0.0.19.dist-info/RECORD,,