mimic-video 0.0.19-py3-none-any.whl → 0.0.27-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mimic-video has been flagged as potentially problematic.

mimic_video/mimic_video.py

@@ -1,4 +1,5 @@
  from __future__ import annotations
+ from functools import partial

  import torch
  from torch import nn, cat, stack, is_tensor, tensor
@@ -7,7 +8,7 @@ from torch.nn import Module, ModuleList, Linear, GRU
  import torch.nn.functional as F

  import einx
- from einops import einsum, rearrange, repeat
+ from einops import einsum, rearrange, repeat, reduce
  from einops.layers.torch import Rearrange

  from x_mlps_pytorch import create_mlp
@@ -15,12 +16,16 @@ from x_mlps_pytorch import create_mlp
  from tqdm import tqdm

  from torch_einops_utils import (
+     lens_to_mask,
      pad_left_ndim,
      align_dims_left,
      pad_at_dim,
      pack_with_inverse,
+     masked_mean
  )

+ from hyper_connections.mHCv2 import get_init_and_expand_reduce_stream_functions
+
  # ein notation

  # b - batch
@@ -30,6 +35,10 @@ from torch_einops_utils import (
  # i, j - sequence (source, target)
  # d - feature dimension

+ # constants
+
+ LinearNoBias = partial(Linear, bias = False)
+
  # functions

  def exists(v):
@@ -38,9 +47,22 @@ def exists(v):
  def default(v, d):
      return v if exists(v) else d

+ def identity(t):
+     return t
+
  def divisible_by(num, den):
      return (num % den) == 0

+ # wrappers
+
+ def eval_no_grad(fn):
+     def inner(*args, **kwargs):
+         with torch.no_grad():
+             fn.eval()
+             return fn(*args, **kwargs)
+
+     return inner
+
  # tensor function

  def cast_tensor(val, device = None):
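
The new `eval_no_grad` wrapper runs a module in eval mode with autograd disabled for the duration of a single call (note it does not restore the module's previous training mode afterwards). A minimal usage sketch, with a hypothetical stand-in module:

```python
import torch
from torch import nn

video_model = nn.Linear(8, 8)  # hypothetical stand-in for the video prediction model

wrapped = eval_no_grad(video_model)
out = wrapped(torch.randn(2, 8))  # forward runs in eval mode, no autograd graph is built

assert not out.requires_grad
```
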
@@ -60,6 +82,30 @@ def shift_feature_dim(t):
      x_shift = pad_at_dim(x_shift, (1, -1), dim = 1)
      return cat((x, x_shift), dim = -1)

+ # action normalization
+
+ class Normalizer(Module):
+     def __init__(
+         self,
+         mean,
+         std,
+         eps = 1e-6
+     ):
+         super().__init__()
+         assert (std > 0.).all(), 'std must be positive'
+         self.eps = eps
+
+         self.register_buffer('mean', mean)
+         self.register_buffer('std', std)
+
+     def normalize(self, t):
+         mean, std = self.mean, self.std
+         return (t - mean) / std.clamp_min(self.eps)
+
+     def inverse_normalize(self, t):
+         mean, std = self.mean, self.std
+         return (t * std) + mean
+
  # time

  # they follow p0's research finding with the beta distribution
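
`Normalizer` stores per-dimension action statistics as buffers and is built from the new `action_mean_std` constructor argument further down, a single tensor of shape `(2, dim_action)`. A sketch of deriving that tensor from a dataset of action chunks (the `all_actions` tensor is a placeholder):

```python
import torch

# placeholder dataset: (num_samples, action_chunk_len, dim_action)
all_actions = torch.randn(1000, 16, 7)

flat = all_actions.reshape(-1, all_actions.shape[-1])
action_mean_std = torch.stack((flat.mean(dim = 0), flat.std(dim = 0)))  # (2, dim_action)

normalizer = Normalizer(*action_mean_std)
recon = normalizer.inverse_normalize(normalizer.normalize(all_actions))

assert torch.allclose(recon, all_actions, atol = 1e-5)
```
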
@@ -96,8 +142,8 @@ class AdaptiveRMSNorm(Module):
          self.scale = dim ** 0.5
          self.eps = eps

-         self.to_modulation = Linear(dim_time_cond, dim * 3, bias = False)
-         self.split_modulation = Rearrange('b (three d) -> three b 1 d', three = 3)
+         self.to_modulation = LinearNoBias(dim_time_cond, dim * 3)
+         self.split_modulation = Rearrange('... (three d) -> three ... d', three = 3)

          nn.init.zeros_(self.to_modulation.weight)

@@ -108,9 +154,8 @@
          tokens,
          time_cond
      ):
-
-         if time_cond.ndim == 1:
-             time_cond = pad_left_ndim(time_cond, 1)
+         if time_cond.ndim == 2:
+             time_cond = rearrange(time_cond, 'b d -> b 1 d')

          modulations = self.to_modulation(time_cond)
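
These two changes generalize `AdaptiveRMSNorm` from per-sample to per-token time conditioning: the modulation split no longer hard-codes a `b 1 d` broadcast, and a 2-dim `time_cond` simply gains a sequence axis. This is what later lets each action token carry its own flow time under train-time RTC. A shape sketch of the new split:

```python
import torch
from torch import nn
from einops.layers.torch import Rearrange

b, n, d, d_cond = 2, 16, 64, 32

to_modulation = nn.Linear(d_cond, d * 3, bias = False)
split = Rearrange('... (three d) -> three ... d', three = 3)

# per-sample conditioning, broadcast over all n tokens
assert split(to_modulation(torch.randn(b, 1, d_cond))).shape == (3, b, 1, d)

# per-token conditioning, one (scale, shift, gate) triple per token
assert split(to_modulation(torch.randn(b, n, d_cond))).shape == (3, b, n, d)
```
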
@@ -134,7 +179,8 @@ class Attention(Module):
          dim_context = None,
          dim_head = 64,
          heads = 8,
-         kv_heads = 2
+         kv_heads = 2,
+         attn_gate_value = True
      ):
          super().__init__()
          dim_q_inner = dim_head * heads
@@ -143,9 +189,12 @@

          self.scale = dim_head ** -0.5

-         self.to_queries = Linear(dim, dim_q_inner, bias = False)
-         self.to_keys_values = Linear(dim_context, dim_kv_inner * 2, bias = False)
-         self.to_out = Linear(dim_q_inner, dim, bias = False)
+         self.to_queries = LinearNoBias(dim, dim_q_inner)
+         self.to_keys_values = LinearNoBias(dim_context, dim_kv_inner * 2)
+
+         self.attn_gate_value = nn.Sequential(LinearNoBias(dim, heads), Rearrange('b n (g h) -> b g h n 1', h = kv_heads))
+
+         self.to_out = LinearNoBias(dim_q_inner, dim)

          assert divisible_by(heads, kv_heads)
          groups = heads // kv_heads
@@ -185,6 +234,8 @@

          out = einsum(attn, values, 'b g h i j, b h j d -> b g h i d')

+         out = out * self.attn_gate_value(tokens).sigmoid()
+
          out = self.merge_heads(out)

          out = self.to_out(out)
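
The new `attn_gate_value` head gives each (group, kv-head, token) a sigmoid gate on the attention output before heads are merged. A shape-level sketch of the gate path, under the module's grouped-query layout:

```python
import torch
from torch import nn
from einops.layers.torch import Rearrange

b, n, dim = 2, 10, 512
heads, kv_heads, dim_head = 8, 2, 64
groups = heads // kv_heads

attn_gate_value = nn.Sequential(
    nn.Linear(dim, heads, bias = False),
    Rearrange('b n (g h) -> b g h n 1', h = kv_heads)
)

tokens = torch.randn(b, n, dim)
out = torch.randn(b, groups, kv_heads, n, dim_head)  # attention output, pre-merge

# one gate per (group, kv-head, token), broadcast across dim_head
out = out * attn_gate_value(tokens).sigmoid()
assert out.shape == (b, groups, kv_heads, n, dim_head)
```
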
@@ -238,7 +289,12 @@ class MimicVideo(Module):
          expansion_factor = 4.,
          ada_ln_zero_bias = -5.,
          dim_time_cond = None,
-         sample_time_fn = None
+         sample_time_fn = None,
+         train_time_rtc = False,
+         train_time_rtc_max_delay = None,
+         num_residual_streams = 1,
+         mhc_kwargs: dict = dict(),
+         action_mean_std: Tensor | None = None
      ):
          super().__init__()

@@ -248,12 +304,21 @@ class MimicVideo(Module):

          self.video_predict_wrapper = video_predict_wrapper

-         # dims
+         # action related

          self.action_chunk_len = action_chunk_len
          self.dim_action = dim_action

          self.action_shape = (action_chunk_len, dim_action)
+
+         self.action_normalizer = None
+
+         if exists(action_mean_std):
+             assert action_mean_std.shape == (2, dim_action), f'must be in shape of (2 action_dim)'
+             self.action_normalizer = Normalizer(*action_mean_std)
+
+         # joint dim
+
          self.dim_joint_state = dim_joint_state

          dim_video_hidden = default(dim_video_hidden, video_predict_wrapper.dim_latent if exists(video_predict_wrapper) else None)
@@ -288,6 +353,10 @@ class MimicVideo(Module):

          self.video_hidden_norm = nn.RMSNorm(dim_video_hidden)

+         # manifold constrained hyper connections (mHC) from bytedance + deepseek
+
+         init_hyper_conn, self.expand_stream, self.reduce_stream = get_init_and_expand_reduce_stream_functions(num_residual_streams, dim = dim, add_stream_embed = True, **mhc_kwargs)
+
          # rnn

          self.rnn = GRU(dim, dim)
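
`get_init_and_expand_reduce_stream_functions` is the entry point of the `hyper-connections` package. As used in the forward pass below: the token stream is expanded into `num_residual_streams` copies once, each residual branch gets its own hyper-connection that picks the branch input and folds the output back, and the streams are reduced to one at the end. A minimal sketch that mirrors how this diff uses the API (the `nn.Linear` branch is a placeholder; the exact stream semantics belong to the library):

```python
import torch
from torch import nn
from hyper_connections.mHCv2 import get_init_and_expand_reduce_stream_functions

dim = 64
init_hyper_conn, expand_stream, reduce_stream = get_init_and_expand_reduce_stream_functions(
    4, dim = dim, add_stream_embed = True
)

residual_fn = init_hyper_conn()  # one per residual branch
branch = nn.Linear(dim, dim)     # placeholder branch

tokens = torch.randn(2, 16, dim)

tokens = expand_stream(tokens)              # 1 stream -> 4 streams
tokens, add_residual = residual_fn(tokens)  # pick branch input, keep residual closure
tokens = add_residual(branch(tokens))       # fold branch output back into the streams
tokens = reduce_stream(tokens)              # 4 streams -> 1
```

With `num_residual_streams = 1` the same calls reduce to ordinary residual connections, which is why the layer variables below are named `maybe_*`.
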
@@ -309,11 +378,20 @@ class MimicVideo(Module):

              ff = SwiGLUFeedForward(dim = dim, expansion_factor = expansion_factor)

+             # maybe hyper connect
+
+             attn_residual = init_hyper_conn()
+             cross_attn_residual = init_hyper_conn()
+             ff_residual = init_hyper_conn()
+
              layers.append(ModuleList([
-                 attn_adanorm,
-                 attn,
+                 cross_attn_residual,
                  cross_attn_adanorm,
                  cross_attn,
+                 attn_residual,
+                 attn_adanorm,
+                 attn,
+                 ff_residual,
                  ff_adanorm,
                  ff
              ]))
@@ -327,8 +405,25 @@ class MimicVideo(Module):
              Linear(dim, dim_action, bias = False)
          )

+         # inference related
+
+         # train time RTC related - https://arxiv.org/abs/2512.05964
+
+         self.train_time_rtc = train_time_rtc
+
+         assert not train_time_rtc or exists(train_time_rtc_max_delay)
+         self.train_time_rtc_max_delay = train_time_rtc_max_delay
+
+         # aux loss and device
+
          self.register_buffer('zero', tensor(0.), persistent = False)

+     # only action parameters
+
+     def action_parameters(self):
+         video_model_params = set(self.video_predict_wrapper.parameters()) if exists(self.video_predict_wrapper) else {}
+         return set(self.parameters()) - video_model_params
+
      @property
      def device(self):
          return self.zero.device
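
`action_parameters` returns every parameter except those of the wrapped video model, so the action expert can be optimized while the video backbone stays frozen or is trained separately. A hedged usage sketch, assuming `model` is a constructed `MimicVideo`:

```python
import torch

# only the action-expert parameters receive updates; the video model is left alone
optimizer = torch.optim.AdamW(list(model.action_parameters()), lr = 3e-4)
```
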
@@ -338,26 +433,60 @@ class MimicVideo(Module):
          self,
          steps = 16,
          batch_size = 1,
+         prefix_action_chunk = None,
          disable_progress_bar = False,
          **kwargs
      ):

          self.eval()

+         inpainting = exists(prefix_action_chunk)
+
+         if inpainting:
+             prefix_len = prefix_action_chunk.shape[1]
+             assert prefix_len < self.action_chunk_len
+
+             maybe_normed_prefix = prefix_action_chunk
+
+             if exists(self.action_normalizer):
+                 maybe_normed_prefix = self.action_normalizer.normalize(prefix_action_chunk)
+
+         # noise
+
          noise = torch.randn((batch_size, *self.action_shape), device = self.device)

+         # times
+
          times = torch.linspace(0., 1., steps + 1, device = self.device)[:-1]
          delta = 1. / steps

+         # denoised action starts as noise
+
          denoised = noise

          cache = None

+         # denoise
+
          for time in tqdm(times, disable = disable_progress_bar):
+
+             if inpainting:
+                 denoised[:, :prefix_len] = maybe_normed_prefix
+
              pred_flow, cache = self.forward(actions = denoised, time = time, cache = cache, return_cache = True, **kwargs)

              denoised = denoised + delta * pred_flow

+         # handle action inverse norm
+
+         if exists(self.action_normalizer):
+             denoised = self.action_normalizer.inverse_normalize(denoised)
+
+         # final set, with unnormalized prefix, if inpainting
+
+         if inpainting:
+             denoised[:, :prefix_len] = prefix_action_chunk
+
          return denoised

      def forward(
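
`sample` now supports inpainting a committed action prefix for real-time chunking: at every denoising step the first `prefix_len` positions are overwritten with the (normalized) prefix, and the returned chunk carries the raw prefix verbatim. A usage sketch, assuming a trained `model` with `action_chunk_len = 16` and `dim_action = 7`, and a `conditioning` dict holding the usual forward kwargs (`video` or `video_hiddens`, prompts, joint state):

```python
import torch

prev_chunk_tail = torch.randn(1, 4, 7)  # actions already committed from the previous chunk

actions = model.sample(
    steps = 16,
    batch_size = 1,
    prefix_action_chunk = prev_chunk_tail,  # must be shorter than action_chunk_len
    **conditioning
)

assert actions.shape == (1, 16, 7)
assert torch.equal(actions[:, :4], prev_chunk_tail)  # prefix is returned untouched
```
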
@@ -372,6 +501,8 @@
          time_video_denoise = 0., # 0 is noise in the scheme i prefer - default to their optimal choice, but can be changed
          prompts = None,
          prompt_token_ids = None,
+         detach_video_hiddens = False,
+         no_grad_video_model_forward = False,
          cache = None,
          return_cache = False,
          return_flow = False
@@ -379,7 +510,11 @@
          assert not exists(self.video_predict_wrapper) or (exists(prompts) ^ exists(prompt_token_ids))
          assert actions.shape[-2:] == self.action_shape

+         if exists(self.action_normalizer):
+             actions = self.action_normalizer.normalize(actions)
+
          batch, device = actions.shape[0], actions.device
+         orig_actions = actions

          is_training = not exists(time) and not return_flow

@@ -392,8 +527,11 @@
          if not exists(video_hiddens):
              assert exists(self.video_predict_wrapper), f'`video_predict_wrapper` must be passed in if raw video is passed into MimicVideo'

-             video_hiddens = self.video_predict_wrapper(video, prompts = prompts, prompt_token_ids = prompt_token_ids)
-             video_hiddens = video_hiddens.float() # maybe bfloat to float32
+             video_forward_wrap = eval_no_grad if no_grad_video_model_forward else identity
+
+             video_hiddens = video_forward_wrap(self.video_predict_wrapper)(video, prompts = prompts, prompt_token_ids = prompt_token_ids)
+
+             video_hiddens = video_hiddens.to(self.device).float() # maybe bfloat to float32

          video_hiddens, _ = pack_with_inverse(video_hiddens, 'b * d')

@@ -401,6 +539,9 @@

          # handle video hiddens

+         if detach_video_hiddens:
+             video_hiddens = video_hiddens.detach()
+
          video_hiddens = self.video_hidden_norm(video_hiddens)

          # handle caching
@@ -420,9 +561,24 @@
              actions, left_aligned_time = align_dims_left((actions, time))

              noised = noise.lerp(actions, left_aligned_time)
+
          else:
              noised = actions

+         # maybe train time rtc
+
+         action_loss_mask = None
+
+         if is_training and self.train_time_rtc:
+
+             rand_prefix_len = torch.randint(0, self.train_time_rtc_max_delay, (batch,), device = device)
+             action_prefix_mask = lens_to_mask(rand_prefix_len, self.action_chunk_len)
+
+             actions = einx.where('b na, b na d, b na d', action_prefix_mask, orig_actions, actions)
+             time = einx.where('b na, , b', action_prefix_mask, 1., time)
+
+             action_loss_mask = ~action_prefix_mask
+
          if time.ndim == 0:
              time = repeat(time, '-> b', b = batch)

@@ -436,8 +592,14 @@
          if time_video_denoise.shape[0] != batch:
              time_video_denoise = repeat(time_video_denoise, '1 -> b', b = batch)

+         if time.ndim == 2:
+             time_video_denoise = repeat(time_video_denoise, 'b -> b n', n = time.shape[-1])
+
          times = stack((time, time_video_denoise), dim = -1)

+         if times.ndim == 3:
+             times = pad_at_dim(times, (1, 0), dim = 1, value = 1.) # handle joint state token on the action
+
          # fourier embed and mlp to time condition

          fourier_embed = self.to_fourier_embed(times)
@@ -468,41 +630,48 @@

          tokens, inverse_pack = pack_with_inverse((joint_state_token, tokens), 'b * d')

+         # maybe expand streams
+
+         tokens = self.expand_stream(tokens)
+
          # transformer layers

          for ((
-             attn_norm,
-             attn,
+             maybe_cross_attn_mhc,
              cross_attn_norm,
              cross_attn,
+             maybe_attn_mhc,
+             attn_norm,
+             attn,
+             maybe_ff_mhc,
              ff_norm,
              ff
          ), cached_video_kv) in zip(self.layers, prev_cached_video_hiddens_kv):

              # cross attention

-             residual = tokens
+             tokens, add_residual = maybe_cross_attn_mhc(tokens)

              tokens, gate = cross_attn_norm(tokens, time_cond)

              cross_attn_out, video_kv = cross_attn(tokens, context = video_hiddens, context_mask = context_mask, kv = cached_video_kv, return_kv = True)

-             tokens = residual + cross_attn_out * gate
+             tokens = add_residual(cross_attn_out * gate)

              if return_cache:
                  next_cached_video_hiddens_kv.append(video_kv)

              # self attention

-             residual = tokens
+             tokens, add_residual = maybe_attn_mhc(tokens)

              tokens, gate = attn_norm(tokens, time_cond)

-             tokens = residual + attn(tokens) * gate
+             tokens = add_residual(attn(tokens) * gate)

              # prepare feedforward

-             residual = tokens
+             tokens, add_residual = maybe_ff_mhc(tokens)

              tokens, gate = ff_norm(tokens, time_cond)
@@ -516,7 +685,11 @@

              # feedforward

-             tokens = residual + ff(tokens) * gate
+             tokens = add_residual(ff(tokens) * gate)
+
+         # maybe reduce streams
+
+         tokens = self.reduce_stream(tokens)

          # remove joint token

@@ -533,9 +706,9 @@
          else:
              # mse flow loss

-             flow_loss = F.mse_loss(pred_flow, flow)
+             flow_loss = F.mse_loss(pred_flow, flow, reduction = 'none')

-             out = flow_loss
+             out = masked_mean(flow_loss, action_loss_mask)

          if not return_cache:
              return out
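
With per-token loss masking, the reduction moves from a plain `F.mse_loss` mean to `masked_mean`, so inpainted prefix positions contribute nothing to the gradient. Assuming semantics like the common implementation (a plain mean when the mask is `None`), a hypothetical reference version:

```python
import torch

def masked_mean_ref(t, mask = None):
    # hypothetical reference; the real helper lives in torch_einops_utils
    if mask is None:
        return t.mean()

    mask = mask[..., None].expand_as(t)  # broadcast (b, na) mask over the feature dim
    return t[mask].sum() / mask.sum().clamp(min = 1)
```
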
mimic_video-0.0.27.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mimic-video
- Version: 0.0.19
+ Version: 0.0.27
  Summary: Mimic Video
  Project-URL: Homepage, https://pypi.org/project/mimic-video/
  Project-URL: Repository, https://github.com/lucidrains/mimic-video
@@ -36,7 +36,8 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.10
  Requires-Dist: einops>=0.8.1
  Requires-Dist: einx>=0.3.0
- Requires-Dist: torch-einops-utils>=0.0.8
+ Requires-Dist: hyper-connections>=0.4.3
+ Requires-Dist: torch-einops-utils>=0.0.12
  Requires-Dist: torch>=2.5
  Requires-Dist: tqdm
  Requires-Dist: x-mlps-pytorch
@@ -147,3 +148,16 @@ That's it
      url = {https://api.semanticscholar.org/CorpusID:283920528}
  }
  ```
+
+ ```bibtex
+ @misc{black2025trainingtimeactionconditioningefficient,
+     title = {Training-Time Action Conditioning for Efficient Real-Time Chunking},
+     author = {Kevin Black and Allen Z. Ren and Michael Equi and Sergey Levine},
+     year = {2025},
+     eprint = {2512.05964},
+     archivePrefix = {arXiv},
+     primaryClass = {cs.RO},
+     url = {https://arxiv.org/abs/2512.05964},
+ }
+ ```
+
mimic_video-0.0.27.dist-info/RECORD (added)

@@ -0,0 +1,7 @@
+ mimic_video/__init__.py,sha256=Rs3QeBBGBKKi1U1ykcyeBrCL2XCbfNvppeeD1Fb1pdY,47
+ mimic_video/cosmos_predict.py,sha256=2XR9cqcUC4gKpjEDBy-GtLtMkLXvs8yKe7w8g6EeS6s,8471
+ mimic_video/mimic_video.py,sha256=WlwFfFvOW5k6X-BxRvF0zjwpKEET9C_FIyewD6_GmcE,20017
+ mimic_video-0.0.27.dist-info/METADATA,sha256=al9--DJ_U_jwWilronv3IADdbCIuQfEQRMCJ3vEtE80,4581
+ mimic_video-0.0.27.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ mimic_video-0.0.27.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+ mimic_video-0.0.27.dist-info/RECORD,,

mimic_video-0.0.19.dist-info/RECORD (removed)

@@ -1,7 +0,0 @@
- mimic_video/__init__.py,sha256=Rs3QeBBGBKKi1U1ykcyeBrCL2XCbfNvppeeD1Fb1pdY,47
- mimic_video/cosmos_predict.py,sha256=2XR9cqcUC4gKpjEDBy-GtLtMkLXvs8yKe7w8g6EeS6s,8471
- mimic_video/mimic_video.py,sha256=wQNfdV2PGlR_-S-4Mm3cyswtRQ3nBQGJiHptya3ckKU,14761
- mimic_video-0.0.19.dist-info/METADATA,sha256=67a7iVIkf557qMos_yvpcqL8dK9yL0Jf1DPQuhb2bwo,4142
- mimic_video-0.0.19.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- mimic_video-0.0.19.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
- mimic_video-0.0.19.dist-info/RECORD,,