mimic-video 0.0.3-py3-none-any.whl → 0.0.24-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.


mimic_video/__init__.py CHANGED
@@ -1,2 +1 @@
1
-
2
1
  from mimic_video.mimic_video import MimicVideo
mimic_video/cosmos_predict.py ADDED
@@ -0,0 +1,269 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
5
+
6
+ import torch
7
+ from torch.nn import Module
8
+ from torch import nn, Tensor
9
+ from einops import rearrange
10
+
11
+ from diffusers.models.transformers.transformer_cosmos import CosmosTransformer3DModel
12
+ from diffusers.models.autoencoders.autoencoder_kl_cosmos import AutoencoderKLCosmos
13
+ from diffusers.schedulers.scheduling_edm_euler import EDMEulerScheduler
14
+ from transformers import T5EncoderModel, T5TokenizerFast, T5Config
15
+
16
+ # helpers
17
+
18
+ def exists(v):
19
+ return v is not None
20
+
21
+ def identity(t):
22
+ return t
23
+
24
+ def default(v, d):
25
+ return v if exists(v) else d
26
+
27
+ # constants
28
+
29
+ TINY_TRANSFORMER_CONFIG = dict(
30
+ in_channels = 16,
31
+ out_channels = 16,
32
+ num_attention_heads = 1,
33
+ attention_head_dim = 16,
34
+ mlp_ratio = 1.0,
35
+ text_embed_dim = 32,
36
+ adaln_lora_dim = 32,
37
+ patch_size = (1, 2, 2),
38
+ max_size = (4, 16, 16),
39
+ extra_pos_embed_type = None,
40
+ concat_padding_mask = False,
41
+ )
42
+
43
+ TINY_VAE_CONFIG = dict(
44
+ in_channels = 3,
45
+ out_channels = 3,
46
+ latent_channels = 16,
47
+ encoder_block_out_channels = (8, 16),
48
+ decode_block_out_channels = (8, 16),
49
+ temporal_compression_ratio = 4,
50
+ spatial_compression_ratio = 4,
51
+ num_layers = 1,
52
+ attention_resolutions = (),
53
+ resolution = 64,
54
+ )
55
+
56
+ TINY_T5_CONFIG = dict(
57
+ vocab_size = 32128,
58
+ d_model = 32,
59
+ d_kv = 8,
60
+ d_ff = 64,
61
+ num_layers = 1,
62
+ num_heads = 1,
63
+ )
64
+
65
+ REAL_TRANSFORMER_CONFIG = dict(
66
+ in_channels = 16,
67
+ out_channels = 16,
68
+ num_attention_heads = 32,
69
+ attention_head_dim = 128,
70
+ mlp_ratio = 4.0,
71
+ text_embed_dim = 1024,
72
+ patch_size = (1, 2, 2),
73
+ max_size = (128, 240, 240),
74
+ extra_pos_embed_type = "learnable",
75
+ concat_padding_mask = True,
76
+ )
77
+
78
+ REAL_VAE_CONFIG = dict(
79
+ in_channels = 3,
80
+ out_channels = 3,
81
+ latent_channels = 16,
82
+ encoder_block_out_channels = (128, 256, 512, 512),
83
+ decode_block_out_channels = (256, 512, 512, 512),
84
+ temporal_compression_ratio = 8,
85
+ spatial_compression_ratio = 8,
86
+ )
87
+
88
+ REAL_T5_CONFIG = dict(
89
+ vocab_size = 32128,
90
+ d_model = 1024,
91
+ d_kv = 64,
92
+ d_ff = 2048,
93
+ num_layers = 12,
94
+ num_heads = 16,
95
+ )
96
+
97
+ # main class
98
+
99
+ class CosmosPredictWrapper(Module):
100
+ """
101
+ Wraps Cosmos VAE + DiT for extracting hidden states from a video.
102
+ Supports proper EDM Euler denoising steps.
103
+ """
104
+
105
+ def __init__(
106
+ self,
107
+ model_name: str = 'nvidia/Cosmos-1.0-Diffusion-7B-Video2World',
108
+ extract_layer: int = 19,
109
+ random_weights: bool = False,
110
+ tiny: bool = False,
111
+ normalize = lambda t: (t - 0.5) * 2.0
112
+ ):
113
+ super().__init__()
114
+ self.extract_layer = extract_layer
115
+ self.hook_handle = None
116
+ self.cached_hidden_states: list[Tensor] = []
117
+
118
+ if random_weights:
119
+ self._init_random_weights(tiny = tiny)
120
+ else:
121
+ self._init_pretrained(model_name)
122
+
123
+ # Initialize scheduler
124
+ self.scheduler = EDMEulerScheduler()
125
+
126
+ # store hidden dim for consumers
127
+ self.dim_latent = self.transformer.config.num_attention_heads * self.transformer.config.attention_head_dim
128
+
129
+ # maybe normalize
130
+ self.normalize = normalize
131
+
132
+ self._register_hook()
133
+
134
+ @property
135
+ def device(self):
136
+ return next(self.parameters()).device
137
+
138
+ def _init_pretrained(self, model_name: str):
139
+ """Load pretrained weights from HuggingFace"""
140
+ from diffusers import CosmosVideoToWorldPipeline
141
+
142
+ pipeline = CosmosVideoToWorldPipeline.from_pretrained(model_name)
143
+
144
+ # Extract components we need
145
+ self.vae = pipeline.vae
146
+ self.transformer = pipeline.transformer
147
+ self.text_encoder = pipeline.text_encoder
148
+ self.tokenizer = pipeline.tokenizer
149
+
150
+ # Clean up pipeline
151
+ del pipeline
152
+
153
+ def _init_random_weights(self, tiny: bool = False):
154
+ """Initialize with random weights for testing"""
155
+
156
+ transformer_config = TINY_TRANSFORMER_CONFIG if tiny else REAL_TRANSFORMER_CONFIG
157
+ vae_config = TINY_VAE_CONFIG if tiny else REAL_VAE_CONFIG
158
+ t5_config_dict = TINY_T5_CONFIG if tiny else REAL_T5_CONFIG
159
+
160
+ num_layers = max(2, self.extract_layer + 1)
161
+ if not tiny:
162
+ num_layers = max(28, num_layers)
163
+
164
+ self.transformer = CosmosTransformer3DModel(
165
+ num_layers = num_layers,
166
+ **transformer_config
167
+ )
168
+
169
+ self.vae = AutoencoderKLCosmos(**vae_config)
170
+
171
+ t5_config = T5Config(**t5_config_dict)
172
+ self.text_encoder = T5EncoderModel(t5_config)
173
+ self.tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-small")
174
+
175
+ def __del__(self):
176
+ if exists(self.hook_handle):
177
+ self.hook_handle.remove()
178
+
179
+ def _register_hook(self):
180
+ assert hasattr(self.transformer, 'transformer_blocks'), 'transformer must have transformer_blocks'
181
+ assert len(self.transformer.transformer_blocks) > self.extract_layer, f'layer {self.extract_layer} out of bounds'
182
+
183
+ target_layer = self.transformer.transformer_blocks[self.extract_layer]
184
+
185
+ def hook_fn(module, inp, out):
186
+ self.cached_hidden_states.append(out.detach().cpu())
187
+
188
+ self.hook_handle = target_layer.register_forward_hook(hook_fn)
189
+
190
+ def forward(
191
+ self,
192
+ videos: Tensor,
193
+ prompts: str | list[str] | None = None,
194
+ prompt_token_ids: Tensor | None = None,
195
+ num_inference_steps: int = 1,
196
+ ) -> Tensor:
197
+ """
198
+ videos: (batch, frames, channels, height, width) in [0, 1]
199
+ num_inference_steps: number of denoising steps to run
200
+ returns: hidden states tensor from the specified transformer layer (from first step)
201
+ """
202
+ batch, t, c, h, w = videos.shape
203
+
204
+ assert exists(prompts) ^ exists(prompt_token_ids)
205
+
206
+ # Scale videos from [0, 1] to [-1, 1] for Cosmos VAE
207
+
208
+ videos = self.normalize(videos)
209
+
210
+ if isinstance(prompts, str):
211
+ prompts = [prompts] * batch
212
+
213
+ self.cached_hidden_states.clear()
214
+
215
+ # Rearrange for VAE: (B, T, C, H, W) -> (B, C, T, H, W)
216
+ videos = rearrange(videos, 'b t c h w -> b c t h w')
217
+
218
+ with torch.inference_mode():
219
+ # 1. encode video to latents via VAE
220
+
221
+ latents = self.vae.encode(videos).latent_dist.sample()
222
+
223
+ # 2. maybe encode text prompts
224
+
225
+ if exists(prompt_token_ids):
226
+ text_inputs = dict(input_ids = prompt_token_ids)
227
+ else:
228
+ text_inputs = self.tokenizer(
229
+ prompts,
230
+ return_tensors = "pt",
231
+ padding = True,
232
+ truncation = True,
233
+ max_length = 512
234
+ )
235
+
236
+ encoder_hidden_states = self.text_encoder(**text_inputs).last_hidden_state
237
+
238
+ # 3. Setup scheduler timesteps
239
+ self.scheduler.set_timesteps(num_inference_steps, device = self.device)
240
+ timesteps = self.scheduler.timesteps
241
+
242
+ # 4. Add noise to the encoded latents, scaled by the scheduler's initial sigma
243
+ noise = torch.randn_like(latents)
244
+ latents = latents + noise * self.scheduler.init_noise_sigma
245
+
246
+ # 5. Denoising loop
247
+ for i, timestep in enumerate(timesteps):
248
+ # Scale model input
249
+ latent_model_input = self.scheduler.scale_model_input(latents, timestep)
250
+
251
+ # Predict noise residual
252
+ noise_pred = self.transformer(
253
+ hidden_states = latent_model_input,
254
+ encoder_hidden_states = encoder_hidden_states,
255
+ timestep = timestep.expand(batch),
256
+ return_dict = False
257
+ )[0]
258
+
259
+ # Compute previous noisy sample
260
+ latents = self.scheduler.step(noise_pred, timestep, latents, return_dict = False)[0]
261
+
262
+ assert len(self.cached_hidden_states) > 0, 'hidden states not captured'
263
+
264
+ # Return hidden states from the first denoising step
265
+ hidden = self.cached_hidden_states[0]
266
+
267
+ assert hidden.shape[-1] == self.dim_latent, f'hidden dim mismatch: expected {self.dim_latent}, got {hidden.shape[-1]}'
268
+
269
+ return hidden
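For orientation, a minimal sketch of exercising the wrapper above in its tiny random-weight configuration, mirroring the README usage further below; it assumes the `diffusers`, `transformers` and `accelerate` test extras are installed, and the prompt string is illustrative only.

```python
import torch
from mimic_video.cosmos_predict import CosmosPredictWrapper

# tiny, randomly initialized stand-in for the real Cosmos checkpoint,
# so no pretrained weights are downloaded (the T5 tokenizer still is)
wrapper = CosmosPredictWrapper(
    extract_layer = 1,
    random_weights = True,
    tiny = True
)

# (batch, frames, channels, height, width) in [0, 1], as documented in forward()
videos = torch.rand(1, 3, 3, 32, 32)

hiddens = wrapper(videos, prompts = 'pick up the cup', num_inference_steps = 1)

# hidden states come from the forward hook registered on transformer block `extract_layer`
assert hiddens.shape[-1] == wrapper.dim_latent
```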
mimic_video/mimic_video.py CHANGED
@@ -1,22 +1,31 @@
1
+ from __future__ import annotations
2
+ from functools import partial
3
+
1
4
  import torch
2
5
  from torch import nn, cat, stack, is_tensor, tensor
3
- from torch.nn import Module, ModuleList, Linear
6
+ from torch.nn import Module, ModuleList, Linear, GRU
4
7
 
5
8
  import torch.nn.functional as F
6
9
 
7
10
  import einx
8
- from einops import einsum, rearrange, repeat
11
+ from einops import einsum, rearrange, repeat, reduce
9
12
  from einops.layers.torch import Rearrange
10
13
 
11
14
  from x_mlps_pytorch import create_mlp
12
15
 
16
+ from tqdm import tqdm
17
+
13
18
  from torch_einops_utils import (
19
+ lens_to_mask,
14
20
  pad_left_ndim,
15
21
  align_dims_left,
16
22
  pad_at_dim,
17
23
  pack_with_inverse,
24
+ masked_mean
18
25
  )
19
26
 
27
+ from hyper_connections.mHCv2 import get_init_and_expand_reduce_stream_functions
28
+
20
29
  # ein notation
21
30
 
22
31
  # b - batch
@@ -26,6 +35,10 @@ from torch_einops_utils import (
26
35
  # i, j - sequence (source, target)
27
36
  # d - feature dimension
28
37
 
38
+ # constants
39
+
40
+ LinearNoBias = partial(Linear, bias = False)
41
+
29
42
  # functions
30
43
 
31
44
  def exists(v):
@@ -85,25 +98,27 @@ class AdaptiveRMSNorm(Module):
85
98
  self,
86
99
  dim,
87
100
  dim_time_cond,
88
- eps = 1e-6
101
+ eps = 1e-6,
102
+ ada_ln_zero_bias = -5.
89
103
  ):
90
104
  super().__init__()
91
105
  self.scale = dim ** 0.5
92
106
  self.eps = eps
93
107
 
94
- self.to_modulation = Linear(dim_time_cond, dim * 3, bias = False)
95
- self.split_modulation = Rearrange('b (three d) -> three b 1 d', three = 3)
108
+ self.to_modulation = LinearNoBias(dim_time_cond, dim * 3)
109
+ self.split_modulation = Rearrange('... (three d) -> three ... d', three = 3)
96
110
 
97
111
  nn.init.zeros_(self.to_modulation.weight)
98
112
 
113
+ self.ada_ln_zero_bias = ada_ln_zero_bias
114
+
99
115
  def forward(
100
116
  self,
101
117
  tokens,
102
118
  time_cond
103
119
  ):
104
-
105
- if time_cond.ndim == 1:
106
- time_cond = pad_left_ndim(time_cond, 1)
120
+ if time_cond.ndim == 2:
121
+ time_cond = rearrange(time_cond, 'b d -> b 1 d')
107
122
 
108
123
  modulations = self.to_modulation(time_cond)
109
124
 
@@ -113,7 +128,9 @@ class AdaptiveRMSNorm(Module):
113
128
 
114
129
  adaptive_normed = normed * (scale + 1.) + shift
115
130
 
116
- return adaptive_normed, gate
131
+ gate_with_bias = (gate + self.ada_ln_zero_bias).sigmoid()
132
+
133
+ return adaptive_normed, gate_with_bias
117
134
 
118
135
  # attention
119
136
 
@@ -125,7 +142,8 @@ class Attention(Module):
125
142
  dim_context = None,
126
143
  dim_head = 64,
127
144
  heads = 8,
128
- kv_heads = 2
145
+ kv_heads = 2,
146
+ attn_gate_value = True
129
147
  ):
130
148
  super().__init__()
131
149
  dim_q_inner = dim_head * heads
@@ -134,9 +152,12 @@ class Attention(Module):
134
152
 
135
153
  self.scale = dim_head ** -0.5
136
154
 
137
- self.to_queries = Linear(dim, dim_q_inner, bias = False)
138
- self.to_keys_values = Linear(dim_context, dim_kv_inner * 2, bias = False)
139
- self.to_out = Linear(dim_q_inner, dim, bias = False)
155
+ self.to_queries = LinearNoBias(dim, dim_q_inner)
156
+ self.to_keys_values = LinearNoBias(dim_context, dim_kv_inner * 2)
157
+
158
+ self.attn_gate_value = nn.Sequential(LinearNoBias(dim, heads), Rearrange('b n (g h) -> b g h n 1', h = kv_heads))
159
+
160
+ self.to_out = LinearNoBias(dim_q_inner, dim)
140
161
 
141
162
  assert divisible_by(heads, kv_heads)
142
163
  groups = heads // kv_heads
@@ -149,15 +170,20 @@ class Attention(Module):
149
170
  self,
150
171
  tokens,
151
172
  context = None,
152
- context_mask = None
173
+ context_mask = None,
174
+ kv = None,
175
+ return_kv = False
153
176
  ):
154
177
  context = default(context, tokens)
155
178
 
156
179
  queries = self.to_queries(tokens)
157
- keys, values = self.to_keys_values(context).chunk(2, dim = -1)
158
-
159
180
  queries = self.split_q_heads(queries)
160
- keys, values = tuple(self.split_kv_heads(t) for t in (keys, values))
181
+
182
+ if not exists(kv):
183
+ keys, values = self.to_keys_values(context).chunk(2, dim = -1)
184
+ keys, values = tuple(self.split_kv_heads(t) for t in (keys, values))
185
+ else:
186
+ keys, values = kv
161
187
 
162
188
  queries = queries * self.scale
163
189
 
@@ -171,9 +197,16 @@ class Attention(Module):
171
197
 
172
198
  out = einsum(attn, values, 'b g h i j, b h j d -> b g h i d')
173
199
 
200
+ out = out * self.attn_gate_value(tokens).sigmoid()
201
+
174
202
  out = self.merge_heads(out)
175
203
 
176
- return self.to_out(out)
204
+ out = self.to_out(out)
205
+
206
+ if not return_kv:
207
+ return out
208
+
209
+ return out, stack((keys, values))
177
210
 
178
211
  # feedforward
179
212
 
@@ -206,19 +239,47 @@ class MimicVideo(Module):
206
239
  def __init__(
207
240
  self,
208
241
  dim,
242
+ video_predict_wrapper: Module | None = None,
209
243
  *,
210
- dim_video_hidden,
244
+ dim_video_hidden = None,
245
+ action_chunk_len = 32,
211
246
  dim_action = 20,
212
247
  dim_joint_state = 32,
248
+ proprio_mask_prob = 0.1,
213
249
  depth = 8,
214
250
  dim_head = 64,
215
251
  heads = 8,
216
252
  expansion_factor = 4.,
253
+ ada_ln_zero_bias = -5.,
217
254
  dim_time_cond = None,
218
- sample_time_fn = None
255
+ sample_time_fn = None,
256
+ train_time_rtc = False,
257
+ train_time_rtc_max_delay = None,
258
+ num_residual_streams = 1,
259
+ mhc_kwargs: dict = dict()
219
260
  ):
220
261
  super().__init__()
221
262
 
263
+ self.depth = depth
264
+
265
+ # maybe video predict
266
+
267
+ self.video_predict_wrapper = video_predict_wrapper
268
+
269
+ # dims
270
+
271
+ self.action_chunk_len = action_chunk_len
272
+ self.dim_action = dim_action
273
+
274
+ self.action_shape = (action_chunk_len, dim_action)
275
+ self.dim_joint_state = dim_joint_state
276
+
277
+ dim_video_hidden = default(dim_video_hidden, video_predict_wrapper.dim_latent if exists(video_predict_wrapper) else None)
278
+
279
+ assert exists(dim_video_hidden), f'`dim_video_hidden` must be set or `video_predict_wrapper` passed in with `dim_latent`'
280
+
281
+ self.dim_video_hidden = dim_video_hidden
282
+
222
283
  # flow related
223
284
 
224
285
  self.sample_time_fn = default(sample_time_fn, default_sample_time_fn)
@@ -232,10 +293,27 @@ class MimicVideo(Module):
232
293
  self.to_fourier_embed = RandomFourierEmbed(dim) # used by deepmind, its fine
233
294
  self.to_time_cond = create_mlp(dim_in = dim * 2, dim = dim_time_cond, depth = 2, activation = nn.SiLU())
234
295
 
296
+ # joint token related
297
+
235
298
  self.to_joint_state_token = Linear(dim_joint_state, dim)
236
299
 
300
+ self.proprio_mask_prob = proprio_mask_prob
301
+ self.has_proprio_masking = proprio_mask_prob > 0.
302
+
303
+ self.proprio_mask_token = nn.Parameter(torch.randn(dim))
304
+
305
+ # video norm
306
+
237
307
  self.video_hidden_norm = nn.RMSNorm(dim_video_hidden)
238
308
 
309
+ # manifold constrained hyper connections (mHC) from bytedance + deepseek
310
+
311
+ init_hyper_conn, self.expand_stream, self.reduce_stream = get_init_and_expand_reduce_stream_functions(num_residual_streams, dim = dim, add_stream_embed = True, **mhc_kwargs)
312
+
313
+ # rnn
314
+
315
+ self.rnn = GRU(dim, dim)
316
+
239
317
  # transformer
240
318
 
241
319
  layers = []
@@ -249,15 +327,24 @@ class MimicVideo(Module):
249
327
 
250
328
  cross_attn = Attention(dim = dim, dim_head = dim_head, dim_context = dim_video_hidden, heads = heads)
251
329
 
252
- ff_adanorm = AdaptiveRMSNorm(dim = dim, dim_time_cond = dim_time_cond)
330
+ ff_adanorm = AdaptiveRMSNorm(dim = dim, dim_time_cond = dim_time_cond, ada_ln_zero_bias = ada_ln_zero_bias)
253
331
 
254
332
  ff = SwiGLUFeedForward(dim = dim, expansion_factor = expansion_factor)
255
333
 
334
+ # maybe hyper connect
335
+
336
+ attn_residual = init_hyper_conn()
337
+ cross_attn_residual = init_hyper_conn()
338
+ ff_residual = init_hyper_conn()
339
+
256
340
  layers.append(ModuleList([
257
- attn_adanorm,
258
- attn,
341
+ cross_attn_residual,
259
342
  cross_attn_adanorm,
260
343
  cross_attn,
344
+ attn_residual,
345
+ attn_adanorm,
346
+ attn,
347
+ ff_residual,
261
348
  ff_adanorm,
262
349
  ff
263
350
  ]))
@@ -268,22 +355,102 @@ class MimicVideo(Module):
268
355
 
269
356
  self.to_pred_action_flow = nn.Sequential(
270
357
  nn.RMSNorm(dim),
271
- Linear(dim, dim_action)
358
+ Linear(dim, dim_action, bias = False)
272
359
  )
273
360
 
361
+ # inference related
362
+
363
+ # train time RTC related - https://arxiv.org/abs/2512.05964
364
+
365
+ self.train_time_rtc = train_time_rtc
366
+
367
+ assert not train_time_rtc or exists(train_time_rtc_max_delay)
368
+ self.train_time_rtc_max_delay = train_time_rtc_max_delay
369
+
370
+ # aux loss and device
371
+
372
+ self.register_buffer('zero', tensor(0.), persistent = False)
373
+
374
+ @property
375
+ def device(self):
376
+ return self.zero.device
377
+
378
+ @torch.no_grad()
379
+ def sample(
380
+ self,
381
+ steps = 16,
382
+ batch_size = 1,
383
+ disable_progress_bar = False,
384
+ **kwargs
385
+ ):
386
+
387
+ self.eval()
388
+
389
+ noise = torch.randn((batch_size, *self.action_shape), device = self.device)
390
+
391
+ times = torch.linspace(0., 1., steps + 1, device = self.device)[:-1]
392
+ delta = 1. / steps
393
+
394
+ denoised = noise
395
+
396
+ cache = None
397
+
398
+ for time in tqdm(times, disable = disable_progress_bar):
399
+ pred_flow, cache = self.forward(actions = denoised, time = time, cache = cache, return_cache = True, **kwargs)
400
+
401
+ denoised = denoised + delta * pred_flow
402
+
403
+ return denoised
404
+
274
405
  def forward(
275
406
  self,
276
- actions,
277
- video_hiddens, # they use layer 19 of cosmos predict, at first denoising step. that's all
278
407
  *,
408
+ actions,
279
409
  joint_state,
410
+ video = None,
411
+ video_hiddens = None, # they use layer 19 of cosmos predict, at first denoising step. that's all
412
+ context_mask = None,
280
413
  time = None,
281
414
  time_video_denoise = 0., # 0 is noise in the scheme i prefer - default to their optimal choice, but can be changed
282
- context_mask = None,
415
+ prompts = None,
416
+ prompt_token_ids = None,
417
+ cache = None,
418
+ return_cache = False,
419
+ return_flow = False
283
420
  ):
421
+ assert not exists(self.video_predict_wrapper) or (exists(prompts) ^ exists(prompt_token_ids))
422
+ assert actions.shape[-2:] == self.action_shape
423
+
284
424
  batch, device = actions.shape[0], actions.device
425
+ orig_actions = actions
426
+
427
+ is_training = not exists(time) and not return_flow
428
+
429
+ if not exists(cache):
430
+ # handle maybe extraction of video hiddens
431
+ # only if cache is not given
432
+
433
+ assert exists(video) ^ exists(video_hiddens)
434
+
435
+ if not exists(video_hiddens):
436
+ assert exists(self.video_predict_wrapper), f'`video_predict_wrapper` must be passed in if raw video is passed into MimicVideo'
285
437
 
286
- is_training = not exists(time)
438
+ video_hiddens = self.video_predict_wrapper(video, prompts = prompts, prompt_token_ids = prompt_token_ids)
439
+
440
+ video_hiddens = video_hiddens.to(self.device).float() # maybe bfloat to float32
441
+
442
+ video_hiddens, _ = pack_with_inverse(video_hiddens, 'b * d')
443
+
444
+ assert video_hiddens.shape[-1] == self.dim_video_hidden
445
+
446
+ # handle video hiddens
447
+
448
+ video_hiddens = self.video_hidden_norm(video_hiddens)
449
+
450
+ # handle caching
451
+
452
+ prev_cached_video_hiddens_kv = cache if exists(cache) else ((None,) * self.depth)
453
+ next_cached_video_hiddens_kv = []
287
454
 
288
455
  # handle flow time conditioning
289
456
 
@@ -297,11 +464,26 @@ class MimicVideo(Module):
297
464
  actions, left_aligned_time = align_dims_left((actions, time))
298
465
 
299
466
  noised = noise.lerp(actions, left_aligned_time)
467
+
300
468
  else:
301
469
  noised = actions
302
470
 
471
+ # maybe train time rtc
472
+
473
+ action_loss_mask = None
474
+
475
+ if is_training and self.train_time_rtc:
476
+
477
+ rand_prefix_len = torch.randint(0, self.train_time_rtc_max_delay, (batch,), device = device)
478
+ action_prefix_mask = lens_to_mask(rand_prefix_len, self.action_chunk_len)
479
+
480
+ actions = einx.where('b na, b na d, b na d', action_prefix_mask, orig_actions, actions)
481
+ time = einx.where('b na, , b', action_prefix_mask, 1., time)
482
+
483
+ action_loss_mask = ~action_prefix_mask
484
+
303
485
  if time.ndim == 0:
304
- time = rearrange(time, '-> b', b = batch)
486
+ time = repeat(time, '-> b', b = batch)
305
487
 
306
488
  # handle the video denoising times
307
489
 
@@ -313,8 +495,14 @@ class MimicVideo(Module):
313
495
  if time_video_denoise.shape[0] != batch:
314
496
  time_video_denoise = repeat(time_video_denoise, '1 -> b', b = batch)
315
497
 
498
+ if time.ndim == 2:
499
+ time_video_denoise = repeat(time_video_denoise, 'b -> b n', n = time.shape[-1])
500
+
316
501
  times = stack((time, time_video_denoise), dim = -1)
317
502
 
503
+ if times.ndim == 3:
504
+ times = pad_at_dim(times, (1, 0), dim = 1, value = 1.) # handle joint state token on the action
505
+
318
506
  # fourier embed and mlp to time condition
319
507
 
320
508
  fourier_embed = self.to_fourier_embed(times)
@@ -323,48 +511,70 @@ class MimicVideo(Module):
323
511
 
324
512
  time_cond = self.to_time_cond(fourier_embed)
325
513
 
326
- # handle video hiddens
327
-
328
- video_hiddens = self.video_hidden_norm(video_hiddens)
329
-
330
514
  # embed
331
515
 
332
516
  tokens = self.to_action_tokens(noised)
333
517
 
518
+ # one layer of rnn for actions
519
+
520
+ rnn_out, _ = self.rnn(tokens)
521
+ tokens = rnn_out + tokens
522
+
523
+ # mask joint state token for proprioception masking training
524
+
334
525
  joint_state_token = self.to_joint_state_token(joint_state)
335
526
 
527
+ if self.training and self.has_proprio_masking:
528
+ mask = torch.rand((batch,), device = device) < self.proprio_mask_prob
529
+
530
+ joint_state_token = einx.where('b, d, b d', mask, self.proprio_mask_token, joint_state_token)
531
+
532
+ # pack joint with action tokens
533
+
336
534
  tokens, inverse_pack = pack_with_inverse((joint_state_token, tokens), 'b * d')
337
535
 
536
+ # maybe expand streams
537
+
538
+ tokens = self.expand_stream(tokens)
539
+
338
540
  # transformer layers
339
541
 
340
- for (
341
- attn_norm,
342
- attn,
542
+ for ((
543
+ maybe_cross_attn_mhc,
343
544
  cross_attn_norm,
344
545
  cross_attn,
546
+ maybe_attn_mhc,
547
+ attn_norm,
548
+ attn,
549
+ maybe_ff_mhc,
345
550
  ff_norm,
346
551
  ff
347
- ) in self.layers:
552
+ ), cached_video_kv) in zip(self.layers, prev_cached_video_hiddens_kv):
348
553
 
349
554
  # cross attention
350
555
 
351
- residual = tokens
556
+ tokens, add_residual = maybe_cross_attn_mhc(tokens)
352
557
 
353
558
  tokens, gate = cross_attn_norm(tokens, time_cond)
354
559
 
355
- tokens = residual + cross_attn(tokens, context = video_hiddens, context_mask = context_mask) * gate
560
+ cross_attn_out, video_kv = cross_attn(tokens, context = video_hiddens, context_mask = context_mask, kv = cached_video_kv, return_kv = True)
561
+
562
+ tokens = add_residual(cross_attn_out * gate)
563
+
564
+ if return_cache:
565
+ next_cached_video_hiddens_kv.append(video_kv)
356
566
 
357
567
  # self attention
358
568
 
359
- residual = tokens
569
+ tokens, add_residual = maybe_attn_mhc(tokens)
360
570
 
361
571
  tokens, gate = attn_norm(tokens, time_cond)
362
572
 
363
- tokens = residual + attn(tokens) * gate
573
+ tokens = add_residual(attn(tokens) * gate)
364
574
 
365
575
  # prepare feedforward
366
576
 
367
- residual = tokens
577
+ tokens, add_residual = maybe_ff_mhc(tokens)
368
578
 
369
579
  tokens, gate = ff_norm(tokens, time_cond)
370
580
 
@@ -378,7 +588,11 @@ class MimicVideo(Module):
378
588
 
379
589
  # feedforward
380
590
 
381
- tokens = residual + ff(tokens) * gate
591
+ tokens = add_residual(ff(tokens) * gate)
592
+
593
+ # maybe reduce streams
594
+
595
+ tokens = self.reduce_stream(tokens)
382
596
 
383
597
  # remove joint token
384
598
 
@@ -389,9 +603,19 @@ class MimicVideo(Module):
389
603
  pred_flow = self.to_pred_action_flow(tokens)
390
604
 
391
605
  if not is_training:
392
- return pred_flow
606
+ # flow
607
+
608
+ out = pred_flow
609
+ else:
610
+ # mse flow loss
611
+
612
+ flow_loss = F.mse_loss(pred_flow, flow, reduction = 'none')
613
+
614
+ out = masked_mean(flow_loss, action_loss_mask)
615
+
616
+ if not return_cache:
617
+ return out
393
618
 
394
- # mse flow loss
619
+ # handle returning of cache
395
620
 
396
- flow_loss = F.mse_loss(pred_flow, flow)
397
- return flow_loss
621
+ return out, next_cached_video_hiddens_kv
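The train-time RTC conditioning added above (the `train_time_rtc` branch in `forward`) reduces to the following plain-PyTorch sketch, with hypothetical standalone tensors standing in for the `einx.where` and `lens_to_mask` calls.

```python
import torch

# hypothetical shapes matching the defaults above
batch, chunk_len, dim_action, max_delay = 2, 32, 20, 4

actions = torch.randn(batch, chunk_len, dim_action)   # ground truth action chunk
noised  = torch.randn_like(actions)                   # stand-in for the noise-lerped actions
time    = torch.rand(batch)                           # flow time per sample

# a random prefix of each chunk is fed in clean, emulating actions already being executed
prefix_len  = torch.randint(0, max_delay, (batch,))
prefix_mask = torch.arange(chunk_len) < prefix_len[:, None]          # (batch, chunk_len)

model_input = torch.where(prefix_mask[..., None], actions, noised)   # clean prefix, noisy rest

per_token_time = time[:, None].repeat(1, chunk_len)
per_token_time[prefix_mask] = 1.                      # prefix positions count as fully denoised

loss_mask = ~prefix_mask                              # flow loss only on the non-prefix positions
```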
mimic_video-0.0.3.dist-info/METADATA → mimic_video-0.0.24.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mimic-video
3
- Version: 0.0.3
3
+ Version: 0.0.24
4
4
  Summary: Mimic Video
5
5
  Project-URL: Homepage, https://pypi.org/project/mimic-video/
6
6
  Project-URL: Repository, https://github.com/lucidrains/mimic-video
@@ -36,12 +36,17 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
36
36
  Requires-Python: >=3.10
37
37
  Requires-Dist: einops>=0.8.1
38
38
  Requires-Dist: einx>=0.3.0
39
- Requires-Dist: torch-einops-utils>=0.0.8
39
+ Requires-Dist: hyper-connections>=0.4.3
40
+ Requires-Dist: torch-einops-utils>=0.0.12
40
41
  Requires-Dist: torch>=2.5
42
+ Requires-Dist: tqdm
41
43
  Requires-Dist: x-mlps-pytorch
42
44
  Provides-Extra: examples
43
45
  Provides-Extra: test
46
+ Requires-Dist: accelerate; extra == 'test'
47
+ Requires-Dist: diffusers>=0.32.0; extra == 'test'
44
48
  Requires-Dist: pytest; extra == 'test'
49
+ Requires-Dist: transformers; extra == 'test'
45
50
  Description-Content-Type: text/markdown
46
51
 
47
52
  <img src="./mimic-video.png" width="450px"></img>
@@ -50,6 +55,73 @@ Description-Content-Type: text/markdown
50
55
 
51
56
  Implementation of [Mimic-Video](https://mimic-video.github.io/), Video-Action Models for Generalizable Robot Control Beyond VLAs
52
57
 
58
+ ## Appreciation
59
+
60
+ - [Pranoy](https://github.com/pranoyr) for submitting a pull request for proprioception masking
61
+
62
+ ## Install
63
+
64
+ ```shell
65
+ $ pip install mimic-video
66
+ ```
67
+
68
+ ## Usage
69
+
70
+ ```python
71
+ import torch
72
+
73
+ # video wrapper
74
+ # but will be agnostic to the model
75
+
76
+ from mimic_video.cosmos_predict import CosmosPredictWrapper
77
+
78
+ video_wrapper = CosmosPredictWrapper(
79
+ extract_layer = 1,
80
+ random_weights = True,
81
+ tiny = True
82
+ )
83
+
84
+ # mimic video
85
+
86
+ from mimic_video import MimicVideo
87
+
88
+ model = MimicVideo(512, video_wrapper)
89
+
90
+ # states
91
+
92
+ video = torch.rand(2, 3, 3, 32, 32)
93
+
94
+ joint_state = torch.randn(2, 32)
95
+
96
+ # action
97
+
98
+ actions = torch.randn(2, 32, 20)
99
+
100
+ # training
101
+
102
+ loss = model(
103
+ prompts = [
104
+ 'put the package on the conveyer belt',
105
+ 'pass the butter'
106
+ ],
107
+ video = video,
108
+ actions = actions,
109
+ joint_state = joint_state
110
+ )
111
+
112
+ loss.backward()
113
+
114
+ # inference
115
+
116
+ actions = model.sample(
117
+ prompts = 'peel the orange',
118
+ video = video[:1],
119
+ joint_state = joint_state[:1]
120
+ )
121
+
122
+ assert actions.shape == (1, 32, 20)
123
+ ```
124
+
53
125
  ## Contributing
54
126
 
55
127
  First make sure `pytest` and test dependencies are installed with
@@ -76,3 +148,16 @@ That's it
76
148
  url = {https://api.semanticscholar.org/CorpusID:283920528}
77
149
  }
78
150
  ```
151
+
152
+ ```bibtex
153
+ @misc{black2025trainingtimeactionconditioningefficient,
154
+ title = {Training-Time Action Conditioning for Efficient Real-Time Chunking},
155
+ author = {Kevin Black and Allen Z. Ren and Michael Equi and Sergey Levine},
156
+ year = {2025},
157
+ eprint = {2512.05964},
158
+ archivePrefix = {arXiv},
159
+ primaryClass = {cs.RO},
160
+ url = {https://arxiv.org/abs/2512.05964},
161
+ }
162
+ ```
163
+
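The new constructor options surfaced in this release compose with the README example roughly as follows; a sketch with illustrative values, reusing the tiny wrapper from the usage section.

```python
from mimic_video import MimicVideo
from mimic_video.cosmos_predict import CosmosPredictWrapper

# tiny random-weight wrapper, as in the README usage example
video_wrapper = CosmosPredictWrapper(extract_layer = 1, random_weights = True, tiny = True)

# hyper connection streams and train-time RTC are both opt-in
model = MimicVideo(
    512,
    video_wrapper,
    num_residual_streams = 4,        # manifold-constrained hyper connection (mHC) streams, illustrative value
    train_time_rtc = True,           # train-time real-time chunking conditioning
    train_time_rtc_max_delay = 4     # required whenever train_time_rtc is enabled, illustrative value
)
```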
mimic_video-0.0.24.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
1
+ mimic_video/__init__.py,sha256=Rs3QeBBGBKKi1U1ykcyeBrCL2XCbfNvppeeD1Fb1pdY,47
2
+ mimic_video/cosmos_predict.py,sha256=2XR9cqcUC4gKpjEDBy-GtLtMkLXvs8yKe7w8g6EeS6s,8471
3
+ mimic_video/mimic_video.py,sha256=Qr0Dc4z-LTRlTt0qXlgcJtdSP1pBsarXeOnJSUxj_yY,17388
4
+ mimic_video-0.0.24.dist-info/METADATA,sha256=4kXYmqL3XtJbZ35iX42Z85RFV_ZGMM_phKGUZWnfcaw,4581
5
+ mimic_video-0.0.24.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
6
+ mimic_video-0.0.24.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
7
+ mimic_video-0.0.24.dist-info/RECORD,,
mimic_video-0.0.3.dist-info/RECORD DELETED
@@ -1,6 +0,0 @@
1
- mimic_video/__init__.py,sha256=-4HP_pbT4YLhRUwNwuL4qyLHbgDyQ099nHL7eVi0_Ag,48
2
- mimic_video/mimic_video.py,sha256=-2HVpXAgEG28JFkJeUlypdmOMyYDD2tw0Fisf9-BZ-M,10243
3
- mimic_video-0.0.3.dist-info/METADATA,sha256=MVJMzysTCCpsgxBKUA9ye-aFSeQAXinyP3ejCtJ8JD8,2960
4
- mimic_video-0.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
- mimic_video-0.0.3.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
6
- mimic_video-0.0.3.dist-info/RECORD,,