PyPI - mimic-video - Versions diffs - 0.0.24-py3-none-any.whl → 0.0.27-py3-none-any.whl - Mend

mimic-video 0.0.24-py3-none-any.whl → 0.0.27-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mimic-video might be problematic. Click here for more details.

Files changed (6)

mimic_video/mimic_video.py CHANGED Viewed

@@ -47,9 +47,22 @@ def exists(v):
 # returns `v` when `exists(v)` holds, otherwise the fallback `d`
 def default(v, d):
     return v if exists(v) else d
+# returns its argument unchanged - serves as the no-op choice where a wrapper function is optional
+def identity(t):
+    return t
 # true when `num` modulo `den` is zero
 def divisible_by(num, den):
     return (num % den) == 0
+# wrappers
+# wraps `fn` so that every call first switches it to eval mode, then runs it with gradient tracking disabled
+# `fn` must expose `.eval()` and be callable
+# note: `fn.eval()` is not undone afterwards, so `fn` stays in eval mode once the wrapped call has run
+def eval_no_grad(fn):
+    def inner(*args, **kwargs):
+        with torch.no_grad():
+            fn.eval()
+            return fn(*args, **kwargs)
+    return inner
 # tensor function
 def cast_tensor(val, device = None):
@@ -69,6 +82,30 @@ def shift_feature_dim(t):
     x_shift = pad_at_dim(x_shift, (1, -1), dim = 1)
     return cat((x, x_shift), dim = -1)
+# action normalization
+class Normalizer(Module):
+    def __init__(
+        self,
+        mean,
+        std,
+        eps = 1e-6
+    ):
+        super().__init__()
+        assert (std > 0.).all(), 'std must be positive'
+        self.eps = eps
+        self.register_buffer('mean', mean)
+        self.register_buffer('std', std)
+    def normalize(self, t):
+        mean, std = self.mean, self.std
+        return (t - mean) / std.clamp_min(self.eps)
+    def inverse_normalize(self, t):
+        mean, std = self.mean, self.std
+        return (t * std) + mean
 # time
 # they follow p0's research finding with the beta distribution
@@ -256,7 +293,8 @@ class MimicVideo(Module):
         train_time_rtc = False,
         train_time_rtc_max_delay = None,
         num_residual_streams = 1,
-        mhc_kwargs: dict = dict()
+        mhc_kwargs: dict = dict(),
+        action_mean_std: Tensor | None = None
     ):
         super().__init__()
@@ -266,12 +304,21 @@ class MimicVideo(Module):
         self.video_predict_wrapper = video_predict_wrapper
-        # dims
+        # action related
         self.action_chunk_len = action_chunk_len
         self.dim_action = dim_action
         self.action_shape = (action_chunk_len, dim_action)
+        self.action_normalizer = None
+        if exists(action_mean_std):
+            assert action_mean_std.shape == (2, dim_action), f'must be in shape of (2 action_dim)'
+            self.action_normalizer = Normalizer(*action_mean_std)
+        # joint dim
         self.dim_joint_state = dim_joint_state
         dim_video_hidden = default(dim_video_hidden, video_predict_wrapper.dim_latent if exists(video_predict_wrapper) else None)
@@ -371,6 +418,12 @@ class MimicVideo(Module):
         self.register_buffer('zero', tensor(0.), persistent = False)
+    # only action parameters
+    def action_parameters(self):
+        video_model_params = set(self.video_predict_wrapper.parameters()) if exists(self.video_predict_wrapper) else {}
+        return set(self.parameters()) - video_model_params
     @property
     def device(self):
         return self.zero.device
@@ -380,26 +433,60 @@ class MimicVideo(Module):
         self,
         steps = 16,
         batch_size = 1,
+        prefix_action_chunk = None,
         disable_progress_bar = False,
         **kwargs
     ):
         self.eval()
+        inpainting = exists(prefix_action_chunk)
+        if inpainting:
+            prefix_len = prefix_action_chunk.shape[1]
+            assert prefix_len < self.action_chunk_len
+            maybe_normed_prefix = prefix_action_chunk
+            if exists(self.action_normalizer):
+                maybe_normed_prefix = self.action_normalizer.normalize(prefix_action_chunk)
+        # noise
         noise = torch.randn((batch_size, *self.action_shape), device = self.device)
+        # times
         times = torch.linspace(0., 1., steps + 1, device = self.device)[:-1]
         delta = 1. / steps
+        # denoised action starts as noise
         denoised = noise
         cache = None
+        # denoise
         for time in tqdm(times, disable = disable_progress_bar):
+            if inpainting:
+                denoised[:, :prefix_len] = maybe_normed_prefix
             pred_flow, cache = self.forward(actions = denoised, time = time, cache = cache, return_cache = True, **kwargs)
             denoised = denoised + delta * pred_flow
+        # handle action inverse norm
+        if exists(self.action_normalizer):
+            denoised = self.action_normalizer.inverse_normalize(denoised)
+        # final set, with unnormalized prefix, if inpainting
+        if inpainting:
+            denoised[:, :prefix_len] = prefix_action_chunk
         return denoised
     def forward(
@@ -414,6 +501,8 @@ class MimicVideo(Module):
         time_video_denoise = 0., # 0 is noise in the scheme i prefer - default to their optimal choice, but can be changed
         prompts = None,
         prompt_token_ids = None,
+        detach_video_hiddens = False,
+        no_grad_video_model_forward = False,
         cache = None,
         return_cache = False,
         return_flow = False
@@ -421,6 +510,9 @@ class MimicVideo(Module):
         assert not exists(self.video_predict_wrapper) or (exists(prompts) ^ exists(prompt_token_ids))
         assert actions.shape[-2:] == self.action_shape
+        if exists(self.action_normalizer):
+            actions = self.action_normalizer.normalize(actions)
         batch, device = actions.shape[0], actions.device
         orig_actions = actions
@@ -435,7 +527,9 @@ class MimicVideo(Module):
             if not exists(video_hiddens):
                 assert exists(self.video_predict_wrapper), f'`video_predict_wrapper` must be passed in if raw video is passed into MimicVideo'
-                video_hiddens = self.video_predict_wrapper(video, prompts = prompts, prompt_token_ids = prompt_token_ids)
+                video_forward_wrap = eval_no_grad if no_grad_video_model_forward else identity
+                video_hiddens = video_forward_wrap(self.video_predict_wrapper)(video, prompts = prompts, prompt_token_ids = prompt_token_ids)
                 video_hiddens = video_hiddens.to(self.device).float() # maybe bfloat to float32
@@ -445,6 +539,9 @@ class MimicVideo(Module):
             # handle video hiddens
+            if detach_video_hiddens:
+                video_hiddens = video_hiddens.detach()
             video_hiddens = self.video_hidden_norm(video_hiddens)
         # handle caching

{mimic_video-0.0.24.dist-info → mimic_video-0.0.27.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mimic-video
-Version: 0.0.24
+Version: 0.0.27
 Summary: Mimic Video
 Project-URL: Homepage, https://pypi.org/project/mimic-video/
 Project-URL: Repository, https://github.com/lucidrains/mimic-video

mimic_video-0.0.27.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+mimic_video/__init__.py,sha256=Rs3QeBBGBKKi1U1ykcyeBrCL2XCbfNvppeeD1Fb1pdY,47
+mimic_video/cosmos_predict.py,sha256=2XR9cqcUC4gKpjEDBy-GtLtMkLXvs8yKe7w8g6EeS6s,8471
+mimic_video/mimic_video.py,sha256=WlwFfFvOW5k6X-BxRvF0zjwpKEET9C_FIyewD6_GmcE,20017
+mimic_video-0.0.27.dist-info/METADATA,sha256=al9--DJ_U_jwWilronv3IADdbCIuQfEQRMCJ3vEtE80,4581
+mimic_video-0.0.27.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+mimic_video-0.0.27.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+mimic_video-0.0.27.dist-info/RECORD,,

mimic_video-0.0.24.dist-info/RECORD DELETED Viewed

@@ -1,7 +0,0 @@
-mimic_video/__init__.py,sha256=Rs3QeBBGBKKi1U1ykcyeBrCL2XCbfNvppeeD1Fb1pdY,47
-mimic_video/cosmos_predict.py,sha256=2XR9cqcUC4gKpjEDBy-GtLtMkLXvs8yKe7w8g6EeS6s,8471
-mimic_video/mimic_video.py,sha256=Qr0Dc4z-LTRlTt0qXlgcJtdSP1pBsarXeOnJSUxj_yY,17388
-mimic_video-0.0.24.dist-info/METADATA,sha256=4kXYmqL3XtJbZ35iX42Z85RFV_ZGMM_phKGUZWnfcaw,4581
-mimic_video-0.0.24.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-mimic_video-0.0.24.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-mimic_video-0.0.24.dist-info/RECORD,,

{mimic_video-0.0.24.dist-info → mimic_video-0.0.27.dist-info}/WHEEL RENAMED Viewed

File without changes

{mimic_video-0.0.24.dist-info → mimic_video-0.0.27.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

mimic-video 0.0.24-py3-none-any.whl → 0.0.27-py3-none-any.whl

Potentially problematic release.

mimic-video 0.0.24-py3-none-any.whl → 0.0.27-py3-none-any.whl