dreamer4 0.0.70.tar.gz → 0.0.72.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dreamer4 might be problematic.
- {dreamer4-0.0.70 → dreamer4-0.0.72}/PKG-INFO +1 -1
- {dreamer4-0.0.70 → dreamer4-0.0.72}/dreamer4/dreamer4.py +98 -20
- {dreamer4-0.0.70 → dreamer4-0.0.72}/pyproject.toml +1 -1
- {dreamer4-0.0.70 → dreamer4-0.0.72}/tests/test_dreamer.py +22 -4
- {dreamer4-0.0.70 → dreamer4-0.0.72}/.github/workflows/python-publish.yml +0 -0
- {dreamer4-0.0.70 → dreamer4-0.0.72}/.github/workflows/test.yml +0 -0
- {dreamer4-0.0.70 → dreamer4-0.0.72}/.gitignore +0 -0
- {dreamer4-0.0.70 → dreamer4-0.0.72}/LICENSE +0 -0
- {dreamer4-0.0.70 → dreamer4-0.0.72}/README.md +0 -0
- {dreamer4-0.0.70 → dreamer4-0.0.72}/dreamer4/__init__.py +0 -0
- {dreamer4-0.0.70 → dreamer4-0.0.72}/dreamer4/mocks.py +0 -0
- {dreamer4-0.0.70 → dreamer4-0.0.72}/dreamer4/trainers.py +0 -0
- {dreamer4-0.0.70 → dreamer4-0.0.72}/dreamer4-fig2.png +0 -0
dreamer4/dreamer4.py

@@ -45,6 +45,7 @@ from assoc_scan import AssocScan
 # vc - video channels
 # vh, vw - video height and width
 # mtp - multi token prediction length
+# v - video viewpoints
 
 import einx
 from einx import add, multiply
@@ -154,6 +155,15 @@ def is_power_two(num):
 def is_empty(t):
     return t.numel() == 0
 
+def lens_to_mask(t, max_len = None):
+    if not exists(max_len):
+        max_len = t.amax().item()
+
+    device = t.device
+    seq = torch.arange(max_len, device = device)
+
+    return einx.less('j, i -> i j', seq, t)
+
 def log(t, eps = 1e-20):
     return t.clamp(min = eps).log()
 
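The new lens_to_mask helper converts per-sample sequence lengths into a boolean mask, later used to mask padded frames out of the losses. A minimal sketch of its behavior (assuming torch and einx are installed, and that exists(v) is the usual "v is not None" helper from this file):

import torch
import einx

def exists(v):
    return v is not None

def lens_to_mask(t, max_len = None):
    # default the mask width to the longest sequence in the batch
    if not exists(max_len):
        max_len = t.amax().item()

    seq = torch.arange(max_len, device = t.device)

    # mask[i, j] is True while position j lies inside sequence i
    return einx.less('j, i -> i j', seq, t)

lens = torch.tensor([2, 4])
print(lens_to_mask(lens))
# tensor([[ True,  True, False, False],
#         [ True,  True,  True,  True]])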
@@ -1735,6 +1745,7 @@ class DynamicsWorldModel(Module):
         num_latent_tokens = None,
         num_agents = 1,
         num_tasks = 0,
+        num_video_views = 1,
         dim_proprio = None,
         reward_encoder_kwargs: dict = dict(),
         depth = 4,
@@ -1800,7 +1811,7 @@ class DynamicsWorldModel(Module):
         )
 
         self.to_latent_pred = Sequential(
-            Reduce('b t n s d -> b t n d', 'mean'),
+            Reduce('b t v n s d -> b t v n d', 'mean'),
             RMSNorm(dim),
             LinearNoBias(dim, dim_latent)
         )
@@ -1810,17 +1821,27 @@ class DynamicsWorldModel(Module):
         latent_tokens_to_space = num_latent_tokens // num_spatial_tokens
 
         self.latents_to_spatial_tokens = Sequential(
-            Rearrange('b t n d -> b t (n d)'),
+            Rearrange('... n d -> ... (n d)'),
             Linear(num_latent_tokens * dim_latent, dim * num_spatial_tokens),
-            Rearrange('b t (s d) -> b t s d', s = num_spatial_tokens)
+            Rearrange('... (s d) -> ... s d', s = num_spatial_tokens)
         )
 
         self.to_latent_pred = Sequential(
             RMSNorm(dim),
             LinearNoBias(dim, dim_latent * latent_tokens_to_space),
-            Rearrange('b t s (n d) -> b t (s n) d', n = latent_tokens_to_space)
+            Rearrange('b t v s (n d) -> b t v (s n) d', n = latent_tokens_to_space)
         )
 
+        # number of video views, for robotics, which could have third person + wrist camera at least
+
+        assert num_video_views >= 1
+        self.video_has_multi_view = num_video_views > 1
+
+        self.num_video_views = num_video_views
+
+        if self.video_has_multi_view:
+            self.view_emb = nn.Parameter(torch.randn(num_video_views, dim) * 1e-2)
+
         # proprioception
 
         self.has_proprio = exists(dim_proprio)
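The new view_emb parameter gives each camera view its own learned embedding. The diff does not show where it is added in, but a plausible broadcast over the per-view spatial tokens, using the einx.add already imported at the top of the file, might look like the sketch below (shapes follow the file's dimension-name comments; the application site is an assumption):

import torch
from einx import add

b, t, v, s, d = 2, 3, 2, 8, 16   # batch, time, views, spatial tokens, model dim

space_tokens = torch.randn(b, t, v, s, d)
view_emb = torch.randn(v, d) * 1e-2   # mirrors nn.Parameter(torch.randn(num_video_views, dim) * 1e-2)

# hypothetical: broadcast each view's embedding onto all of its spatial tokens
space_tokens = add('b t v s d, v d -> b t v s d', space_tokens, view_emb)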
@@ -2318,7 +2339,7 @@ class DynamicsWorldModel(Module):
         # denoising
         # teacher forcing to start with
 
-        latents = empty((batch_size, 0, *latent_shape), device = self.device)
+        latents = empty((batch_size, 0, self.num_video_views, *latent_shape), device = self.device)
 
         past_latents_context_noise = latents.clone()
 
@@ -2354,7 +2375,7 @@ class DynamicsWorldModel(Module):
 
         curr_time_steps = latents.shape[1]
 
-        noised_latent = randn((batch_size, 1, *latent_shape), device = self.device)
+        noised_latent = randn((batch_size, 1, self.num_video_views, *latent_shape), device = self.device)
 
         noised_proprio = None
 
@@ -2365,12 +2386,12 @@ class DynamicsWorldModel(Module):
             is_last_step = (step + 1) == num_steps
 
             signal_levels = full((batch_size, 1), step * step_size, dtype = torch.long, device = self.device)
-
+
             # noising past latent context
 
             noised_context = latents.lerp(past_latents_context_noise, context_signal_noise) # the paragraph after eq (8)
 
-            noised_latent_with_context, pack_context_shape = pack((noised_context, noised_latent), 'b * n d')
+            noised_latent_with_context, pack_context_shape = pack((noised_context, noised_latent), 'b * v n d')
 
             # handle proprio
 
@@ -2395,6 +2416,7 @@ class DynamicsWorldModel(Module):
                 proprio = noised_proprio_with_context,
                 time_kv_cache = time_kv_cache,
                 latent_is_noised = True,
+                latent_has_view_dim = True,
                 return_pred_only = True,
                 return_intermediates = True,
             )
@@ -2409,12 +2431,11 @@ class DynamicsWorldModel(Module):
 
             # unpack pred
 
-            _, pred = unpack(pred, pack_context_shape, 'b * n d')
+            _, pred = unpack(pred, pack_context_shape, 'b * v n d')
 
             if has_proprio:
                 _, pred_proprio = unpack(pred_proprio, pack_context_shape, 'b * d')
 
-
             # derive flow, based on whether in x-space or not
 
             def denoise_step(pred, noised, signal_levels):
@@ -2507,12 +2528,26 @@ class DynamicsWorldModel(Module):
         video = None
 
         if return_decoded_video:
+
+            latents_for_video = rearrange(latents, 'b t v n d -> b v t n d')
+            latents_for_video, unpack_view = pack_one(latents_for_video, '* t n d')
+
             video = self.video_tokenizer.decode(
-                latents,
+                latents_for_video,
                 height = image_height,
                 width = image_width
             )
 
+            video = unpack_view(video, '* t c vh vw')
+
+        # remove the lone view dimension
+
+        if not self.video_has_multi_view:
+            latents = rearrange(latents, 'b t 1 ... -> b t ...')
+
+            if exists(video):
+                video = rearrange(video, 'b 1 ... -> b ...')
+
         # only return video or latent if not requesting anything else, for first stage training
 
         if not has_at_least_one(return_rewards_per_frame, return_agent_actions, has_proprio):
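The video tokenizer's decode still expects single-view input, so the view axis is folded into the batch before decoding and restored afterwards. pack_one here is the common single-tensor convenience wrapper around einops pack/unpack; a minimal sketch of the pattern with plain einops (the decoder call itself is stubbed out):

import torch
from einops import pack, unpack, rearrange

b, t, v, n, d = 2, 3, 2, 4, 16
latents = torch.randn(b, t, v, n, d)

# fold views into the batch so a single-view decoder can run unchanged
x = rearrange(latents, 'b t v n d -> b v t n d')
x, ps = pack([x], '* t n d')               # (b * v, t, n, d)

video = x                                   # stand-in for video_tokenizer.decode(x, ...)

# restore the (b, v) leading axes on whatever the decoder returned
[video] = unpack(video, ps, '* t n d')      # back to (b, v, t, n, d)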
@@ -2553,8 +2588,9 @@ class DynamicsWorldModel(Module):
     def forward(
         self,
         *,
-        video = None,   # (b c t vh vw)
-        latents = None, # (b t n d) | (b t d)
+        video = None,   # (b v? c t vh vw)
+        latents = None, # (b t v? n d) | (b t v? d)
+        lens = None,    # (b)
         signal_levels = None, # () | (b) | (b t)
         step_sizes = None, # () | (b)
         step_sizes_log2 = None, # () | (b)
@@ -2572,21 +2608,39 @@ class DynamicsWorldModel(Module):
         return_all_losses = False,
         return_intermediates = False,
         add_autoregressive_action_loss = False,
-        update_loss_ema = None
+        update_loss_ema = None,
+        latent_has_view_dim = False
     ):
         # handle video or latents
 
         assert exists(video) ^ exists(latents)
 
+        # standardize view dimension
+
+        if not self.video_has_multi_view:
+            if exists(video):
+                video = rearrange(video, 'b ... -> b 1 ...')
+
+            if exists(latents) and not latent_has_view_dim:
+                latents = rearrange(latents, 'b t ... -> b t 1 ...')
+
+        # if raw video passed in, tokenize
+
         if exists(video):
+            assert video.ndim == 6
+
+            video, unpack_views = pack_one(video, '* c t vh vw')
             assert exists(self.video_tokenizer), 'video_tokenizer must be passed in if training from raw video on dynamics model'
 
             latents = self.video_tokenizer.tokenize(video)
+            latents = unpack_views(latents, '* t n d')
+            latents = rearrange(latents, 'b v t n d -> b t v n d')
 
-        if latents.ndim == 3:
-            latents = rearrange(latents, 'b t d -> b t 1 d') # 1 latent edge case
+        if latents.ndim == 4:
+            latents = rearrange(latents, 'b t v d -> b t v 1 d') # 1 latent edge case
 
         assert latents.shape[-2:] == self.latent_shape
+        assert latents.shape[2] == self.num_video_views
 
         # variables
 
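The net effect is that single-view callers keep the old shapes while multi-view callers add a v axis. A short sketch of both calling conventions against the new assertions (pure shape bookkeeping, no model required; sizes are illustrative):

import torch
from einops import rearrange

# single view: callers still pass (b c t vh vw); forward inserts v = 1 itself
video = torch.randn(2, 3, 10, 64, 64)
video = rearrange(video, 'b ... -> b 1 ...')   # (b v c t vh vw)
assert video.ndim == 6

# multi view: callers pass (b v c t vh vw) directly, with v == num_video_views
multi_view_video = torch.randn(2, 2, 3, 10, 64, 64)
assert multi_view_video.ndim == 6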
@@ -2769,6 +2823,7 @@ class DynamicsWorldModel(Module):
         # main function, needs to be defined as such for shortcut training - additional calls for consistency loss
 
         def get_prediction(noised_latents, noised_proprio, signal_levels, step_sizes_log2, action_tokens, reward_tokens, agent_tokens, return_agent_tokens = False, return_time_kv_cache = False):
+
             # latents to spatial tokens
 
             space_tokens = self.latents_to_spatial_tokens(noised_latents)
@@ -2969,7 +3024,19 @@ class DynamicsWorldModel(Module):
 
         flow_losses = flow_losses * loss_weight
 
-        flow_loss = flow_losses.mean()
+        # handle variable lengths if needed
+
+        is_var_len = exists(lens)
+
+        if is_var_len:
+
+            loss_mask = lens_to_mask(lens, time)
+            loss_mask_without_last = loss_mask[:, :-1]
+
+            flow_loss = flow_losses[loss_mask].mean()
+
+        else:
+            flow_loss = flow_losses.mean()
 
         # now take care of the agent token losses
 
@@ -2992,7 +3059,10 @@ class DynamicsWorldModel(Module):
 
         reward_losses = reward_losses.masked_fill(reward_loss_mask, 0.)
 
-        reward_loss = reduce(reward_losses, '... mtp -> mtp', 'mean') # they sum across the prediction steps (mtp dimension) - eq(9)
+        if is_var_len:
+            reward_loss = reward_losses[loss_mask_without_last].mean(dim = 0)
+        else:
+            reward_loss = reduce(reward_losses, '... mtp -> mtp', 'mean') # they sum across the prediction steps (mtp dimension) - eq(9)
 
         # maybe autoregressive action loss
 
@@ -3035,12 +3105,20 @@ class DynamicsWorldModel(Module):
         if exists(discrete_log_probs):
             discrete_log_probs = discrete_log_probs.masked_fill(~discrete_mask[..., None], 0.)
 
-            discrete_action_loss = reduce(-discrete_log_probs, 'mtp b t na -> mtp', 'mean')
+            if is_var_len:
+                discrete_action_losses = rearrange(-discrete_log_probs, 'mtp b t na -> b t na mtp')
+                discrete_action_loss = reduce(discrete_action_losses[loss_mask_without_last], '... mtp -> mtp', 'mean')
+            else:
+                discrete_action_loss = reduce(-discrete_log_probs, 'mtp b t na -> mtp', 'mean')
 
         if exists(continuous_log_probs):
             continuous_log_probs = continuous_log_probs.masked_fill(~continuous_mask[..., None], 0.)
 
-            continuous_action_loss = reduce(-continuous_log_probs, 'mtp b t na -> mtp', 'mean')
+            if is_var_len:
+                continuous_action_losses = rearrange(-continuous_log_probs, 'mtp b t na -> b t na mtp')
+                continuous_action_loss = reduce(continuous_action_losses[loss_mask_without_last], '... mtp -> mtp', 'mean')
+            else:
+                continuous_action_loss = reduce(-continuous_log_probs, 'mtp b t na -> mtp', 'mean')
 
         # handle loss normalization
 
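The variable-length branches all follow the same pattern: build a (b, t) mask from lens, drop the final step (targets are one step ahead), then boolean-index before averaging so padded frames never contribute. A small numeric sketch of that reduction (illustrative shapes only):

import torch

b, t, mtp = 2, 4, 2
lens = torch.tensor([2, 4])

loss_mask = torch.arange(t)[None, :] < lens[:, None]   # (b, t), same result as lens_to_mask
loss_mask_without_last = loss_mask[:, :-1]             # (b, t - 1)

losses = torch.randn(b, t - 1, mtp)                    # per-step losses with an mtp axis

# boolean indexing keeps only the valid (batch, time) cells, giving (num_valid, mtp)
loss_per_mtp = losses[loss_mask_without_last].mean(dim = 0)   # (mtp,)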
tests/test_dreamer.py

@@ -16,6 +16,7 @@ def exists(v):
 @param('num_residual_streams', (1, 4))
 @param('add_reward_embed_to_agent_token', (False, True))
 @param('use_time_kv_cache', (False, True))
+@param('var_len', (False, True))
 def test_e2e(
     pred_orig_latent,
     grouped_query_attn,
@@ -27,7 +28,8 @@ def test_e2e(
     condition_on_actions,
     num_residual_streams,
     add_reward_embed_to_agent_token,
-    use_time_kv_cache
+    use_time_kv_cache,
+    var_len
 ):
     from dreamer4.dreamer4 import VideoTokenizer, DynamicsWorldModel
 
@@ -95,8 +97,13 @@ def test_e2e(
     if condition_on_actions:
         actions = torch.randint(0, 4, (2, 3, 1))
 
+    lens = None
+    if var_len:
+        lens = torch.randint(1, 4, (2,))
+
     flow_loss = dynamics(
         **dynamics_input,
+        lens = lens,
         tasks = tasks,
         signal_levels = signal_levels,
         step_sizes_log2 = step_sizes_log2,
@@ -668,7 +675,10 @@ def test_online_rl(
 
     trainer(mock_env, num_episodes = 2, env_is_vectorized = vectorized)
 
-def test_proprioception():
+@param('num_video_views', (1, 2))
+def test_proprioception(
+    num_video_views
+):
     from dreamer4.dreamer4 import VideoTokenizer, DynamicsWorldModel
 
     tokenizer = VideoTokenizer(
@@ -693,11 +703,17 @@ def test_proprioception():
         dim_latent = 32,
         dim_proprio = 21,
         num_tasks = 4,
+        num_video_views = num_video_views,
         num_discrete_actions = 4,
         num_residual_streams = 1
     )
 
-    video = torch.randn(2, 3, 10, 256, 256)
+    if num_video_views > 1:
+        video_shape = (2, num_video_views, 3, 10, 256, 256)
+    else:
+        video_shape = (2, 3, 10, 256, 256)
+
+    video = torch.randn(*video_shape)
     rewards = torch.randn(2, 10)
     proprio = torch.randn(2, 10, 21)
     discrete_actions = torch.randint(0, 4, (2, 10, 1))
@@ -714,8 +730,10 @@ def test_proprioception():
     loss.backward()
 
     generations = dynamics.generate(
-
+        10,
         batch_size = 2,
+        return_decoded_video = True
     )
 
     assert exists(generations.proprio)
+    assert generations.video.shape == video_shape