dreamer4 0.1.4-py3-none-any.whl → 0.1.15-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
dreamer4/dreamer4.py CHANGED
@@ -14,7 +14,7 @@ from torch.nested import nested_tensor
  from torch.distributions import Normal, kl
  from torch.nn import Module, ModuleList, Embedding, Parameter, Sequential, Linear, RMSNorm, Identity
  from torch import nn, cat, stack, arange, tensor, Tensor, is_tensor, full, zeros, ones, randint, rand, randn, randn_like, empty, full, linspace, arange
- from torch.utils._pytree import tree_flatten, tree_unflatten
+ from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten

  import torchvision
  from torchvision.models import VGG16_Weights
@@ -27,6 +27,8 @@ from x_mlps_pytorch.normed_mlp import create_mlp

  from hyper_connections import get_init_and_expand_reduce_stream_functions

+ from vit_pytorch.vit_with_decorr import DecorrelationLoss
+
  from assoc_scan import AssocScan

  # ein related
@@ -68,10 +70,14 @@ except ImportError:

  LinearNoBias = partial(Linear, bias = False)

- TokenizerLosses = namedtuple('TokenizerLosses', ('recon', 'lpips'))
+ TokenizerLosses = namedtuple('TokenizerLosses', ('recon', 'lpips', 'time_decorr', 'space_decorr'))

  WorldModelLosses = namedtuple('WorldModelLosses', ('flow', 'rewards', 'discrete_actions', 'continuous_actions'))

+ AttentionIntermediates = namedtuple('AttentionIntermediates', ('next_kv_cache', 'normed_inputs'))
+
+ TransformerIntermediates = namedtuple('TransformerIntermediates', ('next_kv_cache', 'normed_time_inputs', 'normed_space_inputs'))
+
  MaybeTensor = Tensor | None

  @dataclass
@@ -91,6 +97,14 @@ class Experience:
      agent_index: int = 0
      is_from_world_model: bool = True

+     def cpu(self):
+         return self.to(torch.device('cpu'))
+
+     def to(self, device):
+         experience_dict = asdict(self)
+         experience_dict = tree_map(lambda t: t.to(device) if is_tensor(t) else t, experience_dict)
+         return Experience(**experience_dict)
+
  def combine_experiences(
      exps: list[Experiences]
  ) -> Experience:
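
The new `to`/`cpu` methods lean on torch's pytree utilities: `asdict` turns the dataclass into a dict, and `tree_map` applies the device move only to tensor leaves, so plain fields like `agent_index` pass through unchanged (`SimTrainer` later uses `.cpu()` to offload stored rollouts). A minimal, self-contained sketch of that pattern, with a toy `state` dict standing in for `asdict(Experience)`:

```python
import torch
from torch import is_tensor
from torch.utils._pytree import tree_map

# a toy nested container standing in for asdict(Experience)
state = dict(
    latents = torch.randn(2, 4),
    rewards = [torch.zeros(3)],
    agent_index = 0
)

# move only the tensor leaves, leave everything else untouched
moved = tree_map(lambda t: t.to('cpu') if is_tensor(t) else t, state)

assert moved['agent_index'] == 0              # non-tensor fields are untouched
assert moved['latents'].device.type == 'cpu'  # tensors were moved
```
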
@@ -175,6 +189,13 @@ def sample_prob(prob):
  def is_power_two(num):
      return log2(num).is_integer()

+ def maybe(fn):
+     def inner(t, *args, **kwargs):
+         if not exists(t) or not exists(fn):
+             return None
+         return fn(t)
+     return inner
+
  # tensor helpers

  def is_empty(t):
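
The new `maybe` helper wraps a function so that a `None` input (or a `None` function) short-circuits to `None` instead of raising; the transformer below uses it as `maybe(pre_attn_rearrange)(residual_values)`. A self-contained demonstration of the behavior exactly as defined in this hunk:

```python
def exists(v):
    return v is not None

def maybe(fn):
    def inner(t, *args, **kwargs):
        if not exists(t) or not exists(fn):
            return None
        return fn(t)
    return inner

double = maybe(lambda x: x * 2)

assert double(3) == 6            # ordinary call passes through
assert double(None) is None      # None input short-circuits
assert maybe(None)(3) is None    # a missing function also yields None
```
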
@@ -209,6 +230,14 @@ def mean_log_var_to_distr(
      std = (0.5 * log_var).exp()
      return Normal(mean, std)

+ def safe_stack(tensors, dim = 0):
+     tensors = [*filter(exists, tensors)]
+
+     if len(tensors) == 0:
+         return None
+
+     return stack(tensors, dim = dim)
+
  def safe_cat(tensors, dim):
      tensors = [*filter(exists, tensors)]

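
`safe_stack` mirrors the existing `safe_cat`: `None` entries are filtered out before stacking, and an all-`None` list collapses to `None` rather than erroring — which is what lets the transformer below return `None` for the time or space inputs when a branch recorded nothing. A quick check of those semantics:

```python
import torch
from torch import stack

def exists(v):
    return v is not None

def safe_stack(tensors, dim = 0):
    tensors = [*filter(exists, tensors)]
    if len(tensors) == 0:
        return None
    return stack(tensors, dim = dim)

a, b = torch.ones(3), torch.zeros(3)

assert safe_stack([a, None, b]).shape == (2, 3)  # Nones are dropped before stacking
assert safe_stack([None, None]) is None          # nothing to stack -> None, not an error
```
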
@@ -1262,7 +1291,8 @@ class Attention(Module):
          pre_rmsnorm = True,
          gate_values = True,
          rmsnorm_query = False, # a paper claims that it is better to just norm only the keys https://openreview.net/forum?id=HkztQWZfl2
-         rmsnorm_key = True
+         rmsnorm_key = True,
+         value_residual = True
      ):
          super().__init__()
          self.norm = RMSNorm(dim) if pre_rmsnorm else Identity()
@@ -1301,6 +1331,14 @@ class Attention(Module):
          self.q_heads_rmsnorm = MultiHeadRMSNorm(dim_head, heads = query_heads) if rmsnorm_query else nn.Identity()
          self.k_heads_rmsnorm = MultiHeadRMSNorm(dim_head, heads = heads) if rmsnorm_key else nn.Identity()

+         # value residual
+
+         self.to_learned_value_residual_mix = nn.Sequential(
+             nn.Linear(dim, heads),
+             Rearrange('b n h -> b h n 1'),
+             nn.Sigmoid()
+         ) if value_residual else None
+
      def muon_parameters(self):
          # omit the queries and keys for now given what we learned from kimi 2 paper

@@ -1313,8 +1351,9 @@ class Attention(Module):
          self,
          tokens, # (b n d)
          kv_cache = None,
-         return_kv_cache = False,
+         return_intermediates = False,
          rotary_pos_emb = None,
+         residual_values = None, # (b n h d)
          attend_fn: Callable | None = None
      ):
          tokens, inverse_packed_batch = pack_one(tokens, '* n d')
@@ -1327,6 +1366,17 @@ class Attention(Module):

          q, k, v = map(self.split_heads, (q, k, v))

+         # handle maybe value residual
+
+         if exists(residual_values):
+             residual_values = rearrange(residual_values, '... n h d -> (...) h n d')
+
+             assert exists(self.to_learned_value_residual_mix)
+
+             learned_mix = self.to_learned_value_residual_mix(tokens)
+
+             v = v.lerp(residual_values, learned_mix)
+
          # qk rmsnorm

          q = self.q_heads_rmsnorm(q)
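
The block above is the learned value-residual mix (https://arxiv.org/abs/2410.17897, with the learned mixing noted in the `value_residual` comment below): a per-head sigmoid gate produced from the tokens interpolates this layer's values toward values projected from the first layer's input. A standalone sketch with toy shapes:

```python
import torch
from torch import nn
from einops.layers.torch import Rearrange

batch, seq, dim, heads, dim_head = 2, 16, 64, 4, 16

# per-head gates in (0, 1), one scalar per token position
to_mix = nn.Sequential(
    nn.Linear(dim, heads),
    Rearrange('b n h -> b h n 1'),
    nn.Sigmoid()
)

tokens = torch.randn(batch, seq, dim)
v = torch.randn(batch, heads, seq, dim_head)               # this layer's values
residual_values = torch.randn(batch, heads, seq, dim_head) # projected from the first layer's input

mix = to_mix(tokens)              # (b h n 1)
v = v.lerp(residual_values, mix)  # v * (1 - mix) + residual_values * mix
```
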
@@ -1367,10 +1417,10 @@ class Attention(Module):

          out = inverse_packed_batch(out)

-         if not return_kv_cache:
+         if not return_intermediates:
              return out

-         return out, stack((k, v))
+         return out, AttentionIntermediates(stack((k, v)), tokens)

  # feedforward

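
Returning a namedtuple instead of a bare `stack((k, v))` keeps positional unpacking working for existing callers while giving new consumers named access to the extra `normed_inputs` field. A tiny illustration of the pattern:

```python
from collections import namedtuple
import torch

AttentionIntermediates = namedtuple('AttentionIntermediates', ('next_kv_cache', 'normed_inputs'))

k = v = torch.randn(1, 4, 8, 16)
normed = torch.randn(1, 8, 64)

inter = AttentionIntermediates(torch.stack((k, v)), normed)

kv_cache, normed_inputs = inter           # tuple-style unpacking still works
assert inter.next_kv_cache.shape[0] == 2  # named access for new consumers
```
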
@@ -1410,6 +1460,7 @@ class AxialSpaceTimeTransformer(Module):
          self,
          dim,
          depth,
+         attn_heads = 8,
          attn_dim_head = 64,
          attn_softclamp_value = 50.,
          time_block_every = 4,
@@ -1418,7 +1469,8 @@ class AxialSpaceTimeTransformer(Module):
          num_residual_streams = 1,
          num_special_spatial_tokens = 1,
          special_attend_only_itself = False, # this is set to True for the video tokenizer decoder (latents can only attend to itself while spatial modalities attend to the latents and everything)
-         final_norm = True
+         final_norm = True,
+         value_residual = True # https://arxiv.org/abs/2410.17897 - but with learned mixing from OSS
      ):
          super().__init__()
          assert depth >= time_block_every, f'depth must be at least {time_block_every}'
@@ -1439,6 +1491,19 @@ class AxialSpaceTimeTransformer(Module):

          self.time_rotary = Rotary1D(attn_dim_head)

+         # project initial for value residuals
+
+         self.value_residual = value_residual
+
+         if value_residual:
+             dim_inner = attn_dim_head * attn_heads
+
+             self.to_value_residual = nn.Sequential(
+                 nn.RMSNorm(dim),
+                 nn.Linear(dim, dim_inner, bias = False),
+                 Rearrange('... (h d) -> ... h d', h = attn_heads)
+             )
+
          # transformer

          layers = []
@@ -1450,13 +1515,13 @@ class AxialSpaceTimeTransformer(Module):
              is_time_block = divisible_by(layer_index, time_block_every)
              is_time.append(is_time_block)

-             rearrange_to_attend = Rearrange('b t s d -> b s t d') if is_time_block else Identity()
-             rearrange_from_attend = Rearrange('b s t d -> b t s d') if is_time_block else Identity()
+             rearrange_to_attend = Rearrange('b t s ... -> b s t ...') if is_time_block else Identity()
+             rearrange_from_attend = Rearrange('b s t ... -> b t s ...') if is_time_block else Identity()

              layers.append(ModuleList([
                  rearrange_to_attend,
                  rearrange_from_attend,
-                 hyper_conn(branch = Attention(dim = dim, dim_head = attn_dim_head, **attn_kwargs)),
+                 hyper_conn(branch = Attention(dim = dim, heads = attn_heads, dim_head = attn_dim_head, value_residual = value_residual, **attn_kwargs)),
                  hyper_conn(branch = SwiGLUFeedforward(dim = dim, **ff_kwargs))
              ]))

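
The rearrange patterns widen `d` to `...` because the per-layer residual values carry an extra heads dimension (`b t s h d`) and must ride through the same time/space transposition as the tokens. The underlying axial trick is unchanged: for a time block, space is folded toward batch so attention runs along the time axis. A toy check:

```python
import torch
from einops import rearrange

b, t, s, d = 2, 10, 64, 32

tokens = torch.randn(b, t, s, d)
time_major = rearrange(tokens, 'b t s ... -> b s t ...')  # attend over t; s becomes batch-like
assert time_major.shape == (b, s, t, d)

values = torch.randn(b, t, s, 4, 16)  # (b t s h d) residual values with a heads dim
assert rearrange(values, 'b t s ... -> b s t ...').shape == (b, s, t, 4, 16)
```
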
@@ -1484,7 +1549,7 @@ class AxialSpaceTimeTransformer(Module):
          self,
          tokens, # (b t s d)
          kv_cache: Tensor | None = None, # (y 2 b h t d)
-         return_kv_cache = False
+         return_intermediates = False

      ): # (b t s d) | (y 2 b h t d)

@@ -1507,7 +1572,6 @@ class AxialSpaceTimeTransformer(Module):

          time_attn_kv_caches = []

-
          if has_kv_cache:
              past_tokens, tokens = tokens[:, :-1], tokens[:, -1:]

@@ -1525,6 +1589,18 @@ class AxialSpaceTimeTransformer(Module):

          rotary_pos_emb = self.time_rotary(rotary_seq_len, offset = rotary_pos_offset)

+         # value residual
+
+         residual_values = None
+
+         if self.value_residual:
+             residual_values = self.to_value_residual(tokens)
+
+         # normed attention inputs
+
+         normed_time_attn_inputs = []
+         normed_space_attn_inputs = []
+
          # attention

          tokens = self.expand_streams(tokens)
@@ -1543,14 +1619,19 @@ class AxialSpaceTimeTransformer(Module):

              maybe_kv_cache = next(iter_kv_cache, None) if layer_is_time else None

+             # residual values
+
+             layer_residual_values = maybe(pre_attn_rearrange)(residual_values)
+
              # attention layer

-             tokens, next_kv_cache = attn(
+             tokens, attn_intermediates = attn(
                  tokens,
                  rotary_pos_emb = layer_rotary_pos_emb,
                  attend_fn = attend_fn,
                  kv_cache = maybe_kv_cache,
-                 return_kv_cache = True
+                 residual_values = layer_residual_values,
+                 return_intermediates = True
              )

              tokens = post_attn_rearrange(tokens)
@@ -1562,7 +1643,13 @@ class AxialSpaceTimeTransformer(Module):
              # save kv cache if is time layer

              if layer_is_time:
-                 time_attn_kv_caches.append(next_kv_cache)
+                 time_attn_kv_caches.append(attn_intermediates.next_kv_cache)
+
+             # save time attention inputs for decorr
+
+             space_or_time_inputs = normed_time_attn_inputs if layer_is_time else normed_space_attn_inputs
+
+             space_or_time_inputs.append(attn_intermediates.normed_inputs)

          tokens = self.reduce_streams(tokens)

@@ -1572,10 +1659,16 @@ class AxialSpaceTimeTransformer(Module):
              # just concat the past tokens back on for now, todo - clean up the logic
              out = cat((past_tokens, out), dim = 1)

-         if not return_kv_cache:
+         if not return_intermediates:
              return out

-         return out, stack(time_attn_kv_caches)
+         intermediates = TransformerIntermediates(
+             stack(time_attn_kv_caches),
+             safe_stack(normed_time_attn_inputs),
+             safe_stack(normed_space_attn_inputs)
+         )
+
+         return out, intermediates

  # video tokenizer

@@ -1601,12 +1694,15 @@ class VideoTokenizer(Module):
          per_image_patch_mask_prob = (0., 0.9), # probability of patch masking appears to be per image probabilities drawn uniformly between 0. and 0.9 - if you are a phd student and think i'm mistakened, please open an issue
          lpips_loss_network: Module | None = None,
          lpips_loss_weight = 0.2,
+         encoder_add_decor_aux_loss = False,
+         decor_auxx_loss_weight = 0.1,
+         decorr_sample_frac = 0.25,
          nd_rotary_kwargs: dict = dict(
              rope_min_freq = 1.,
              rope_max_freq = 10000.,
              rope_p_zero_freqs = 0.
          ),
-         num_residual_streams = 1
+         num_residual_streams = 1,
      ):
          super().__init__()

@@ -1701,6 +1797,14 @@ class VideoTokenizer(Module):
          if self.has_lpips_loss:
              self.lpips = LPIPSLoss(lpips_loss_network)

+         # decorr aux loss
+         # https://arxiv.org/abs/2510.14657
+
+         self.encoder_add_decor_aux_loss = encoder_add_decor_aux_loss
+         self.decorr_aux_loss_weight = decor_auxx_loss_weight
+
+         self.decorr_loss = DecorrelationLoss(decorr_sample_frac, soft_validate_num_sampled = True) if encoder_add_decor_aux_loss else None
+
      @property
      def device(self):
          return self.zero.device
@@ -1814,7 +1918,7 @@ class VideoTokenizer(Module):

          # encoder attention

-         tokens = self.encoder_transformer(tokens)
+         tokens, (_, time_attn_normed_inputs, space_attn_normed_inputs) = self.encoder_transformer(tokens, return_intermediates = True)

          # latent bottleneck

@@ -1836,17 +1940,28 @@ class VideoTokenizer(Module):
          if self.has_lpips_loss:
              lpips_loss = self.lpips(video, recon_video)

+         time_decorr_loss = space_decorr_loss = self.zero
+
+         if self.encoder_add_decor_aux_loss:
+             if exists(time_attn_normed_inputs):
+                 time_decorr_loss = self.decorr_loss(time_attn_normed_inputs)
+
+             if exists(space_attn_normed_inputs):
+                 space_decorr_loss = self.decorr_loss(space_attn_normed_inputs)
+
          # losses

          total_loss = (
              recon_loss +
-             lpips_loss * self.lpips_loss_weight
+             lpips_loss * self.lpips_loss_weight +
+             time_decorr_loss * self.decorr_aux_loss_weight +
+             space_decorr_loss * self.decorr_aux_loss_weight
          )

          if not return_all_losses:
              return total_loss

-         losses = (recon_loss, lpips_loss)
+         losses = (recon_loss, lpips_loss, decorr_loss)

          return total_loss, TokenizerLosses(*losses)

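
The `DecorrelationLoss` imported from `vit_pytorch.vit_with_decorr` implements the auxiliary objective of arXiv:2510.14657 over the encoder's normed attention inputs. As an illustration only (this is not the vit-pytorch implementation), a penalty of this family can be written as the squared off-diagonal entries of the feature covariance, pushing channels toward decorrelation:

```python
import torch

def decorrelation_penalty(feats):
    # feats: (..., n, d) - flatten all leading dims, center the features,
    # then penalize off-diagonal covariance so channels stay decorrelated
    feats = feats.reshape(-1, feats.shape[-1])
    feats = feats - feats.mean(dim = 0)
    cov = (feats.T @ feats) / feats.shape[0]   # (d, d) covariance
    off_diag = cov - torch.diag(cov.diagonal())
    return off_diag.pow(2).mean()

loss = decorrelation_penalty(torch.randn(8, 128, 32))
```
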
@@ -1870,10 +1985,9 @@ class DynamicsWorldModel(Module):
          depth = 4,
          pred_orig_latent = True, # directly predicting the original x0 data yield better results, rather than velocity (x-space vs v-space)
          time_block_every = 4, # every 4th block is time
-         attn_kwargs: dict = dict(
-             heads = 8,
-         ),
+         attn_kwargs: dict = dict(),
          transformer_kwargs: dict = dict(),
+         attn_heads = 8,
          attn_dim_head = 64,
          attn_softclamp_value = 50.,
          ff_kwargs: dict = dict(),
@@ -2086,6 +2200,7 @@ class DynamicsWorldModel(Module):
          self.transformer = AxialSpaceTimeTransformer(
              dim = dim,
              depth = depth,
+             attn_heads = attn_heads,
              attn_dim_head = attn_dim_head,
              attn_softclamp_value = attn_softclamp_value,
              attn_kwargs = attn_kwargs,
@@ -2435,6 +2550,8 @@ class DynamicsWorldModel(Module):
      ):
          assert isinstance(experience, Experience)

+         experience = experience.to(self.device)
+
          latents = experience.latents
          actions = experience.actions
          old_log_probs = experience.log_probs
@@ -3325,7 +3442,7 @@ class DynamicsWorldModel(Module):

          # attention

-         tokens, next_time_kv_cache = self.transformer(tokens, kv_cache = time_kv_cache, return_kv_cache = True)
+         tokens, (next_time_kv_cache, *_) = self.transformer(tokens, kv_cache = time_kv_cache, return_intermediates = True)

          # unpack

dreamer4/trainers.py CHANGED
@@ -528,7 +528,7 @@ class SimTrainer(Module):

              total_experience += num_experience

-             experiences.append(experience)
+             experiences.append(experience.cpu())

          combined_experiences = combine_experiences(experiences)

{dreamer4-0.1.4.dist-info → dreamer4-0.1.15.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dreamer4
- Version: 0.1.4
+ Version: 0.1.15
  Summary: Dreamer 4
  Project-URL: Homepage, https://pypi.org/project/dreamer4/
  Project-URL: Repository, https://github.com/lucidrains/dreamer4
@@ -44,6 +44,7 @@ Requires-Dist: hl-gauss-pytorch
  Requires-Dist: hyper-connections>=0.2.1
  Requires-Dist: torch>=2.4
  Requires-Dist: torchvision
+ Requires-Dist: vit-pytorch>=1.15.3
  Requires-Dist: x-mlps-pytorch>=0.0.29
  Provides-Extra: examples
  Provides-Extra: test
@@ -57,7 +58,7 @@ Description-Content-Type: text/markdown

  Implementation of Danijar's [latest iteration](https://arxiv.org/abs/2509.24527v1) for his [Dreamer](https://danijar.com/project/dreamer4/) line of work

- [Discord channel](https://discord.gg/ab4BEk3W) for collaborating with other researchers interested in this work
+ [Discord channel](https://discord.gg/PmGR7KRwxq) for collaborating with other researchers interested in this work

  ## Appreciation

@@ -90,7 +91,7 @@ video = torch.randn(2, 3, 10, 256, 256)
  # learn the tokenizer

  loss = tokenizer(video)
- loss.backward() # ler
+ loss.backward()

  # dynamics world model

dreamer4-0.1.15.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+ dreamer4/__init__.py,sha256=Jssh1obzDRtTfBLZl36kXge1cIQlMjf_8DyjPulvKSk,183
+ dreamer4/dreamer4.py,sha256=BVMAIfhqv7wO0FWo-SBfUnyXEQcMljh6CyaHeZ8GmCI,125018
+ dreamer4/mocks.py,sha256=TfqOB_Gq6N_GggBYwa6ZAJQx38ntlYbXZe23Ne4jshw,2502
+ dreamer4/trainers.py,sha256=h_BMi-P2QMVi-IWQCkejPmyA0UzHgKtE1n7Qn1-IrxE,15093
+ dreamer4-0.1.15.dist-info/METADATA,sha256=ghChOd76397jZ_XwFwKRv1lxP1ZFqNgQfSKBUB7DXoo,4973
+ dreamer4-0.1.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ dreamer4-0.1.15.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+ dreamer4-0.1.15.dist-info/RECORD,,
dreamer4-0.1.4.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
- dreamer4/__init__.py,sha256=Jssh1obzDRtTfBLZl36kXge1cIQlMjf_8DyjPulvKSk,183
- dreamer4/dreamer4.py,sha256=ghestMgz7B1oEqBRR0XkkdWe0kkh7bshhzmi6-n-XIs,120790
- dreamer4/mocks.py,sha256=TfqOB_Gq6N_GggBYwa6ZAJQx38ntlYbXZe23Ne4jshw,2502
- dreamer4/trainers.py,sha256=JsnJwQJcbI_75KBTNddG6b7QVkO6LD1N_HQiVe-VnCM,15087
- dreamer4-0.1.4.dist-info/METADATA,sha256=GkzuqKtNJJCSh5FycWJOr49253_w926biJkSz9ic4TQ,4941
- dreamer4-0.1.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- dreamer4-0.1.4.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
- dreamer4-0.1.4.dist-info/RECORD,,