dreamer4 0.1.10__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
- dreamer4/dreamer4.py +78 -13
- {dreamer4-0.1.10.dist-info → dreamer4-0.1.15.dist-info}/METADATA +1 -1
- dreamer4-0.1.15.dist-info/RECORD +8 -0
- dreamer4-0.1.10.dist-info/RECORD +0 -8
- {dreamer4-0.1.10.dist-info → dreamer4-0.1.15.dist-info}/WHEEL +0 -0
- {dreamer4-0.1.10.dist-info → dreamer4-0.1.15.dist-info}/licenses/LICENSE +0 -0
dreamer4/dreamer4.py
CHANGED
@@ -189,6 +189,13 @@ def sample_prob(prob):
 def is_power_two(num):
     return log2(num).is_integer()
 
+def maybe(fn):
+    def inner(t, *args, **kwargs):
+        if not exists(t) or not exists(fn):
+            return None
+        return fn(t)
+    return inner
+
 # tensor helpers
 
 def is_empty(t):

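The new maybe helper wraps a function so that None passes straight through instead of raising, and a missing function likewise yields None; the transformer forward pass below relies on it as maybe(pre_attn_rearrange)(residual_values). A quick usage sketch, reusing maybe as defined in the hunk above and the module's existing exists helper (exists(v) is v is not None):

double = maybe(lambda t: t * 2)

assert double(3) == 6          # a present value goes through the wrapped function
assert double(None) is None    # None short-circuits instead of raising
assert maybe(None)(3) is None  # a missing function also yields None
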
@@ -223,6 +230,14 @@ def mean_log_var_to_distr(
     std = (0.5 * log_var).exp()
     return Normal(mean, std)
 
+def safe_stack(tensors, dim = 0):
+    tensors = [*filter(exists, tensors)]
+
+    if len(tensors) == 0:
+        return None
+
+    return stack(tensors, dim = dim)
+
 def safe_cat(tensors, dim):
     tensors = [*filter(exists, tensors)]
 

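safe_stack mirrors the existing safe_cat: drop None entries, return None when nothing is left, otherwise stack along dim. This is what later lets TransformerIntermediates stack normed-input lists that may have stayed empty. A usage sketch, reusing safe_stack as defined above and assuming stack is the module's torch.stack import:

import torch

a = torch.randn(2, 3)

print(safe_stack([a, None, a]).shape)  # None entries filtered -> torch.Size([2, 2, 3])
print(safe_stack([None, None]))        # nothing left to stack -> None
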
@@ -1276,7 +1291,8 @@ class Attention(Module):
         pre_rmsnorm = True,
         gate_values = True,
         rmsnorm_query = False, # a paper claims that it is better to just norm only the keys https://openreview.net/forum?id=HkztQWZfl2
-        rmsnorm_key = True
+        rmsnorm_key = True,
+        value_residual = True
     ):
         super().__init__()
         self.norm = RMSNorm(dim) if pre_rmsnorm else Identity()

@@ -1315,6 +1331,14 @@ class Attention(Module):
         self.q_heads_rmsnorm = MultiHeadRMSNorm(dim_head, heads = query_heads) if rmsnorm_query else nn.Identity()
         self.k_heads_rmsnorm = MultiHeadRMSNorm(dim_head, heads = heads) if rmsnorm_key else nn.Identity()
 
+        # value residual
+
+        self.to_learned_value_residual_mix = nn.Sequential(
+            nn.Linear(dim, heads),
+            Rearrange('b n h -> b h n 1'),
+            nn.Sigmoid()
+        ) if value_residual else None
+
     def muon_parameters(self):
         # omit the queries and keys for now given what we learned from kimi 2 paper
 

@@ -1329,6 +1353,7 @@
         kv_cache = None,
         return_intermediates = False,
         rotary_pos_emb = None,
+        residual_values = None, # (b n h d)
         attend_fn: Callable | None = None
     ):
         tokens, inverse_packed_batch = pack_one(tokens, '* n d')

@@ -1341,6 +1366,17 @@
 
         q, k, v = map(self.split_heads, (q, k, v))
 
+        # handle maybe value residual
+
+        if exists(residual_values):
+            residual_values = rearrange(residual_values, '... n h d -> (...) h n d')
+
+            assert exists(self.to_learned_value_residual_mix)
+
+            learned_mix = self.to_learned_value_residual_mix(tokens)
+
+            v = v.lerp(residual_values, learned_mix)
+
         # qk rmsnorm
 
         q = self.q_heads_rmsnorm(q)

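Taken together, these Attention changes implement value residual learning (https://arxiv.org/abs/2410.17897): every layer receives values projected once from the transformer's initial tokens (the residual values) and lerps its own values toward them through a learned per-head, per-token sigmoid gate. A self-contained sketch of just the mixing step; the shapes and the to_mix name are illustrative stand-ins for the module's to_learned_value_residual_mix:

import torch
from torch import nn
from einops.layers.torch import Rearrange

b, h, n, d, dim = 2, 8, 16, 64, 512   # hypothetical batch, heads, seq len, head dim, model dim

to_mix = nn.Sequential(               # learned gate in [0, 1] per head and position
    nn.Linear(dim, h),
    Rearrange('b n h -> b h n 1'),
    nn.Sigmoid()
)

tokens = torch.randn(b, n, dim)
v = torch.randn(b, h, n, d)           # this layer's values
residual_v = torch.randn(b, h, n, d)  # values projected from the initial tokens

mix = to_mix(tokens)                  # (b, h, n, 1)
v = v.lerp(residual_v, mix)           # v * (1 - mix) + residual_v * mix
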
@@ -1424,6 +1460,7 @@ class AxialSpaceTimeTransformer(Module):
         self,
         dim,
         depth,
+        attn_heads = 8,
         attn_dim_head = 64,
         attn_softclamp_value = 50.,
         time_block_every = 4,

@@ -1432,7 +1469,8 @@
         num_residual_streams = 1,
         num_special_spatial_tokens = 1,
         special_attend_only_itself = False, # this is set to True for the video tokenizer decoder (latents can only attend to itself while spatial modalities attend to the latents and everything)
-        final_norm = True
+        final_norm = True,
+        value_residual = True # https://arxiv.org/abs/2410.17897 - but with learned mixing from OSS
     ):
         super().__init__()
         assert depth >= time_block_every, f'depth must be at least {time_block_every}'

@@ -1453,6 +1491,19 @@
 
         self.time_rotary = Rotary1D(attn_dim_head)
 
+        # project initial for value residuals
+
+        self.value_residual = value_residual
+
+        if value_residual:
+            dim_inner = attn_dim_head * attn_heads
+
+            self.to_value_residual = nn.Sequential(
+                nn.RMSNorm(dim),
+                nn.Linear(dim, dim_inner, bias = False),
+                Rearrange('... (h d) -> ... h d', h = attn_heads)
+            )
+
         # transformer
 
         layers = []

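On the transformer side, value_residual costs a single extra projection, computed once per forward pass from the initial tokens and shared by every attention layer. A shape sketch under the defaults above; the model dim and token shape are hypothetical, and nn.RMSNorm assumes torch >= 2.4:

import torch
from torch import nn
from einops.layers.torch import Rearrange

dim, attn_heads, attn_dim_head = 512, 8, 64
dim_inner = attn_dim_head * attn_heads

to_value_residual = nn.Sequential(
    nn.RMSNorm(dim),
    nn.Linear(dim, dim_inner, bias = False),
    Rearrange('... (h d) -> ... h d', h = attn_heads)
)

tokens = torch.randn(2, 4, 9, dim)      # (batch, time, space, dim)
print(to_value_residual(tokens).shape)  # torch.Size([2, 4, 9, 8, 64]), i.e. (b, t, s, h, d)
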
@@ -1464,13 +1515,13 @@
             is_time_block = divisible_by(layer_index, time_block_every)
             is_time.append(is_time_block)
 
-            rearrange_to_attend = Rearrange('b t s
-            rearrange_from_attend = Rearrange('b s t
+            rearrange_to_attend = Rearrange('b t s ... -> b s t ...') if is_time_block else Identity()
+            rearrange_from_attend = Rearrange('b s t ... -> b t s ...') if is_time_block else Identity()
 
             layers.append(ModuleList([
                 rearrange_to_attend,
                 rearrange_from_attend,
-                hyper_conn(branch = Attention(dim = dim, dim_head = attn_dim_head, **attn_kwargs)),
+                hyper_conn(branch = Attention(dim = dim, heads = attn_heads, dim_head = attn_dim_head, value_residual = value_residual, **attn_kwargs)),
                 hyper_conn(branch = SwiGLUFeedforward(dim = dim, **ff_kwargs))
             ]))

@@ -1521,7 +1572,6 @@ class AxialSpaceTimeTransformer(Module):
 
         time_attn_kv_caches = []
 
-
         if has_kv_cache:
             past_tokens, tokens = tokens[:, :-1], tokens[:, -1:]
 

@@ -1539,6 +1589,13 @@
 
         rotary_pos_emb = self.time_rotary(rotary_seq_len, offset = rotary_pos_offset)
 
+        # value residual
+
+        residual_values = None
+
+        if self.value_residual:
+            residual_values = self.to_value_residual(tokens)
+
         # normed attention inputs
 
         normed_time_attn_inputs = []

@@ -1562,6 +1619,10 @@
 
             maybe_kv_cache = next(iter_kv_cache, None) if layer_is_time else None
 
+            # residual values
+
+            layer_residual_values = maybe(pre_attn_rearrange)(residual_values)
+
             # attention layer
 
             tokens, attn_intermediates = attn(

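This line is where maybe earns its keep: time blocks attend over a transposed (b s t ...) layout, so the shared residual values must be rearranged exactly like the tokens before entering each layer, and the maybe wrapper keeps this a no-op when value_residual is disabled and residual_values is None. The '...' in the new rearrange patterns is presumably what lets one Rearrange serve both the (b t s d) tokens and the (b t s h d) residual values. A sketch with hypothetical shapes, reusing maybe from the first hunk:

import torch
from einops.layers.torch import Rearrange

pre_attn_rearrange = Rearrange('b t s ... -> b s t ...')  # a time block's rearrange

residual_values = torch.randn(2, 4, 9, 8, 64)             # (b, t, s, h, d)
print(maybe(pre_attn_rearrange)(residual_values).shape)   # torch.Size([2, 9, 4, 8, 64])
print(maybe(pre_attn_rearrange)(None))                    # None, so the attention skips the mix
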
@@ -1569,6 +1630,7 @@
                 rotary_pos_emb = layer_rotary_pos_emb,
                 attend_fn = attend_fn,
                 kv_cache = maybe_kv_cache,
+                residual_values = layer_residual_values,
                 return_intermediates = True
             )
 

@@ -1602,8 +1664,8 @@ class AxialSpaceTimeTransformer(Module):
 
         intermediates = TransformerIntermediates(
             stack(time_attn_kv_caches),
-
-
+            safe_stack(normed_time_attn_inputs),
+            safe_stack(normed_space_attn_inputs)
         )
 
         return out, intermediates

@@ -1881,8 +1943,11 @@ class VideoTokenizer(Module):
         time_decorr_loss = space_decorr_loss = self.zero
 
         if self.encoder_add_decor_aux_loss:
-
-
+            if exists(time_attn_normed_inputs):
+                time_decorr_loss = self.decorr_loss(time_attn_normed_inputs)
+
+            if exists(space_attn_normed_inputs):
+                space_decorr_loss = self.decorr_loss(space_attn_normed_inputs)
 
         # losses
 

@@ -1920,10 +1985,9 @@ class DynamicsWorldModel(Module):
         depth = 4,
         pred_orig_latent = True, # directly predicting the original x0 data yield better results, rather than velocity (x-space vs v-space)
         time_block_every = 4, # every 4th block is time
-        attn_kwargs: dict = dict(
-            heads = 8,
-        ),
+        attn_kwargs: dict = dict(),
         transformer_kwargs: dict = dict(),
+        attn_heads = 8,
         attn_dim_head = 64,
         attn_softclamp_value = 50.,
         ff_kwargs: dict = dict(),

@@ -2136,6 +2200,7 @@
         self.transformer = AxialSpaceTimeTransformer(
             dim = dim,
             depth = depth,
+            attn_heads = attn_heads,
             attn_dim_head = attn_dim_head,
             attn_softclamp_value = attn_softclamp_value,
             attn_kwargs = attn_kwargs,

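For callers, the net effect of these last two hunks is that the head count moves from an attn_kwargs entry to a first-class attn_heads argument, which DynamicsWorldModel forwards to AxialSpaceTimeTransformer (where it is needed to size to_value_residual). Schematically, with the constructor calls left as comments since the remaining required arguments are not shown in this diff:

# before (0.1.10): head count buried in attn_kwargs
# model = DynamicsWorldModel(..., attn_kwargs = dict(heads = 8))

# after (0.1.15): head count as a top-level argument, alongside attn_dim_head
# model = DynamicsWorldModel(..., attn_heads = 8, attn_dim_head = 64)
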
dreamer4-0.1.15.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+dreamer4/__init__.py,sha256=Jssh1obzDRtTfBLZl36kXge1cIQlMjf_8DyjPulvKSk,183
+dreamer4/dreamer4.py,sha256=BVMAIfhqv7wO0FWo-SBfUnyXEQcMljh6CyaHeZ8GmCI,125018
+dreamer4/mocks.py,sha256=TfqOB_Gq6N_GggBYwa6ZAJQx38ntlYbXZe23Ne4jshw,2502
+dreamer4/trainers.py,sha256=h_BMi-P2QMVi-IWQCkejPmyA0UzHgKtE1n7Qn1-IrxE,15093
+dreamer4-0.1.15.dist-info/METADATA,sha256=ghChOd76397jZ_XwFwKRv1lxP1ZFqNgQfSKBUB7DXoo,4973
+dreamer4-0.1.15.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dreamer4-0.1.15.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
+dreamer4-0.1.15.dist-info/RECORD,,

dreamer4-0.1.10.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-dreamer4/__init__.py,sha256=Jssh1obzDRtTfBLZl36kXge1cIQlMjf_8DyjPulvKSk,183
-dreamer4/dreamer4.py,sha256=_xr_XJGfqhCabRV0vnue4zypHZ4kXeUDZp1N6RF2AoY,122988
-dreamer4/mocks.py,sha256=TfqOB_Gq6N_GggBYwa6ZAJQx38ntlYbXZe23Ne4jshw,2502
-dreamer4/trainers.py,sha256=h_BMi-P2QMVi-IWQCkejPmyA0UzHgKtE1n7Qn1-IrxE,15093
-dreamer4-0.1.10.dist-info/METADATA,sha256=oTK9b_fWDCQTC89Y30OBY_2BzJJ6ih25BzgO0D-SApg,4973
-dreamer4-0.1.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-dreamer4-0.1.10.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
-dreamer4-0.1.10.dist-info/RECORD,,

{dreamer4-0.1.10.dist-info → dreamer4-0.1.15.dist-info}/WHEEL
File without changes

{dreamer4-0.1.10.dist-info → dreamer4-0.1.15.dist-info}/licenses/LICENSE
File without changes