PyPI - d4rt - Versions diffs - 0.0.2__tar.gz → 0.0.4__tar.gz - Mend

d4rt 0.0.2tar.gz → 0.0.4tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

{d4rt-0.0.2 → d4rt-0.0.4}/PKG-INFO +65 -1
d4rt-0.0.4/README.md +90 -0
{d4rt-0.0.2 → d4rt-0.0.4}/d4rt/d4rt.py +45 -9
{d4rt-0.0.2 → d4rt-0.0.4}/pyproject.toml +1 -1
d4rt-0.0.2/README.md +0 -26
{d4rt-0.0.2 → d4rt-0.0.4}/.gitignore +0 -0
{d4rt-0.0.2 → d4rt-0.0.4}/LICENSE +0 -0
{d4rt-0.0.2 → d4rt-0.0.4}/d4rt/__init__.py +0 -0

{d4rt-0.0.2 → d4rt-0.0.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: d4rt
-Version: 0.0.2
+Version: 0.0.4
 Summary: Implementation of D4RT, Efficiently Reconstructing Dynamic Scenes
 Project-URL: Homepage, https://pypi.org/project/d4rt/
 Project-URL: Repository, https://codeberg.org/lucidrains/d4rt
@@ -50,6 +50,70 @@ Description-Content-Type: text/markdown
 Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
+## install
+```shell
+$ pip install d4rt
+```
+## usage
+```python
+from torch import randn, randint
+from d4rt.d4rt import D4RT
+model = D4RT(
+    dim = 512,
+    video_image_size = 128,
+    video_patch_size = 32,
+    video_max_time_len = 10,
+    enc_depth = 6,
+    dec_depth = 6
+)
+videos = randn(2, 10, 3, 128, 128)
+video_lens = randint(2, 10, (2,)) # handle variable-length videos; can be None when every video has the max length
+# inputs
+coors = randint(0, 128, (2, 5, 2))
+time_src = randint(0, 10, (2, 5))
+time_tgt = randint(0, 10, (2, 5))
+time_camera = randint(0, 10, (2, 5))
+query_lens = randint(1, 5, (2,)) # handle variable-length queries
+# output
+points = randn(2, 5, 3)
+loss = model(
+    videos,
+    video_lens = video_lens,
+    coors = coors,
+    time_src = time_src,
+    time_tgt = time_tgt,
+    time_camera = time_camera,
+    query_lens = query_lens,
+    points = points,
+)
+loss.backward()
+# without giving the output, it returns the prediction
+pred = model(
+    videos,
+    coors = coors,
+    time_src = time_src,
+    time_tgt = time_tgt,
+    time_camera = time_camera
+)
+assert pred.shape == (2, 5, 3)
+```
 ## citations
 ```bibtex

d4rt-0.0.4/README.md ADDED Viewed

@@ -0,0 +1,90 @@
+<img src="./d4rt.png" width="400px"></img>
+## d4rt (wip)
+Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
+## install
+```shell
+$ pip install d4rt
+```
+## usage
+```python
+from torch import randn, randint
+from d4rt.d4rt import D4RT
+model = D4RT(
+    dim = 512,
+    video_image_size = 128,
+    video_patch_size = 32,
+    video_max_time_len = 10,
+    enc_depth = 6,
+    dec_depth = 6
+)
+videos = randn(2, 10, 3, 128, 128)
+video_lens = randint(2, 10, (2,)) # handle variable-length videos; can be None when every video has the max length
+# inputs
+coors = randint(0, 128, (2, 5, 2))
+time_src = randint(0, 10, (2, 5))
+time_tgt = randint(0, 10, (2, 5))
+time_camera = randint(0, 10, (2, 5))
+query_lens = randint(1, 5, (2,)) # handle variable-length queries
+# output
+points = randn(2, 5, 3)
+loss = model(
+    videos,
+    video_lens = video_lens,
+    coors = coors,
+    time_src = time_src,
+    time_tgt = time_tgt,
+    time_camera = time_camera,
+    query_lens = query_lens,
+    points = points,
+)
+loss.backward()
+# without giving the output, it returns the prediction
+pred = model(
+    videos,
+    coors = coors,
+    time_src = time_src,
+    time_tgt = time_tgt,
+    time_camera = time_camera
+)
+assert pred.shape == (2, 5, 3)
+```
+## citations
+```bibtex
+@article{zhang2025d4rt,
+    title   = {Efficiently Reconstructing Dynamic Scenes One D4RT at a Time},
+    author  = {Zhang, Chuhan and Le Moing, Guillaume and Koppula, Skanda and Rocco, Ignacio and Momeni, Liliane and Xie, Junyu and Sun, Shuyang and Sukthankar, Rahul and Barral, Jo{\"e}lle K. and Hadsell, Raia and Ghahramani, Zoubin and Zisserman, Andrew and Zhang, Junlin and Sajjadi, Mehdi S. M.},
+    journal = {arXiv preprint},
+    year    = {2025}
+}
+```
+```bibtex
+@inproceedings{liu2026geometryaware,
+    title   = {Geometry-aware 4D Video Generation for Robot Manipulation},
+    author  = {Zeyi Liu and Shuang Li and Eric Cousineau and Siyuan Feng and Benjamin Burchfiel and Shuran Song},
+    booktitle = {The Fourteenth International Conference on Learning Representations},
+    year    = {2026},
+    url     = {https://openreview.net/forum?id=18gC6pZVVc}
+}
+```

{d4rt-0.0.2 → d4rt-0.0.4}/d4rt/d4rt.py RENAMED Viewed

@@ -10,9 +10,9 @@ from x_transformers import Encoder, CrossAttender, Attention, FeedForward
 # ein notation
 import einx
-from einops import rearrange
+from einops import rearrange, repeat
 from einops.layers.torch import Rearrange
-from torch_einops_utils import pack_with_inverse
+from torch_einops_utils import pack_with_inverse, lens_to_mask, maybe
 # helpers
@@ -24,7 +24,12 @@ def divisible_by(num, den):
 # function for the patch embedding in the query
-def extract_patches(video, coors, time_src, patch_size):
+def extract_patches(
+    video,      # float[b t c h w]
+    coors,      # int[b q 2]
+    time_src,   # int[b q]
+    patch_size
+):
     b, q, p, device = *time_src.shape, patch_size, video.device
     padded_video = F.pad(video, (p,) * 4)
@@ -112,11 +117,15 @@ class VideoEncoder(Module):
     def forward(
         self,
-        video # float[b t c h w]
+        video,      # float[b t c h w]
+        mask = None # bool[b t]
     ): # float[b n d]
         tokens = self.patch_to_tokens(video) # float[b t s d]
+        if exists(mask):
+            mask = repeat(mask, 'b ... -> (b s) ...', s = tokens.shape[-2])
         for spatial_attn, time_attn, ff in self.layers:
             # space attn
@@ -133,7 +142,7 @@ class VideoEncoder(Module):
             tokens, inverse_pack = pack_with_inverse(tokens, '* t d')
-            tokens = time_attn(tokens) + tokens
+            tokens = time_attn(tokens,  mask = mask) + tokens
             tokens = inverse_pack(tokens)
@@ -223,8 +232,14 @@ class D4RT(Module):
         time_camera = None, # int[b q]
         queries = None,     # float[b q d]
         points = None,      # float[b q 3]
-        return_pred = False
+        return_pred = False,
+        video_lens = None,  # int[b]
+        query_lens = None   # int[b]
     ):
+        max_time = video.shape[1]
+        # embedding to queries
         assert (
             exists(queries) or
             all([exists(p) for p in (coors, time_src, time_tgt, time_camera)])
@@ -245,18 +260,39 @@ class D4RT(Module):
             queries = self.norm_queries(queries)
-        global_spatial_repr = self.to_global_spatial_repr(video)
+        max_queries = queries.shape[1]
+        # self attention
+        video_mask = maybe(lens_to_mask)(video_lens, max_time)
+        global_spatial_repr = self.to_global_spatial_repr(video, mask = video_mask)
         global_spatial_repr, inverse_pack_spacetime = pack_with_inverse(global_spatial_repr, 'b * d')
-        queried = self.cross_attender(queries, context = global_spatial_repr)
+        # cross attention
+        global_spatial_repr_mask = None
+        if exists(video_mask):
+            global_spatial_repr_mask = repeat(video_mask, 'b t -> b (t s)', s = global_spatial_repr.shape[1] // video_mask.shape[1])
+        queried = self.cross_attender(queries, context = global_spatial_repr, context_mask = global_spatial_repr_mask)
+        # prediction
         pred = self.to_pred(queried)
         if not exists(points):
             return pred
-        loss = F.mse_loss(pred, points)
+        query_mask = maybe(lens_to_mask)(query_lens, max_queries)
+        var_len_queries = exists(query_mask)
+        loss = F.mse_loss(pred, points, reduction = 'none' if var_len_queries else 'mean')
+        if var_len_queries:
+            loss = loss[query_mask].mean()
         if not return_pred:
             return loss

{d4rt-0.0.2 → d4rt-0.0.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "d4rt"
-version = "0.0.2"
+version = "0.0.4"
 description = "Implementation of D4RT, Efficiently Reconstructing Dynamic Scenes"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

d4rt-0.0.2/README.md DELETED Viewed

@@ -1,26 +0,0 @@
-<img src="./d4rt.png" width="400px"></img>
-## d4rt (wip)
-Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
-## citations
-```bibtex
-@article{zhang2025d4rt,
-    title   = {Efficiently Reconstructing Dynamic Scenes One D4RT at a Time},
-    author  = {Zhang, Chuhan and Le Moing, Guillaume and Koppula, Skanda and Rocco, Ignacio and Momeni, Liliane and Xie, Junyu and Sun, Shuyang and Sukthankar, Rahul and Barral, Jo{\"e}lle K. and Hadsell, Raia and Ghahramani, Zoubin and Zisserman, Andrew and Zhang, Junlin and Sajjadi, Mehdi S. M.},
-    journal = {arXiv preprint},
-    year    = {2025}
-}
-```
-```bibtex
-@inproceedings{liu2026geometryaware,
-    title   = {Geometry-aware 4D Video Generation for Robot Manipulation},
-    author  = {Zeyi Liu and Shuang Li and Eric Cousineau and Siyuan Feng and Benjamin Burchfiel and Shuran Song},
-    booktitle = {The Fourteenth International Conference on Learning Representations},
-    year    = {2026},
-    url     = {https://openreview.net/forum?id=18gC6pZVVc}
-}
-```

{d4rt-0.0.2 → d4rt-0.0.4}/.gitignore RENAMED Viewed

File without changes

{d4rt-0.0.2 → d4rt-0.0.4}/LICENSE RENAMED Viewed

File without changes

{d4rt-0.0.2 → d4rt-0.0.4}/d4rt/__init__.py RENAMED Viewed

File without changes

d4rt 0.0.2__tar.gz → 0.0.4__tar.gz

d4rt 0.0.2tar.gz → 0.0.4tar.gz