d4rt 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: d4rt
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Implementation of D4RT, Efficiently Reconstructing Dynamic Scenes
5
5
  Project-URL: Homepage, https://pypi.org/project/d4rt/
6
6
  Project-URL: Repository, https://codeberg.org/lucidrains/d4rt
@@ -50,6 +50,70 @@ Description-Content-Type: text/markdown
50
50
 
51
51
  Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
52
52
 
53
+ ## install
54
+
55
+ ```shell
56
+ $ pip install d4rt
57
+ ```
58
+
59
+ ## usage
60
+
61
+ ```python
62
+ from torch import randn, randint
63
+ from d4rt.d4rt import D4RT
64
+
65
+ model = D4RT(
66
+ dim = 512,
67
+ video_image_size = 128,
68
+ video_patch_size = 32,
69
+ video_max_time_len = 10,
70
+ enc_depth = 6,
71
+ dec_depth = 6
72
+ )
73
+
74
+ videos = randn(2, 10, 3, 128, 128)
75
+
76
+ video_lens = randint(2, 10, (2,)) # handle variable-length videos; can be None to always use the max length
77
+
78
+ # inputs
79
+
80
+ coors = randint(0, 128, (2, 5, 2))
81
+ time_src = randint(0, 10, (2, 5))
82
+ time_tgt = randint(0, 10, (2, 5))
83
+ time_camera = randint(0, 10, (2, 5))
84
+
85
+ query_lens = randint(1, 5, (2,)) # handle variable-length queries
86
+
87
+ # output
88
+
89
+ points = randn(2, 5, 3)
90
+
91
+ loss = model(
92
+ videos,
93
+ video_lens = video_lens,
94
+ coors = coors,
95
+ time_src = time_src,
96
+ time_tgt = time_tgt,
97
+ time_camera = time_camera,
98
+ query_lens = query_lens,
99
+ points = points,
100
+ )
101
+
102
+ loss.backward()
103
+
104
+ # without giving the output, it returns the prediction
105
+
106
+ pred = model(
107
+ videos,
108
+ coors = coors,
109
+ time_src = time_src,
110
+ time_tgt = time_tgt,
111
+ time_camera = time_camera
112
+ )
113
+
114
+ assert pred.shape == (2, 5, 3)
115
+ ```
116
+
53
117
  ## citations
54
118
 
55
119
  ```bibtex
d4rt-0.0.4/README.md ADDED
@@ -0,0 +1,90 @@
1
+ <img src="./d4rt.png" width="400px"></img>
2
+
3
+ ## d4rt (wip)
4
+
5
+ Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
6
+
7
+ ## install
8
+
9
+ ```shell
10
+ $ pip install d4rt
11
+ ```
12
+
13
+ ## usage
14
+
15
+ ```python
16
+ from torch import randn, randint
17
+ from d4rt.d4rt import D4RT
18
+
19
+ model = D4RT(
20
+ dim = 512,
21
+ video_image_size = 128,
22
+ video_patch_size = 32,
23
+ video_max_time_len = 10,
24
+ enc_depth = 6,
25
+ dec_depth = 6
26
+ )
27
+
28
+ videos = randn(2, 10, 3, 128, 128)
29
+
30
+ video_lens = randint(2, 10, (2,)) # handle variable-length videos; can be None to always use the max length
31
+
32
+ # inputs
33
+
34
+ coors = randint(0, 128, (2, 5, 2))
35
+ time_src = randint(0, 10, (2, 5))
36
+ time_tgt = randint(0, 10, (2, 5))
37
+ time_camera = randint(0, 10, (2, 5))
38
+
39
+ query_lens = randint(1, 5, (2,)) # handle variable-length queries
40
+
41
+ # output
42
+
43
+ points = randn(2, 5, 3)
44
+
45
+ loss = model(
46
+ videos,
47
+ video_lens = video_lens,
48
+ coors = coors,
49
+ time_src = time_src,
50
+ time_tgt = time_tgt,
51
+ time_camera = time_camera,
52
+ query_lens = query_lens,
53
+ points = points,
54
+ )
55
+
56
+ loss.backward()
57
+
58
+ # without giving the output, it returns the prediction
59
+
60
+ pred = model(
61
+ videos,
62
+ coors = coors,
63
+ time_src = time_src,
64
+ time_tgt = time_tgt,
65
+ time_camera = time_camera
66
+ )
67
+
68
+ assert pred.shape == (2, 5, 3)
69
+ ```
70
+
71
+ ## citations
72
+
73
+ ```bibtex
74
+ @article{zhang2025d4rt,
75
+ title = {Efficiently Reconstructing Dynamic Scenes One D4RT at a Time},
76
+ author = {Zhang, Chuhan and Le Moing, Guillaume and Koppula, Skanda and Rocco, Ignacio and Momeni, Liliane and Xie, Junyu and Sun, Shuyang and Sukthankar, Rahul and Barral, Jo{\"e}lle K. and Hadsell, Raia and Ghahramani, Zoubin and Zisserman, Andrew and Zhang, Junlin and Sajjadi, Mehdi S. M.},
77
+ journal = {arXiv preprint},
78
+ year = {2025}
79
+ }
80
+ ```
81
+
82
+ ```bibtex
83
+ @inproceedings{liu2026geometryaware,
84
+ title = {Geometry-aware 4D Video Generation for Robot Manipulation},
85
+ author = {Zeyi Liu and Shuang Li and Eric Cousineau and Siyuan Feng and Benjamin Burchfiel and Shuran Song},
86
+ booktitle = {The Fourteenth International Conference on Learning Representations},
87
+ year = {2026},
88
+ url = {https://openreview.net/forum?id=18gC6pZVVc}
89
+ }
90
+ ```
@@ -10,9 +10,9 @@ from x_transformers import Encoder, CrossAttender, Attention, FeedForward
10
10
  # ein notation
11
11
 
12
12
  import einx
13
- from einops import rearrange
13
+ from einops import rearrange, repeat
14
14
  from einops.layers.torch import Rearrange
15
- from torch_einops_utils import pack_with_inverse
15
+ from torch_einops_utils import pack_with_inverse, lens_to_mask, maybe
16
16
 
17
17
  # helpers
18
18
 
@@ -24,7 +24,12 @@ def divisible_by(num, den):
24
24
 
25
25
  # function for the patch embedding in the query
26
26
 
27
- def extract_patches(video, coors, time_src, patch_size):
27
+ def extract_patches(
28
+ video, # float[b t c h w]
29
+ coors, # int[b q 2]
30
+ time_src, # int[b q]
31
+ patch_size
32
+ ):
28
33
  b, q, p, device = *time_src.shape, patch_size, video.device
29
34
 
30
35
  padded_video = F.pad(video, (p,) * 4)
@@ -112,11 +117,15 @@ class VideoEncoder(Module):
112
117
 
113
118
  def forward(
114
119
  self,
115
- video # float[b t c h w]
120
+ video, # float[b t c h w]
121
+ mask = None # bool[b t]
116
122
  ): # float[b n d]
117
123
 
118
124
  tokens = self.patch_to_tokens(video) # float[b t s d]
119
125
 
126
+ if exists(mask):
127
+ mask = repeat(mask, 'b ... -> (b s) ...', s = tokens.shape[-2])
128
+
120
129
  for spatial_attn, time_attn, ff in self.layers:
121
130
 
122
131
  # space attn
@@ -133,7 +142,7 @@ class VideoEncoder(Module):
133
142
 
134
143
  tokens, inverse_pack = pack_with_inverse(tokens, '* t d')
135
144
 
136
- tokens = time_attn(tokens) + tokens
145
+ tokens = time_attn(tokens, mask = mask) + tokens
137
146
 
138
147
  tokens = inverse_pack(tokens)
139
148
 
@@ -223,8 +232,14 @@ class D4RT(Module):
223
232
  time_camera = None, # int[b q]
224
233
  queries = None, # float[b q d]
225
234
  points = None, # float[b q 3]
226
- return_pred = False
235
+ return_pred = False,
236
+ video_lens = None, # int[b]
237
+ query_lens = None # int[b]
227
238
  ):
239
+ max_time = video.shape[1]
240
+
241
+ # embedding to queries
242
+
228
243
  assert (
229
244
  exists(queries) or
230
245
  all([exists(p) for p in (coors, time_src, time_tgt, time_camera)])
@@ -245,18 +260,39 @@ class D4RT(Module):
245
260
 
246
261
  queries = self.norm_queries(queries)
247
262
 
248
- global_spatial_repr = self.to_global_spatial_repr(video)
263
+ max_queries = queries.shape[1]
264
+
265
+ # self attention
266
+
267
+ video_mask = maybe(lens_to_mask)(video_lens, max_time)
268
+
269
+ global_spatial_repr = self.to_global_spatial_repr(video, mask = video_mask)
249
270
 
250
271
  global_spatial_repr, inverse_pack_spacetime = pack_with_inverse(global_spatial_repr, 'b * d')
251
272
 
252
- queried = self.cross_attender(queries, context = global_spatial_repr)
273
+ # cross attention
274
+
275
+ global_spatial_repr_mask = None
276
+
277
+ if exists(video_mask):
278
+ global_spatial_repr_mask = repeat(video_mask, 'b t -> b (t s)', s = global_spatial_repr.shape[1] // video_mask.shape[1])
279
+
280
+ queried = self.cross_attender(queries, context = global_spatial_repr, context_mask = global_spatial_repr_mask)
281
+
282
+ # prediction
253
283
 
254
284
  pred = self.to_pred(queried)
255
285
 
256
286
  if not exists(points):
257
287
  return pred
258
288
 
259
- loss = F.mse_loss(pred, points)
289
+ query_mask = maybe(lens_to_mask)(query_lens, max_queries)
290
+ var_len_queries = exists(query_mask)
291
+
292
+ loss = F.mse_loss(pred, points, reduction = 'none' if var_len_queries else 'mean')
293
+
294
+ if var_len_queries:
295
+ loss = loss[query_mask].mean()
260
296
 
261
297
  if not return_pred:
262
298
  return loss
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "d4rt"
3
- version = "0.0.2"
3
+ version = "0.0.4"
4
4
  description = "Implementation of D4RT, Efficiently Reconstructing Dynamic Scenes"
5
5
  authors = [
6
6
  { name = "Phil Wang", email = "lucidrains@gmail.com" }
d4rt-0.0.2/README.md DELETED
@@ -1,26 +0,0 @@
1
- <img src="./d4rt.png" width="400px"></img>
2
-
3
- ## d4rt (wip)
4
-
5
- Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
6
-
7
- ## citations
8
-
9
- ```bibtex
10
- @article{zhang2025d4rt,
11
- title = {Efficiently Reconstructing Dynamic Scenes One D4RT at a Time},
12
- author = {Zhang, Chuhan and Le Moing, Guillaume and Koppula, Skanda and Rocco, Ignacio and Momeni, Liliane and Xie, Junyu and Sun, Shuyang and Sukthankar, Rahul and Barral, Jo{\"e}lle K. and Hadsell, Raia and Ghahramani, Zoubin and Zisserman, Andrew and Zhang, Junlin and Sajjadi, Mehdi S. M.},
13
- journal = {arXiv preprint},
14
- year = {2025}
15
- }
16
- ```
17
-
18
- ```bibtex
19
- @inproceedings{liu2026geometryaware,
20
- title = {Geometry-aware 4D Video Generation for Robot Manipulation},
21
- author = {Zeyi Liu and Shuang Li and Eric Cousineau and Siyuan Feng and Benjamin Burchfiel and Shuran Song},
22
- booktitle = {The Fourteenth International Conference on Learning Representations},
23
- year = {2026},
24
- url = {https://openreview.net/forum?id=18gC6pZVVc}
25
- }
26
- ```
File without changes
File without changes
File without changes