d4rt 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {d4rt-0.0.2 → d4rt-0.0.4}/PKG-INFO +65 -1
- d4rt-0.0.4/README.md +90 -0
- {d4rt-0.0.2 → d4rt-0.0.4}/d4rt/d4rt.py +45 -9
- {d4rt-0.0.2 → d4rt-0.0.4}/pyproject.toml +1 -1
- d4rt-0.0.2/README.md +0 -26
- {d4rt-0.0.2 → d4rt-0.0.4}/.gitignore +0 -0
- {d4rt-0.0.2 → d4rt-0.0.4}/LICENSE +0 -0
- {d4rt-0.0.2 → d4rt-0.0.4}/d4rt/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: d4rt
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Implementation of D4RT, Efficiently Reconstructing Dynamic Scenes
|
|
5
5
|
Project-URL: Homepage, https://pypi.org/project/d4rt/
|
|
6
6
|
Project-URL: Repository, https://codeberg.org/lucidrains/d4rt
|
|
@@ -50,6 +50,70 @@ Description-Content-Type: text/markdown
|
|
|
50
50
|
|
|
51
51
|
Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
|
|
52
52
|
|
|
53
|
+
## install
|
|
54
|
+
|
|
55
|
+
```shell
|
|
56
|
+
$ pip install d4rt
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## usage
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from torch import randn, randint
|
|
63
|
+
from d4rt.d4rt import D4RT
|
|
64
|
+
|
|
65
|
+
model = D4RT(
|
|
66
|
+
dim = 512,
|
|
67
|
+
video_image_size = 128,
|
|
68
|
+
video_patch_size = 32,
|
|
69
|
+
video_max_time_len = 10,
|
|
70
|
+
enc_depth = 6,
|
|
71
|
+
dec_depth = 6
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
videos = randn(2, 10, 3, 128, 128)
|
|
75
|
+
|
|
76
|
+
video_lens = randint(2, 10, (2,)) # handle variable-length videos; pass None to treat every video as max length
|
|
77
|
+
|
|
78
|
+
# inputs
|
|
79
|
+
|
|
80
|
+
coors = randint(0, 128, (2, 5, 2))
|
|
81
|
+
time_src = randint(0, 10, (2, 5))
|
|
82
|
+
time_tgt = randint(0, 10, (2, 5))
|
|
83
|
+
time_camera = randint(0, 10, (2, 5))
|
|
84
|
+
|
|
85
|
+
query_lens = randint(1, 5, (2,)) # handle variable-length queries
|
|
86
|
+
|
|
87
|
+
# output
|
|
88
|
+
|
|
89
|
+
points = randn(2, 5, 3)
|
|
90
|
+
|
|
91
|
+
loss = model(
|
|
92
|
+
videos,
|
|
93
|
+
video_lens = video_lens,
|
|
94
|
+
coors = coors,
|
|
95
|
+
time_src = time_src,
|
|
96
|
+
time_tgt = time_tgt,
|
|
97
|
+
time_camera = time_camera,
|
|
98
|
+
query_lens = query_lens,
|
|
99
|
+
points = points,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
loss.backward()
|
|
103
|
+
|
|
104
|
+
# without giving the output, it returns the prediction
|
|
105
|
+
|
|
106
|
+
pred = model(
|
|
107
|
+
videos,
|
|
108
|
+
coors = coors,
|
|
109
|
+
time_src = time_src,
|
|
110
|
+
time_tgt = time_tgt,
|
|
111
|
+
time_camera = time_camera
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
assert pred.shape == (2, 5, 3)
|
|
115
|
+
```
|
|
116
|
+
|
|
53
117
|
## citations
|
|
54
118
|
|
|
55
119
|
```bibtex
|
d4rt-0.0.4/README.md
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
<img src="./d4rt.png" width="400px"></img>
|
|
2
|
+
|
|
3
|
+
## d4rt (wip)
|
|
4
|
+
|
|
5
|
+
Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
|
|
6
|
+
|
|
7
|
+
## install
|
|
8
|
+
|
|
9
|
+
```shell
|
|
10
|
+
$ pip install d4rt
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## usage
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from torch import randn, randint
|
|
17
|
+
from d4rt.d4rt import D4RT
|
|
18
|
+
|
|
19
|
+
model = D4RT(
|
|
20
|
+
dim = 512,
|
|
21
|
+
video_image_size = 128,
|
|
22
|
+
video_patch_size = 32,
|
|
23
|
+
video_max_time_len = 10,
|
|
24
|
+
enc_depth = 6,
|
|
25
|
+
dec_depth = 6
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
videos = randn(2, 10, 3, 128, 128)
|
|
29
|
+
|
|
30
|
+
video_lens = randint(2, 10, (2,)) # handle variable-length videos; pass None to treat every video as max length
|
|
31
|
+
|
|
32
|
+
# inputs
|
|
33
|
+
|
|
34
|
+
coors = randint(0, 128, (2, 5, 2))
|
|
35
|
+
time_src = randint(0, 10, (2, 5))
|
|
36
|
+
time_tgt = randint(0, 10, (2, 5))
|
|
37
|
+
time_camera = randint(0, 10, (2, 5))
|
|
38
|
+
|
|
39
|
+
query_lens = randint(1, 5, (2,)) # handle variable-length queries
|
|
40
|
+
|
|
41
|
+
# output
|
|
42
|
+
|
|
43
|
+
points = randn(2, 5, 3)
|
|
44
|
+
|
|
45
|
+
loss = model(
|
|
46
|
+
videos,
|
|
47
|
+
video_lens = video_lens,
|
|
48
|
+
coors = coors,
|
|
49
|
+
time_src = time_src,
|
|
50
|
+
time_tgt = time_tgt,
|
|
51
|
+
time_camera = time_camera,
|
|
52
|
+
query_lens = query_lens,
|
|
53
|
+
points = points,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
loss.backward()
|
|
57
|
+
|
|
58
|
+
# without giving the output, it returns the prediction
|
|
59
|
+
|
|
60
|
+
pred = model(
|
|
61
|
+
videos,
|
|
62
|
+
coors = coors,
|
|
63
|
+
time_src = time_src,
|
|
64
|
+
time_tgt = time_tgt,
|
|
65
|
+
time_camera = time_camera
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
assert pred.shape == (2, 5, 3)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## citations
|
|
72
|
+
|
|
73
|
+
```bibtex
|
|
74
|
+
@article{zhang2025d4rt,
|
|
75
|
+
title = {Efficiently Reconstructing Dynamic Scenes One D4RT at a Time},
|
|
76
|
+
author = {Zhang, Chuhan and Le Moing, Guillaume and Koppula, Skanda and Rocco, Ignacio and Momeni, Liliane and Xie, Junyu and Sun, Shuyang and Sukthankar, Rahul and Barral, Jo{\"e}lle K. and Hadsell, Raia and Ghahramani, Zoubin and Zisserman, Andrew and Zhang, Junlin and Sajjadi, Mehdi S. M.},
|
|
77
|
+
journal = {arXiv preprint},
|
|
78
|
+
year = {2025}
|
|
79
|
+
}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
```bibtex
|
|
83
|
+
@inproceedings{liu2026geometryaware,
|
|
84
|
+
title = {Geometry-aware 4D Video Generation for Robot Manipulation},
|
|
85
|
+
author = {Zeyi Liu and Shuang Li and Eric Cousineau and Siyuan Feng and Benjamin Burchfiel and Shuran Song},
|
|
86
|
+
booktitle = {The Fourteenth International Conference on Learning Representations},
|
|
87
|
+
year = {2026},
|
|
88
|
+
url = {https://openreview.net/forum?id=18gC6pZVVc}
|
|
89
|
+
}
|
|
90
|
+
```
|
|
@@ -10,9 +10,9 @@ from x_transformers import Encoder, CrossAttender, Attention, FeedForward
|
|
|
10
10
|
# ein notation
|
|
11
11
|
|
|
12
12
|
import einx
|
|
13
|
-
from einops import rearrange
|
|
13
|
+
from einops import rearrange, repeat
|
|
14
14
|
from einops.layers.torch import Rearrange
|
|
15
|
-
from torch_einops_utils import pack_with_inverse
|
|
15
|
+
from torch_einops_utils import pack_with_inverse, lens_to_mask, maybe
|
|
16
16
|
|
|
17
17
|
# helpers
|
|
18
18
|
|
|
@@ -24,7 +24,12 @@ def divisible_by(num, den):
|
|
|
24
24
|
|
|
25
25
|
# function for the patch embedding in the query
|
|
26
26
|
|
|
27
|
-
def extract_patches(
|
|
27
|
+
def extract_patches(
|
|
28
|
+
video, # float[b t c h w]
|
|
29
|
+
coors, # int[b q 2]
|
|
30
|
+
time_src, # int[b q]
|
|
31
|
+
patch_size
|
|
32
|
+
):
|
|
28
33
|
b, q, p, device = *time_src.shape, patch_size, video.device
|
|
29
34
|
|
|
30
35
|
padded_video = F.pad(video, (p,) * 4)
|
|
@@ -112,11 +117,15 @@ class VideoEncoder(Module):
|
|
|
112
117
|
|
|
113
118
|
def forward(
|
|
114
119
|
self,
|
|
115
|
-
video
|
|
120
|
+
video, # float[b t c h w]
|
|
121
|
+
mask = None # bool[b t]
|
|
116
122
|
): # float[b n d]
|
|
117
123
|
|
|
118
124
|
tokens = self.patch_to_tokens(video) # float[b t s d]
|
|
119
125
|
|
|
126
|
+
if exists(mask):
|
|
127
|
+
mask = repeat(mask, 'b ... -> (b s) ...', s = tokens.shape[-2])
|
|
128
|
+
|
|
120
129
|
for spatial_attn, time_attn, ff in self.layers:
|
|
121
130
|
|
|
122
131
|
# space attn
|
|
@@ -133,7 +142,7 @@ class VideoEncoder(Module):
|
|
|
133
142
|
|
|
134
143
|
tokens, inverse_pack = pack_with_inverse(tokens, '* t d')
|
|
135
144
|
|
|
136
|
-
tokens = time_attn(tokens) + tokens
|
|
145
|
+
tokens = time_attn(tokens, mask = mask) + tokens
|
|
137
146
|
|
|
138
147
|
tokens = inverse_pack(tokens)
|
|
139
148
|
|
|
@@ -223,8 +232,14 @@ class D4RT(Module):
|
|
|
223
232
|
time_camera = None, # int[b q]
|
|
224
233
|
queries = None, # float[b q d]
|
|
225
234
|
points = None, # float[b q 3]
|
|
226
|
-
return_pred = False
|
|
235
|
+
return_pred = False,
|
|
236
|
+
video_lens = None, # int[b]
|
|
237
|
+
query_lens = None # int[b]
|
|
227
238
|
):
|
|
239
|
+
max_time = video.shape[1]
|
|
240
|
+
|
|
241
|
+
# embedding to queries
|
|
242
|
+
|
|
228
243
|
assert (
|
|
229
244
|
exists(queries) or
|
|
230
245
|
all([exists(p) for p in (coors, time_src, time_tgt, time_camera)])
|
|
@@ -245,18 +260,39 @@ class D4RT(Module):
|
|
|
245
260
|
|
|
246
261
|
queries = self.norm_queries(queries)
|
|
247
262
|
|
|
248
|
-
|
|
263
|
+
max_queries = queries.shape[1]
|
|
264
|
+
|
|
265
|
+
# self attention
|
|
266
|
+
|
|
267
|
+
video_mask = maybe(lens_to_mask)(video_lens, max_time)
|
|
268
|
+
|
|
269
|
+
global_spatial_repr = self.to_global_spatial_repr(video, mask = video_mask)
|
|
249
270
|
|
|
250
271
|
global_spatial_repr, inverse_pack_spacetime = pack_with_inverse(global_spatial_repr, 'b * d')
|
|
251
272
|
|
|
252
|
-
|
|
273
|
+
# cross attention
|
|
274
|
+
|
|
275
|
+
global_spatial_repr_mask = None
|
|
276
|
+
|
|
277
|
+
if exists(video_mask):
|
|
278
|
+
global_spatial_repr_mask = repeat(video_mask, 'b t -> b (t s)', s = global_spatial_repr.shape[1] // video_mask.shape[1])
|
|
279
|
+
|
|
280
|
+
queried = self.cross_attender(queries, context = global_spatial_repr, context_mask = global_spatial_repr_mask)
|
|
281
|
+
|
|
282
|
+
# prediction
|
|
253
283
|
|
|
254
284
|
pred = self.to_pred(queried)
|
|
255
285
|
|
|
256
286
|
if not exists(points):
|
|
257
287
|
return pred
|
|
258
288
|
|
|
259
|
-
|
|
289
|
+
query_mask = maybe(lens_to_mask)(query_lens, max_queries)
|
|
290
|
+
var_len_queries = exists(query_mask)
|
|
291
|
+
|
|
292
|
+
loss = F.mse_loss(pred, points, reduction = 'none' if var_len_queries else 'mean')
|
|
293
|
+
|
|
294
|
+
if var_len_queries:
|
|
295
|
+
loss = loss[query_mask].mean()
|
|
260
296
|
|
|
261
297
|
if not return_pred:
|
|
262
298
|
return loss
|
d4rt-0.0.2/README.md
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
<img src="./d4rt.png" width="400px"></img>
|
|
2
|
-
|
|
3
|
-
## d4rt (wip)
|
|
4
|
-
|
|
5
|
-
Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
|
|
6
|
-
|
|
7
|
-
## citations
|
|
8
|
-
|
|
9
|
-
```bibtex
|
|
10
|
-
@article{zhang2025d4rt,
|
|
11
|
-
title = {Efficiently Reconstructing Dynamic Scenes One D4RT at a Time},
|
|
12
|
-
author = {Zhang, Chuhan and Le Moing, Guillaume and Koppula, Skanda and Rocco, Ignacio and Momeni, Liliane and Xie, Junyu and Sun, Shuyang and Sukthankar, Rahul and Barral, Jo{\"e}lle K. and Hadsell, Raia and Ghahramani, Zoubin and Zisserman, Andrew and Zhang, Junlin and Sajjadi, Mehdi S. M.},
|
|
13
|
-
journal = {arXiv preprint},
|
|
14
|
-
year = {2025}
|
|
15
|
-
}
|
|
16
|
-
```
|
|
17
|
-
|
|
18
|
-
```bibtex
|
|
19
|
-
@inproceedings{liu2026geometryaware,
|
|
20
|
-
title = {Geometry-aware 4D Video Generation for Robot Manipulation},
|
|
21
|
-
author = {Zeyi Liu and Shuang Li and Eric Cousineau and Siyuan Feng and Benjamin Burchfiel and Shuran Song},
|
|
22
|
-
booktitle = {The Fourteenth International Conference on Learning Representations},
|
|
23
|
-
year = {2026},
|
|
24
|
-
url = {https://openreview.net/forum?id=18gC6pZVVc}
|
|
25
|
-
}
|
|
26
|
-
```
|
|
File without changes
|
|
File without changes
|
|
File without changes
|