mimic_video-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mimic-video might be problematic.
mimic_video/__init__.py
ADDED
@@ -0,0 +1,343 @@
import torch
from torch import nn
from torch.nn import Module, ModuleList, Linear

import torch.nn.functional as F

import einx
from einops import einsum, rearrange
from einops.layers.torch import Rearrange

from x_mlps_pytorch import create_mlp

from torch_einops_utils import (
    pad_left_ndim,
    align_dims_left
)

# ein notation

# b - batch
# h - heads
# g - groups
# n - sequence
# i, j - sequence (source, target)
# d - feature dimension

# functions

def exists(v):
    return v is not None

def default(v, d):
    return v if exists(v) else d

def divisible_by(num, den):
    return (num % den) == 0

# tensor function

def max_neg_value(t):
    return -torch.finfo(t.dtype).max

def l2norm(t, eps = 1e-10):
    return F.normalize(t, dim = -1, eps = eps)

# time

# they follow p0's research finding with the beta distribution
# lets stick with 0 noise to 1 data instead of the reverse

def default_sample_time_fn(time, s = 0.999):
    return torch.sqrt(s - time)

class RandomFourierEmbed(Module):
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Sequential(
            Rearrange('... -> ... 1'),
            nn.Linear(1, dim)
        )

        self.proj.requires_grad_(False)

    def forward(self, times):
        rand_proj = self.proj(times)
        return torch.cos(2 * torch.pi * rand_proj)

# adaptive rmsnorm

class AdaptiveRMSNorm(Module):
    def __init__(
        self,
        dim,
        dim_time_cond,
        eps = 1e-6
    ):
        super().__init__()
        self.scale = dim ** 0.5
        self.eps = eps

        self.to_modulation = Linear(dim_time_cond, dim * 3, bias = False)
        self.split_modulation = Rearrange('b (three d) -> three b 1 d', three = 3)

        nn.init.zeros_(self.to_modulation.weight)

    def forward(
        self,
        tokens,
        time_cond
    ):

        if time_cond.ndim == 1:
            time_cond = pad_left_ndim(time_cond, 1)

        modulations = self.to_modulation(time_cond)

        scale, shift, gate = self.split_modulation(modulations)

        normed = l2norm(tokens, self.eps) * self.scale

        adaptive_normed = normed * (scale + 1.) + shift

        return adaptive_normed, gate

# attention

class Attention(Module):
    def __init__(
        self,
        dim,
        *,
        dim_context = None,
        dim_head = 64,
        heads = 8,
        kv_heads = 2
    ):
        super().__init__()
        dim_q_inner = dim_head * heads
        dim_kv_inner = dim_head * kv_heads
        dim_context = default(dim_context, dim)

        self.scale = dim_head ** -0.5

        self.to_queries = Linear(dim, dim_q_inner, bias = False)
        self.to_keys_values = Linear(dim_context, dim_kv_inner * 2, bias = False)
        self.to_out = Linear(dim_q_inner, dim, bias = False)

        assert divisible_by(heads, kv_heads)
        groups = heads // kv_heads

        self.split_q_heads = Rearrange('b n (g h d) -> b g h n d', g = groups, d = dim_head)
        self.split_kv_heads = Rearrange('b n (h d) -> b h n d', d = dim_head)
        self.merge_heads = Rearrange('b g h n d -> b n (g h d)')

    def forward(
        self,
        tokens,
        context = None,
        context_mask = None
    ):
        context = default(context, tokens)

        queries = self.to_queries(tokens)
        keys, values = self.to_keys_values(context).chunk(2, dim = -1)

        queries = self.split_q_heads(queries)
        keys, values = tuple(self.split_kv_heads(t) for t in (keys, values))

        queries = queries * self.scale

        sim = einsum(queries, keys, 'b g h i d, b h j d -> b g h i j')

        if exists(context_mask):
            mask_value = max_neg_value(sim)
            sim = einx.where('b j, b g h i j,', context_mask, sim, mask_value)

        attn = sim.softmax(dim = -1)

        out = einsum(attn, values, 'b g h i j, b h j d -> b g h i d')

        out = self.merge_heads(out)

        return self.to_out(out)

# feedforward

class SwiGLUFeedForward(Module):
    def __init__(
        self,
        dim,
        *,
        expansion_factor = 4.,
    ):
        super().__init__()
        dim_inner = int(dim * expansion_factor * 2 / 3)

        self.proj_in = nn.Linear(dim, dim_inner * 2)
        self.proj_out = nn.Linear(dim_inner, dim)

    def forward(
        self,
        tokens
    ):
        hidden, gates = self.proj_in(tokens).chunk(2, dim = -1)

        out = hidden * F.gelu(gates)

        return self.proj_out(out)

# classes

class MimicVideo(Module):
    def __init__(
        self,
        dim,
        *,
        dim_video_hidden,
        dim_action = 20,
        depth = 8,
        dim_head = 64,
        heads = 8,
        expansion_factor = 4.,
        dim_time_cond = None,
        sample_time_fn = None
    ):
        super().__init__()

        # flow related

        self.sample_time_fn = default(sample_time_fn, default_sample_time_fn)

        # embed

        self.to_action_tokens = Linear(dim_action, dim)

        dim_time_cond = default(dim_time_cond, dim * 2)

        self.to_time_cond = nn.Sequential(
            RandomFourierEmbed(dim),
            create_mlp(dim_in = dim, dim = dim_time_cond, depth = 2, activation = nn.SiLU())
        )

        self.video_hidden_norm = nn.RMSNorm(dim_video_hidden)

        # transformer

        layers = []

        for _ in range(depth):
            attn_adanorm = AdaptiveRMSNorm(dim = dim, dim_time_cond = dim_time_cond)

            attn = Attention(dim = dim, dim_head = dim_head, heads = heads)

            cross_attn_adanorm = AdaptiveRMSNorm(dim = dim, dim_time_cond = dim_time_cond)

            cross_attn = Attention(dim = dim, dim_head = dim_head, dim_context = dim_video_hidden, heads = heads)

            ff_adanorm = AdaptiveRMSNorm(dim = dim, dim_time_cond = dim_time_cond)

            ff = SwiGLUFeedForward(dim = dim, expansion_factor = expansion_factor)

            layers.append(ModuleList([
                attn_adanorm,
                attn,
                cross_attn_adanorm,
                cross_attn,
                ff_adanorm,
                ff
            ]))

        self.layers = ModuleList(layers)

        # predictions

        self.to_pred_action_flow = nn.Sequential(
            nn.RMSNorm(dim),
            Linear(dim, dim_action)
        )

    def forward(
        self,
        actions,
        video_hiddens, # they use layer 19 of cosmos predict, at first denoising step. that's all
        *,
        time = None,
        context_mask = None,
    ):

        is_training = not exists(time)

        # handle flow time conditioning

        if is_training:
            batch, device = actions.shape[0], actions.device

            time = torch.rand((batch,), device = device)
            time = self.sample_time_fn(time)

            noise = torch.randn_like(actions)
            flow = actions - noise

            actions, left_aligned_time = align_dims_left((actions, time))

            noised = noise.lerp(actions, left_aligned_time)
        else:
            noised = actions

        time_cond = self.to_time_cond(time)

        # handle video hiddens

        video_hiddens = self.video_hidden_norm(video_hiddens)

        # embed

        tokens = self.to_action_tokens(noised)

        # transformer layers

        for (
            attn_norm,
            attn,
            cross_attn_norm,
            cross_attn,
            ff_norm,
            ff
        ) in self.layers:

            # cross attention

            residual = tokens

            tokens, gate = cross_attn_norm(tokens, time_cond)

            tokens = residual + cross_attn(tokens, context = video_hiddens, context_mask = context_mask) * gate

            # self attention

            residual = tokens

            tokens, gate = attn_norm(tokens, time_cond)

            tokens = residual + attn(tokens) * gate

            # feedforward

            residual = tokens

            tokens, gate = ff_norm(tokens, time_cond)

            tokens = residual + ff(tokens) * gate

        # prediction

        pred_flow = self.to_pred_action_flow(tokens)

        if not is_training:
            return pred_flow

        # mse flow loss

        flow_loss = F.mse_loss(pred_flow, flow)
        return flow_loss
mimic_video-0.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,78 @@
Metadata-Version: 2.4
Name: mimic-video
Version: 0.0.1
Summary: Mimic Video
Project-URL: Homepage, https://pypi.org/project/mimic-video/
Project-URL: Repository, https://github.com/lucidrains/mimic-video
Author-email: Phil Wang <lucidrains@gmail.com>
License: MIT License

Copyright (c) 2025 Phil Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
License-File: LICENSE
Keywords: artificial intelligence,attention mechanism,deep learning,video language action model
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.10
Requires-Dist: einops>=0.8.1
Requires-Dist: einx>=0.3.0
Requires-Dist: torch-einops-utils>=0.0.8
Requires-Dist: torch>=2.5
Requires-Dist: x-mlps-pytorch
Provides-Extra: examples
Provides-Extra: test
Requires-Dist: pytest; extra == 'test'
Description-Content-Type: text/markdown

<img src="./mimic-video.png" width="450px"></img>

## Mimic Video (wip)

Implementation of [Mimic-Video](https://mimic-video.github.io/), Video-Action Models for Generalizable Robot Control Beyond VLAs
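A minimal usage sketch (not from the upstream README), based on the `MimicVideo` module shipped in this wheel. It assumes the package re-exports `MimicVideo` at the top level; the dimensions below are illustrative, and `dim_video_hidden` should match the hidden size of whatever pretrained video model you take features from.

```python
import torch
from mimic_video import MimicVideo

model = MimicVideo(
    dim = 512,                 # action token dimension (illustrative)
    dim_video_hidden = 1024,   # must match the hidden size of the video model features
    dim_action = 20,
    depth = 8
)

actions = torch.randn(2, 16, 20)          # (batch, action chunk length, action dim)
video_hiddens = torch.randn(2, 64, 1024)  # hidden states extracted from a pretrained video model

# training - with no `time` given, actions are noised internally and a flow matching loss is returned
loss = model(actions, video_hiddens)
loss.backward()

# inference - pass the current noised actions and an explicit flow time to get the predicted flow
noised_actions = torch.randn(2, 16, 20)
time = torch.zeros(2)  # time 0 is pure noise in this 0 (noise) -> 1 (data) convention
pred_flow = model(noised_actions, video_hiddens, time = time)
```

Note that this 0.0.1 release only returns the flow matching loss during training, or the predicted flow when `time` is supplied; integrating that flow into sampled actions is left to the caller.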
## Contributing

First make sure `pytest` and test dependencies are installed with

```shell
$ pip install '.[test]'
```

Then add your test to `tests/test_mimic_video.py` and run

```shell
$ pytest tests
```

That's it

## Citations

```bibtex
@inproceedings{Pai2025mimicvideoVM,
    title = {mimic-video: Video-Action Models for Generalizable Robot Control Beyond VLAs},
    author = {Jonas Pai and Liam Achenbach and Victoriano Montesinos and Benedek Forrai and Oier Mees and Elvis Nava},
    year = {2025},
    url = {https://api.semanticscholar.org/CorpusID:283920528}
}
```
mimic_video-0.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
mimic_video/__init__.py,sha256=-4HP_pbT4YLhRUwNwuL4qyLHbgDyQ099nHL7eVi0_Ag,48
mimic_video/mimic_video.py,sha256=aejvjr1F3A7pZFikf-kEgeOpi1_53xVddBMpDPoxA90,8272
mimic_video-0.0.1.dist-info/METADATA,sha256=414y344JcuIKQJss7d9riTrHszIwthHW8DDSSuRntdo,2960
mimic_video-0.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
mimic_video-0.0.1.dist-info/licenses/LICENSE,sha256=1yCiA9b5nhslTavxPjsQAO-wpOnwJR9-l8LTVi7GJuk,1066
mimic_video-0.0.1.dist-info/RECORD,,
mimic_video-0.0.1.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Phil Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.