cc-vjepa2 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cc_vjepa2-0.0.3/LICENSE +21 -0
- cc_vjepa2-0.0.3/PKG-INFO +613 -0
- cc_vjepa2-0.0.3/README.md +559 -0
- cc_vjepa2-0.0.3/cc_vjepa2.egg-info/PKG-INFO +613 -0
- cc_vjepa2-0.0.3/cc_vjepa2.egg-info/SOURCES.txt +46 -0
- cc_vjepa2-0.0.3/cc_vjepa2.egg-info/dependency_links.txt +1 -0
- cc_vjepa2-0.0.3/cc_vjepa2.egg-info/requires.txt +30 -0
- cc_vjepa2-0.0.3/cc_vjepa2.egg-info/top_level.txt +1 -0
- cc_vjepa2-0.0.3/pyproject.toml +78 -0
- cc_vjepa2-0.0.3/setup.cfg +4 -0
- cc_vjepa2-0.0.3/vjepa2/__init__.py +11 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/__init__.py +0 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/data_manager.py +89 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/imagenet1k.py +150 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/dataloader.py +234 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/utils.py +21 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/functional.py +110 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/randaugment.py +536 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/randerase.py +170 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/transforms.py +1161 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/transforms_builder.py +165 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/volume_transforms.py +159 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/weighted_sampler.py +366 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/utils/worker_init_fn.py +76 -0
- cc_vjepa2-0.0.3/vjepa2/datasets/video_dataset.py +384 -0
- cc_vjepa2-0.0.3/vjepa2/hub/__init__.py +0 -0
- cc_vjepa2-0.0.3/vjepa2/hub/backbones.py +336 -0
- cc_vjepa2-0.0.3/vjepa2/masks/__init__.py +0 -0
- cc_vjepa2-0.0.3/vjepa2/masks/default.py +18 -0
- cc_vjepa2-0.0.3/vjepa2/masks/multiseq_multiblock3d.py +269 -0
- cc_vjepa2-0.0.3/vjepa2/masks/utils.py +21 -0
- cc_vjepa2-0.0.3/vjepa2/models/__init__.py +0 -0
- cc_vjepa2-0.0.3/vjepa2/models/ac_predictor.py +200 -0
- cc_vjepa2-0.0.3/vjepa2/models/attentive_pooler.py +137 -0
- cc_vjepa2-0.0.3/vjepa2/models/predictor.py +261 -0
- cc_vjepa2-0.0.3/vjepa2/models/utils/__init__.py +0 -0
- cc_vjepa2-0.0.3/vjepa2/models/utils/modules.py +897 -0
- cc_vjepa2-0.0.3/vjepa2/models/utils/patch_embed.py +52 -0
- cc_vjepa2-0.0.3/vjepa2/models/utils/pos_embs.py +93 -0
- cc_vjepa2-0.0.3/vjepa2/models/vision_transformer.py +504 -0
- cc_vjepa2-0.0.3/vjepa2/utils/__init__.py +0 -0
- cc_vjepa2-0.0.3/vjepa2/utils/checkpoint_loader.py +37 -0
- cc_vjepa2-0.0.3/vjepa2/utils/distributed.py +101 -0
- cc_vjepa2-0.0.3/vjepa2/utils/logging.py +108 -0
- cc_vjepa2-0.0.3/vjepa2/utils/monitoring.py +171 -0
- cc_vjepa2-0.0.3/vjepa2/utils/schedulers.py +112 -0
- cc_vjepa2-0.0.3/vjepa2/utils/tensors.py +53 -0
- cc_vjepa2-0.0.3/vjepa2/utils/wrappers.py +44 -0
cc_vjepa2-0.0.3/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
cc_vjepa2-0.0.3/PKG-INFO
ADDED
|
@@ -0,0 +1,613 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cc-vjepa2
|
|
3
|
+
Version: 0.0.3
|
|
4
|
+
Summary: V-JEPA 2 — Self-Supervised Video Models (community fork with MPS, ST-A², bugfixes)
|
|
5
|
+
Author: Meta AI (original)
|
|
6
|
+
Author-email: "Cagatay Cali (community maintainer)" <cagataycali@icloud.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/cagataycali/vjepa2
|
|
9
|
+
Project-URL: Repository, https://github.com/cagataycali/vjepa2
|
|
10
|
+
Project-URL: Issues, https://github.com/cagataycali/vjepa2/issues
|
|
11
|
+
Keywords: vjepa,vjepa2,video,self-supervised,world-model,robotics
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: torch>=2.0
|
|
26
|
+
Requires-Dist: torchvision
|
|
27
|
+
Requires-Dist: numpy
|
|
28
|
+
Requires-Dist: pyyaml
|
|
29
|
+
Requires-Dist: timm
|
|
30
|
+
Requires-Dist: einops
|
|
31
|
+
Provides-Extra: full
|
|
32
|
+
Requires-Dist: tensorboard; extra == "full"
|
|
33
|
+
Requires-Dist: wandb; extra == "full"
|
|
34
|
+
Requires-Dist: iopath; extra == "full"
|
|
35
|
+
Requires-Dist: opencv-python; extra == "full"
|
|
36
|
+
Requires-Dist: submitit; extra == "full"
|
|
37
|
+
Requires-Dist: braceexpand; extra == "full"
|
|
38
|
+
Requires-Dist: webdataset; extra == "full"
|
|
39
|
+
Requires-Dist: transformers; extra == "full"
|
|
40
|
+
Requires-Dist: peft; extra == "full"
|
|
41
|
+
Requires-Dist: decord2; extra == "full"
|
|
42
|
+
Requires-Dist: pandas; extra == "full"
|
|
43
|
+
Requires-Dist: beartype; extra == "full"
|
|
44
|
+
Requires-Dist: psutil; extra == "full"
|
|
45
|
+
Requires-Dist: h5py; extra == "full"
|
|
46
|
+
Requires-Dist: fire; extra == "full"
|
|
47
|
+
Requires-Dist: python-box; extra == "full"
|
|
48
|
+
Requires-Dist: scikit-image; extra == "full"
|
|
49
|
+
Requires-Dist: ftfy; extra == "full"
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
52
|
+
Requires-Dist: ruff; extra == "dev"
|
|
53
|
+
Dynamic: license-file
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
## 🚀 Install (with community patches: MPS, ST-A², bugfixes)
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install git+https://github.com/cagataycali/vjepa2.git
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
🆕 **[2026-03-16]:** :fire: V-JEPA 2.1 is released :fire: A new family of models trained with a novel recipe that learns high-quality and temporally consistent dense features!!!
|
|
65
|
+
|
|
66
|
+
**[2025-06-25]:** V-JEPA 2 is released. [[`Blog`](https://ai.meta.com/blog/v-jepa-2-world-model-benchmarks)]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# V-JEPA 2: Self-Supervised Video Models Enable Understanding, Prediction and Planning
|
|
70
|
+
|
|
71
|
+
### [Meta FAIR](https://ai.meta.com/research/)
|
|
72
|
+
|
|
73
|
+
Mahmoud Assran∗, Adrien Bardes∗, David Fan∗, Quentin Garrido∗, Russell Howes∗, Mojtaba
|
|
74
|
+
Komeili∗, Matthew Muckley∗, Ammar Rizvi∗, Claire Roberts∗, Koustuv Sinha∗, Artem Zholus*,
|
|
75
|
+
Sergio Arnaud*, Abha Gejji*, Ada Martin*, Francois Robert Hogan*, Daniel Dugas*, Piotr
|
|
76
|
+
Bojanowski, Vasil Khalidov, Patrick Labatut, Francisco Massa, Marc Szafraniec, Kapil
|
|
77
|
+
Krishnakumar, Yong Li, Xiaodong Ma, Sarath Chandar, Franziska Meier*, Yann LeCun*, Michael
|
|
78
|
+
Rabbat*, Nicolas Ballas*
|
|
79
|
+
|
|
80
|
+
*Core Team
|
|
81
|
+
|
|
82
|
+
[[`Paper`](https://arxiv.org/abs/2506.09985)] [[`Blog`](https://ai.meta.com/blog/v-jepa-2-world-model-benchmarks)] [[`BibTex`](#Citation)]
|
|
83
|
+
|
|
84
|
+
Official Pytorch codebase for V-JEPA 2, V-JEPA 2-AC, V-JEPA 2.1.
|
|
85
|
+
|
|
86
|
+
V-JEPA 2 is a self-supervised approach to training video encoders, using internet-scale video data, that attains state-of-the-art performance on motion understanding and human action anticipation tasks. V-JEPA 2-AC is a latent action-conditioned world model post-trained from V-JEPA 2 (using a small amount of robot trajectory interaction data) that solves robot manipulation tasks without environment-specific data collection or task-specific training or calibration.
|
|
87
|
+
|
|
88
|
+
<p align="center">
|
|
89
|
+
<img src="assets/flowchart.png" width=100%>
|
|
90
|
+
</p>
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
## V-JEPA 2.1 Pre-training
|
|
95
|
+
|
|
96
|
+
Lorenzo Mur-Labadia, Matthew Muckley, Amir Bar, Mahmoud Assran, Koustuv Sinha, Michael
|
|
97
|
+
Rabbat, Yann LeCun, Nicolas Ballas, Adrien Bardes
|
|
98
|
+
|
|
99
|
+
[[`Paper`](https://arxiv.org/abs/2603.14482)] [[`BibTex`](#Citation)]
|
|
100
|
+
|
|
101
|
+
V-JEPA 2.1 improves the training recipe to focus on learning high-quality and temporally consistent dense features, as highlighted by PCA visualizations:
|
|
102
|
+
|
|
103
|
+
<p align="center">
|
|
104
|
+
<img src="assets/teaser_screenshot_5dice.png" width=100%>
|
|
105
|
+
</p>
|
|
106
|
+
|
|
107
|
+
The V-JEPA 2.1 approach leverages: (1) **Dense Predictive Loss**, a masking-based
|
|
108
|
+
self-supervision objective where all tokens (both visible/context and masked tokens) contribute to the
|
|
109
|
+
self-supervised training loss; (2) **Deep Self-Supervision**, which applies the self-supervised loss at multiple
|
|
110
|
+
intermediate representations of the encoder models; (3) **Multi-Modal Tokenizers** for images and videos;
|
|
111
|
+
and we show that our approach benefits from (4) **Model and data scaling**.
|
|
112
|
+
|
|
113
|
+
<p align="center">
|
|
114
|
+
<img src="assets/architecture_vjepa2_1.jpg" width=100%>
|
|
115
|
+
</p>
|
|
116
|
+
|
|
117
|
+
V-JEPA 2.1 performance across dense and global prediction tasks:
|
|
118
|
+
|
|
119
|
+
<p align="center">
|
|
120
|
+
<img src="assets/bars_teaser_tikz-1.png" width=100%>
|
|
121
|
+
</p>
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
## V-JEPA 2 Pre-training
|
|
125
|
+
|
|
126
|
+
**(Top)** The encoder and predictor are pre-trained through self-supervised learning from video using a masked latent feature prediction objective, leveraging abundant natural videos to bootstrap physical world understanding and prediction. **(Bottom)** Performance of V-JEPA 2 on downstream understanding and prediction tasks.
|
|
127
|
+
|
|
128
|
+
<img align="left" src="https://github.com/user-attachments/assets/914942d8-6a1e-409d-86ff-ff856b7346ab" width=65%>
|
|
129
|
+
<table>
|
|
130
|
+
<tr>
|
|
131
|
+
<th colspan="1">Benchmark</th>
|
|
132
|
+
<th colspan="1">V-JEPA 2</th>
|
|
133
|
+
<th colspan="1">Previous Best</th>
|
|
134
|
+
</tr>
|
|
135
|
+
<tr>
|
|
136
|
+
<td>EK100</td>
|
|
137
|
+
<td>39.7%</td>
|
|
138
|
+
<td>27.6% (PlausiVL)</td>
|
|
139
|
+
</tr>
|
|
140
|
+
<tr>
|
|
141
|
+
<td>SSv2 (Probe)</td>
|
|
142
|
+
<td>77.3%</td>
|
|
143
|
+
<td>69.7% (InternVideo2-1B)</td>
|
|
144
|
+
</tr>
|
|
145
|
+
<tr>
|
|
146
|
+
<td>Diving48 (Probe)</td>
|
|
147
|
+
<td>90.2%</td>
|
|
148
|
+
<td>86.4% (InternVideo2-1B)</td>
|
|
149
|
+
</tr>
|
|
150
|
+
<tr>
|
|
151
|
+
<td>MVP (Video QA)</td>
|
|
152
|
+
<td>44.5%</td>
|
|
153
|
+
<td>39.9% (InternVL-2.5)</td>
|
|
154
|
+
</tr>
|
|
155
|
+
<tr>
|
|
156
|
+
<td>TempCompass (Video QA)</td>
|
|
157
|
+
<td>76.9%</td>
|
|
158
|
+
<td>75.3% (Tarsier 2)</td>
|
|
159
|
+
</tr>
|
|
160
|
+
</table>
|
|
161
|
+
|
|
162
|
+
## V-JEPA 2-AC Post-training
|
|
163
|
+
|
|
164
|
+
**(Top)** After post-training with a small amount of robot data, we can deploy the model on a robot arm in new environments, and tackle foundational tasks like reaching, grasping, and pick-and-place by planning from image goals. **(Bottom)** Performance on robot manipulation tasks using a Franka arm, with input provided through a monocular RGB camera.
|
|
165
|
+
|
|
166
|
+
<img align="left" src="https://github.com/user-attachments/assets/c5d42221-0102-4216-911d-061a4369a805" width=65%>
|
|
167
|
+
<table>
|
|
168
|
+
<tr>
|
|
169
|
+
<th colspan="1"></th>
|
|
170
|
+
<th colspan="1"></th>
|
|
171
|
+
<th colspan="2">Grasp</th>
|
|
172
|
+
<th colspan="2">Pick-and-Place</th>
|
|
173
|
+
</tr>
|
|
174
|
+
<tr>
|
|
175
|
+
<th colspan="1">Method</th>
|
|
176
|
+
<th colspan="1">Reach</th>
|
|
177
|
+
<th colspan="1">Cup</th>
|
|
178
|
+
<th colspan="1">Box</th>
|
|
179
|
+
<th colspan="1">Cup</th>
|
|
180
|
+
<th colspan="1">Box</th>
|
|
181
|
+
</tr>
|
|
182
|
+
<tr>
|
|
183
|
+
<td>Octo</td>
|
|
184
|
+
<td>100%</td>
|
|
185
|
+
<td>10%</td>
|
|
186
|
+
<td>0%</td>
|
|
187
|
+
<td>10%</td>
|
|
188
|
+
<td>10%</td>
|
|
189
|
+
</tr>
|
|
190
|
+
<tr>
|
|
191
|
+
<td>Cosmos</td>
|
|
192
|
+
<td>80%</td>
|
|
193
|
+
<td>0%</td>
|
|
194
|
+
<td>20%</td>
|
|
195
|
+
<td>0%</td>
|
|
196
|
+
<td>0%</td>
|
|
197
|
+
</tr>
|
|
198
|
+
<tr>
|
|
199
|
+
<td>VJEPA 2-AC</td>
|
|
200
|
+
<td>100%</td>
|
|
201
|
+
<td>60%</td>
|
|
202
|
+
<td>20%</td>
|
|
203
|
+
<td>80%</td>
|
|
204
|
+
<td>50%</td>
|
|
205
|
+
</tr>
|
|
206
|
+
</table>
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
## Models
|
|
213
|
+
|
|
214
|
+
### V-JEPA 2 and V-JEPA 2.1
|
|
215
|
+
|
|
216
|
+
#### HuggingFace
|
|
217
|
+
|
|
218
|
+
See our HuggingFace [collection](https://huggingface.co/collections/facebook/v-jepa-2-6841bad8413014e185b497a6) for V-JEPA 2.
|
|
219
|
+
|
|
220
|
+
#### V-JEPA 2 Pretrained Checkpoints
|
|
221
|
+
|
|
222
|
+
<table>
|
|
223
|
+
<tr>
|
|
224
|
+
<th colspan="1">Model</th>
|
|
225
|
+
<th colspan="1">#Parameters</th>
|
|
226
|
+
<th colspan="1">Resolution</th>
|
|
227
|
+
<th colspan="1">Download Link</th>
|
|
228
|
+
<th colspan="1">Pretraining Config</th>
|
|
229
|
+
</tr>
|
|
230
|
+
<tr>
|
|
231
|
+
<td>ViT-L/16</td>
|
|
232
|
+
<td>300M</td>
|
|
233
|
+
<td>256</td>
|
|
234
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vitl.pt">checkpoint</a></td>
|
|
235
|
+
<td><a href="configs/train/vitl16">configs</a></td>
|
|
236
|
+
</tr>
|
|
237
|
+
<tr>
|
|
238
|
+
<td>ViT-H/16</td>
|
|
239
|
+
<td>600M</td>
|
|
240
|
+
<td>256</td>
|
|
241
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vith.pt">checkpoint</a></td>
|
|
242
|
+
<td><a href="configs/train/vith16/">configs</a></td>
|
|
243
|
+
</tr>
|
|
244
|
+
<tr>
|
|
245
|
+
<td>ViT-g/16</td>
|
|
246
|
+
<td>1B</td>
|
|
247
|
+
<td>256</td>
|
|
248
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vitg.pt">checkpoint</a></td>
|
|
249
|
+
<td><a href="configs/train/vitg16">configs</a></td>
|
|
250
|
+
</tr>
|
|
251
|
+
<tr>
|
|
252
|
+
<td>ViT-g/16<sub>384</sub></td>
|
|
253
|
+
<td>1B</td>
|
|
254
|
+
<td>384</td>
|
|
255
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vitg-384.pt">checkpoint</a></td>
|
|
256
|
+
<td><a href="configs/train/vitg16">configs</a></td>
|
|
257
|
+
</tr>
|
|
258
|
+
</table>
|
|
259
|
+
|
|
260
|
+
#### V-JEPA 2.1 Pretrained Checkpoints
|
|
261
|
+
|
|
262
|
+
<table>
|
|
263
|
+
<tr>
|
|
264
|
+
<th colspan="1">Model</th>
|
|
265
|
+
<th colspan="1">#Parameters</th>
|
|
266
|
+
<th colspan="1">Resolution</th>
|
|
267
|
+
<th colspan="1">Download Link</th>
|
|
268
|
+
<th colspan="1">Pretraining Config</th>
|
|
269
|
+
</tr>
|
|
270
|
+
|
|
271
|
+
<tr>
|
|
272
|
+
<td>ViT-B/16</td>
|
|
273
|
+
<td>80M</td>
|
|
274
|
+
<td>384</td>
|
|
275
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2_1_vitb_dist_vitG_384.pt">checkpoint</a></td>
|
|
276
|
+
<td><a href="configs/train_2_1/vitb16">configs</a></td>
|
|
277
|
+
</tr>
|
|
278
|
+
|
|
279
|
+
<tr>
|
|
280
|
+
<td>ViT-L/16</td>
|
|
281
|
+
<td>300M</td>
|
|
282
|
+
<td>384</td>
|
|
283
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2_1_vitl_dist_vitG_384.pt">checkpoint</a></td>
|
|
284
|
+
<td><a href="configs/train_2_1/vitl16">configs</a></td>
|
|
285
|
+
</tr>
|
|
286
|
+
|
|
287
|
+
<tr>
|
|
288
|
+
<td>ViT-g/16</td>
|
|
289
|
+
<td>1B</td>
|
|
290
|
+
<td>384</td>
|
|
291
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2_1_vitg_384.pt">checkpoint</a></td>
|
|
292
|
+
<td><a href="configs/train_2_1/vitg16">configs</a></td>
|
|
293
|
+
</tr>
|
|
294
|
+
|
|
295
|
+
<tr>
|
|
296
|
+
<td>ViT-G/16</td>
|
|
297
|
+
<td>2B</td>
|
|
298
|
+
<td>384</td>
|
|
299
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2_1_vitG_384.pt">checkpoint</a></td>
|
|
300
|
+
<td><a href="configs/train_2_1/vitG16">configs</a></td>
|
|
301
|
+
</tr>
|
|
302
|
+
</table>
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
#### Pretrained backbones (via PyTorch Hub)
|
|
306
|
+
|
|
307
|
+
Please install [Pytorch](https://pytorch.org/get-started/locally/), [timm](https://pypi.org/project/timm/) and [einops](https://pypi.org/project/einops/) locally, then run the following to load each model. Installing Pytorch with CUDA support is strongly recommended.
|
|
308
|
+
|
|
309
|
+
```python
|
|
310
|
+
import torch
|
|
311
|
+
|
|
312
|
+
# preprocessor
|
|
313
|
+
processor = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_preprocessor')
|
|
314
|
+
# models
|
|
315
|
+
# V-JEPA 2
|
|
316
|
+
vjepa2_vit_large = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_large')
|
|
317
|
+
vjepa2_vit_huge = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_huge')
|
|
318
|
+
vjepa2_vit_giant = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant')
|
|
319
|
+
vjepa2_vit_giant_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant_384')
|
|
320
|
+
# V-JEPA 2.1
|
|
321
|
+
vjepa2_1_vit_base_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_1_vit_base_384')
|
|
322
|
+
vjepa2_1_vit_large_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_1_vit_large_384')
|
|
323
|
+
vjepa2_1_vit_giant_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_1_vit_giant_384')
|
|
324
|
+
vjepa2_1_vit_gigantic_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_1_vit_gigantic_384')
|
|
325
|
+
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
#### Pretrained checkpoints on Huggingface
|
|
329
|
+
|
|
330
|
+
You can also use our pretrained checkpoints on [Huggingface for V-JEPA 2](https://huggingface.co/collections/facebook/v-jepa-2-6841bad8413014e185b497a6).
|
|
331
|
+
|
|
332
|
+
```python
|
|
333
|
+
from transformers import AutoVideoProcessor, AutoModel
|
|
334
|
+
|
|
335
|
+
hf_repo = "facebook/vjepa2-vitg-fpc64-256"
|
|
336
|
+
# facebook/vjepa2-vitl-fpc64-256
|
|
337
|
+
# facebook/vjepa2-vith-fpc64-256
|
|
338
|
+
# facebook/vjepa2-vitg-fpc64-256
|
|
339
|
+
# facebook/vjepa2-vitg-fpc64-384
|
|
340
|
+
|
|
341
|
+
model = AutoModel.from_pretrained(hf_repo)
|
|
342
|
+
processor = AutoVideoProcessor.from_pretrained(hf_repo)
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
#### Evaluation Attentive Probes
|
|
346
|
+
|
|
347
|
+
We share the trained attentive probes for two of our visual understanding evals (Something-Something v2 and Diving48) and the action anticipation eval EPIC-KITCHENS-100.
|
|
348
|
+
|
|
349
|
+
<table>
|
|
350
|
+
<tr>
|
|
351
|
+
<th colspan="1">Model</th>
|
|
352
|
+
<th colspan="4">SSv2</th>
|
|
353
|
+
<th colspan="4">Diving48</th>
|
|
354
|
+
<th colspan="4">EK100</th>
|
|
355
|
+
</tr>
|
|
356
|
+
<tr>
|
|
357
|
+
<th colspan="1"></th>
|
|
358
|
+
<th colspan="1">Checkpoint</th>
|
|
359
|
+
<th colspan="1">Training Config</th>
|
|
360
|
+
<th colspan="1">Inference Config</th>
|
|
361
|
+
<th colspan="1">Result</th>
|
|
362
|
+
<th colspan="1">Checkpoint</th>
|
|
363
|
+
<th colspan="1">Training Config</th>
|
|
364
|
+
<th colspan="1">Inference Config</th>
|
|
365
|
+
<th colspan="1">Result</th>
|
|
366
|
+
<th colspan="1">Checkpoint</th>
|
|
367
|
+
<th colspan="1">Training Config</th>
|
|
368
|
+
<th colspan="1">Inference Config</th>
|
|
369
|
+
<th colspan="1">Result</th>
|
|
370
|
+
</tr>
|
|
371
|
+
<tr>
|
|
372
|
+
<td>ViT-L/16</td>
|
|
373
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitl-16x2x3.pt">checkpoint</a></td>
|
|
374
|
+
<td><a href="configs/eval/vitl/ssv2.yaml">config</a></td>
|
|
375
|
+
<td><a href="configs/inference/vitl/ssv2.yaml">config</a></td>
|
|
376
|
+
<td>73.7%</td>
|
|
377
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/diving48-vitl-256.pt">checkpoint</a></td>
|
|
378
|
+
<td><a href="configs/eval/vitl/diving48.yaml">config</a></td>
|
|
379
|
+
<td><a href="configs/inference/vitl/diving48.yaml">config</a></td>
|
|
380
|
+
<td>89.0%</td>
|
|
381
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/ek100-vitl-256.pt">checkpoint</a></td>
|
|
382
|
+
<td><a href="configs/eval/vitl/ek100.yaml">config</a></td>
|
|
383
|
+
<td><a href="configs/inference/vitl/ek100.yaml">config</a></td>
|
|
384
|
+
<td>32.7 R@5</td>
|
|
385
|
+
</tr>
|
|
386
|
+
<tr>
|
|
387
|
+
<td>ViT-g/16<sub>384</sub></td>
|
|
388
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitg-384-64x2x3.pt">checkpoint</a></td>
|
|
389
|
+
<td><a href="configs/eval/vitg-384/ssv2.yaml">config</a></td>
|
|
390
|
+
<td><a href="configs/inference/vitg-384/ssv2.yaml">config</a></td>
|
|
391
|
+
<td>77.3%</td>
|
|
392
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/diving48-vitg-384-32x4x3.pt">checkpoint</a></td>
|
|
393
|
+
<td><a href="configs/eval/vitg-384/diving48.yaml">config</a></td>
|
|
394
|
+
<td><a href="configs/inference/vitg-384/diving48.yaml">config</a></td>
|
|
395
|
+
<td>90.2%</td>
|
|
396
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/ek100-vitg-384.pt">checkpoint</a></td>
|
|
397
|
+
<td><a href="configs/eval/vitg-384/ek100.yaml">config</a></td>
|
|
398
|
+
<td><a href="configs/inference/vitg-384/ek100.yaml">config</a></td>
|
|
399
|
+
<td>39.7 R@5</td>
|
|
400
|
+
</tr>
|
|
401
|
+
</table>
|
|
402
|
+
|
|
403
|
+
### V-JEPA 2-AC
|
|
404
|
+
|
|
405
|
+
Our action-conditioned checkpoint was trained from the ViT-g encoder.
|
|
406
|
+
<table>
|
|
407
|
+
<tr>
|
|
408
|
+
<th colspan="1">Model</th>
|
|
409
|
+
<th colspan="1">Download Link</th>
|
|
410
|
+
<th colspan="1">Training Config</th>
|
|
411
|
+
</tr>
|
|
412
|
+
<tr>
|
|
413
|
+
<td>ViT-g/16</td>
|
|
414
|
+
<td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2-ac-vitg.pt">checkpoint</a></td>
|
|
415
|
+
<td><a href="configs/train/vitg16/droid-256px-8f.yaml">config</a></td>
|
|
416
|
+
</tr>
|
|
417
|
+
</table>
|
|
418
|
+
|
|
419
|
+
#### Pretrained action-conditioned backbone (via PyTorch Hub)
|
|
420
|
+
|
|
421
|
+
Please install [Pytorch](https://pytorch.org/get-started/locally/), [timm](https://pypi.org/project/timm/) and [einops](https://pypi.org/project/einops/) locally, then run the following to load each model. Installing Pytorch with CUDA support is strongly recommended.
|
|
422
|
+
|
|
423
|
+
```python
|
|
424
|
+
import torch
|
|
425
|
+
|
|
426
|
+
vjepa2_encoder, vjepa2_ac_predictor = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_ac_vit_giant')
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
See [energy_landscape_example.ipynb](notebooks/energy_landscape_example.ipynb) for an example notebook computing the energy landscape of the pretrained action-conditioned backbone using a robot trajectory collected from our lab.
|
|
431
|
+
To run this notebook, you'll need to additionally install [Jupyter](https://jupyter.org/install) and [Scipy](https://scipy.org/install/) in your conda environment.
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
## Getting Started
|
|
436
|
+
|
|
437
|
+
### Setup
|
|
438
|
+
|
|
439
|
+
```
|
|
440
|
+
conda create -n vjepa2-312 python=3.12
|
|
441
|
+
conda activate vjepa2-312
|
|
442
|
+
pip install . # or `pip install -e .` for development mode
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
**Note to macOS users:** V-JEPA 2 relies on [`decord`](https://github.com/dmlc/decord), which does not support macOS (and, unfortunately, is also no longer under development). In order to run the V-JEPA 2 code on macOS, you will need a different `decord` implementation. We do not make specific recommendations, although some users have reported the use of [`eva-decord`](https://github.com/georgia-tech-db/eva-decord) (see [PR 1](https://github.com/facebookresearch/vjepa2/pull/1)) or [`decord2`](https://github.com/johnnynunez/decord2) (see [PR 31](https://github.com/facebookresearch/vjepa2/pull/31)). We leave the selection of the `decord` package up to the user's discretion.
|
|
446
|
+
|
|
447
|
+
### Usage Demo
|
|
448
|
+
|
|
449
|
+
See [vjepa2_demo.ipynb](notebooks/vjepa2_demo.ipynb) [(Colab Link)](https://colab.research.google.com/github/facebookresearch/vjepa2/blob/main/notebooks/vjepa2_demo.ipynb) or [vjepa2_demo.py](notebooks/vjepa2_demo.py) for an example of how to load both the HuggingFace and PyTorch V-JEPA 2 models and run inference on a sample video to get a sample classification result.
|
|
450
|
+
|
|
451
|
+
The script assumes the presence of downloaded model checkpoints so you will need to download the model weights and update the corresponding paths in the script. E.g.:
|
|
452
|
+
```
|
|
453
|
+
wget https://dl.fbaipublicfiles.com/vjepa2/vitg-384.pt -P YOUR_DIR
|
|
454
|
+
wget https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitg-384-64x2x3.pt -P YOUR_DIR
|
|
455
|
+
|
|
456
|
+
# Then update your model paths in vjepa2_demo.py.
|
|
457
|
+
pt_model_path = YOUR_DIR/vitg-384.pt
|
|
458
|
+
classifier_model_path = YOUR_DIR/ssv2-vitg-384-64x2x3.pt
|
|
459
|
+
|
|
460
|
+
# Then run the script (assumes your machine has a GPU)
|
|
461
|
+
python -m notebooks.vjepa2_demo
|
|
462
|
+
```
|
|
463
|
+
|
|
464
|
+
### Probe-based evaluation
|
|
465
|
+
|
|
466
|
+
Probe-based evaluation consists in training an attentive probe on top of frozen V-JEPA 2 features. We provide training scripts for training your own probes, and checkpoints to run inference directly.
|
|
467
|
+
|
|
468
|
+
#### Training probes
|
|
469
|
+
|
|
470
|
+
Evaluations can be run either locally, or distributed via SLURM. (Running locally is useful for debugging and validation).
|
|
471
|
+
These sample commands launch Something-Something v2 video classification; other evals are launched by specifying the corresponding config.
|
|
472
|
+
Use the provided training configs under "Evaluation Attentive Probes". These configs allow training multiple probes in parallel with various optimization parameters.
|
|
473
|
+
Change filepaths as needed (e.g. `folder`, `checkpoint`, `dataset_train`, `dataset_val`) to match locations of data and downloaded checkpoints on your local filesystem.
|
|
474
|
+
Change \# nodes and local batch size as needed to not exceed available GPU memory.
|
|
475
|
+
|
|
476
|
+
##### Local
|
|
477
|
+
|
|
478
|
+
To run locally, specify the GPUs to use with the `--devices` flag:
|
|
479
|
+
```
|
|
480
|
+
python -m evals.main --fname configs/eval/vitl/ssv2.yaml \
|
|
481
|
+
--devices cuda:0 cuda:1
|
|
482
|
+
```
|
|
483
|
+
|
|
484
|
+
##### Distributed
|
|
485
|
+
|
|
486
|
+
```
|
|
487
|
+
python -m evals.main_distributed \
|
|
488
|
+
--fname configs/eval/vitl/ssv2.yaml \
|
|
489
|
+
--time 8600 \
|
|
490
|
+
--account my_account --qos=my_qos
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
#### Inference from existing probes
|
|
494
|
+
|
|
495
|
+
Use provided inference configs under [Evaluation Attentive Probes](#evaluation-attentive-probes).
|
|
496
|
+
Download the corresponding checkpoint, rename it to 'latest.pt', and create a folder with the checkpoint inside, with the format matching the variables in the config:
|
|
497
|
+
```
|
|
498
|
+
[folder]/[eval_name]/[tag]/latest.pt
|
|
499
|
+
```
|
|
500
|
+
Then run inference, locally or distributed, using the same evaluation commands as above, but with configs from `configs/inference`.
|
|
501
|
+
|
|
502
|
+
### Pretraining
|
|
503
|
+
|
|
504
|
+
Likewise, training can also be run locally or distributed. Pretraining and cooldown training phases are
|
|
505
|
+
run with the same command using different configs.
|
|
506
|
+
These sample commands launch initial training of a ViT-L model. Configs for cooldown (or action-conditioned) training
|
|
507
|
+
can be found in the same directory as the config for initial training.
|
|
508
|
+
|
|
509
|
+
#### Local
|
|
510
|
+
|
|
511
|
+
```
|
|
512
|
+
python -m app.main --fname configs/train/vitl16/pretrain-256px-16f.yaml \
|
|
513
|
+
--devices cuda:0
|
|
514
|
+
```
|
|
515
|
+
|
|
516
|
+
#### Distributed
|
|
517
|
+
|
|
518
|
+
```
|
|
519
|
+
python -m app.main_distributed \
|
|
520
|
+
--fname configs/train/vitl16/pretrain-256px-16f.yaml \
|
|
521
|
+
--time 6000 \
|
|
522
|
+
--account my_account --qos=my_qos
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
### Post-training
|
|
526
|
+
|
|
527
|
+
Post-training of the action-conditioned model, starting from the pretrained VJEPA 2 backbone, also follows a similar interface, and can be run locally or distributed using [this config](configs/train/vitg16/droid-256px-8f.yaml).
|
|
528
|
+
We post-train the model starting from the ViT-g/16 backbone.
|
|
529
|
+
|
|
530
|
+
#### Local
|
|
531
|
+
|
|
532
|
+
```
|
|
533
|
+
python -m app.main --fname configs/train/vitg16/droid-256px-8f.yaml \
|
|
534
|
+
--devices cuda:0
|
|
535
|
+
```
|
|
536
|
+
|
|
537
|
+
#### Distributed
|
|
538
|
+
|
|
539
|
+
```
|
|
540
|
+
python -m app.main_distributed \
|
|
541
|
+
--fname configs/train/vitg16/droid-256px-8f.yaml \
|
|
542
|
+
--time 6000 \
|
|
543
|
+
--account my_account --qos=my_qos
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
## Code Structure
|
|
548
|
+
|
|
549
|
+
```
|
|
550
|
+
.
|
|
551
|
+
├── app # training loops
|
|
552
|
+
│ ├── vjepa # V-JEPA 2 pre-training
|
|
553
|
+
│ ├── vjepa_2_1 # V-JEPA 2.1 pre-training
|
|
554
|
+
│ ├── vjepa_droid # training the action-conditioned model
|
|
555
|
+
│ ├── main_distributed.py # entrypoint for launching the app on a SLURM cluster
|
|
556
|
+
│ └── main.py # entrypoint for launching the app locally on your machine
|
|
557
|
+
├── configs # config files with experiment params for training and evaluation
|
|
558
|
+
│ ├── train # pretraining with V-JEPA 2 (phase 1), cooldown (phase 2), and action-conditioned training
|
|
559
|
+
│ ├── train_2_1 # pretraining with V-JEPA 2.1 (phase 1), cooldown (phase 2)
|
|
560
|
+
│ └── eval # frozen evaluations
|
|
561
|
+
│ └── inference # inference only frozen evaluations
|
|
562
|
+
├── evals # evaluation loops training an attentive probe with frozen backbone...
|
|
563
|
+
│ ├── action_anticipation_frozen # action anticipation
|
|
564
|
+
│ ├── image_classification_frozen # image understanding
|
|
565
|
+
│ ├── video_classification_frozen # video understanding
|
|
566
|
+
│ ├── main_distributed.py # entrypoint for distributed evaluations
|
|
567
|
+
│ └── main.py # entrypoint for locally-run evaluations
|
|
568
|
+
├── src # the package
|
|
569
|
+
│ ├── datasets # datasets, data loaders, ...
|
|
570
|
+
│ ├── models # model definitions
|
|
571
|
+
│ ├── masks # mask collators, masking utilities, ...
|
|
572
|
+
│ └── utils # shared utilities
|
|
573
|
+
├── tests # unit tests for some modules in `src`
|
|
574
|
+
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
## License
|
|
578
|
+
|
|
579
|
+
The majority of V-JEPA 2 is licensed under MIT, however portions of the project are available under separate license terms:
|
|
580
|
+
|
|
581
|
+
[src/datasets/utils/video/randaugment.py](src/datasets/utils/video/randaugment.py)<br>
|
|
582
|
+
[src/datasets/utils/video/randerase.py](src/datasets/utils/video/randerase.py)<br>
|
|
583
|
+
[src/datasets/utils/worker_init_fn.py](src/datasets/utils/worker_init_fn.py)<br>
|
|
584
|
+
|
|
585
|
+
are licensed under the Apache 2.0 license.
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
## Citation
|
|
589
|
+
If you find this repository useful in your research, please consider giving it a star :star: and citing the papers:
|
|
590
|
+
|
|
591
|
+
```bibtex
|
|
592
|
+
@article{assran2025vjepa2,
|
|
593
|
+
title={V-JEPA~2: Self-Supervised Video Models Enable Understanding, Prediction and Planning},
|
|
594
|
+
author={Assran, Mahmoud and Bardes, Adrien and Fan, David and Garrido, Quentin and Howes, Russell and
|
|
595
|
+
Komeili, Mojtaba and Muckley, Matthew and Rizvi, Ammar and Roberts, Claire and Sinha, Koustuv and Zholus, Artem and
|
|
596
|
+
Arnaud, Sergio and Gejji, Abha and Martin, Ada and Robert Hogan, Francois and Dugas, Daniel and
|
|
597
|
+
Bojanowski, Piotr and Khalidov, Vasil and Labatut, Patrick and Massa, Francisco and Szafraniec, Marc and
|
|
598
|
+
Krishnakumar, Kapil and Li, Yong and Ma, Xiaodong and Chandar, Sarath and Meier, Franziska and LeCun, Yann and
|
|
599
|
+
Rabbat, Michael and Ballas, Nicolas},
|
|
600
|
+
journal={arXiv preprint arXiv:2506.09985},
|
|
601
|
+
year={2025}
|
|
602
|
+
}
|
|
603
|
+
```
|
|
604
|
+
|
|
605
|
+
```bibtex
|
|
606
|
+
@article{murlabadia2026vjepa2_1,
|
|
607
|
+
title={V-JEPA 2.1: Unlocking Dense Features in Video Self-Supervised Learning},
|
|
608
|
+
author={Mur-Labadia, Lorenzo and Muckley, Matthew and Bar, Amir and Assran, Mahmoud and
|
|
609
|
+
Sinha, Koustuv and Rabbat, Michael and LeCun, Yann and Ballas, Nicolas and Bardes, Adrien},
|
|
610
|
+
journal={arXiv preprint arXiv:2603.14482},
|
|
611
|
+
year={2026}
|
|
612
|
+
}
|
|
613
|
+
```
|