cc-vjepa2 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. cc_vjepa2-0.0.3/LICENSE +21 -0
  2. cc_vjepa2-0.0.3/PKG-INFO +613 -0
  3. cc_vjepa2-0.0.3/README.md +559 -0
  4. cc_vjepa2-0.0.3/cc_vjepa2.egg-info/PKG-INFO +613 -0
  5. cc_vjepa2-0.0.3/cc_vjepa2.egg-info/SOURCES.txt +46 -0
  6. cc_vjepa2-0.0.3/cc_vjepa2.egg-info/dependency_links.txt +1 -0
  7. cc_vjepa2-0.0.3/cc_vjepa2.egg-info/requires.txt +30 -0
  8. cc_vjepa2-0.0.3/cc_vjepa2.egg-info/top_level.txt +1 -0
  9. cc_vjepa2-0.0.3/pyproject.toml +78 -0
  10. cc_vjepa2-0.0.3/setup.cfg +4 -0
  11. cc_vjepa2-0.0.3/vjepa2/__init__.py +11 -0
  12. cc_vjepa2-0.0.3/vjepa2/datasets/__init__.py +0 -0
  13. cc_vjepa2-0.0.3/vjepa2/datasets/data_manager.py +89 -0
  14. cc_vjepa2-0.0.3/vjepa2/datasets/imagenet1k.py +150 -0
  15. cc_vjepa2-0.0.3/vjepa2/datasets/utils/dataloader.py +234 -0
  16. cc_vjepa2-0.0.3/vjepa2/datasets/utils/utils.py +21 -0
  17. cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/functional.py +110 -0
  18. cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/randaugment.py +536 -0
  19. cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/randerase.py +170 -0
  20. cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/transforms.py +1161 -0
  21. cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/transforms_builder.py +165 -0
  22. cc_vjepa2-0.0.3/vjepa2/datasets/utils/video/volume_transforms.py +159 -0
  23. cc_vjepa2-0.0.3/vjepa2/datasets/utils/weighted_sampler.py +366 -0
  24. cc_vjepa2-0.0.3/vjepa2/datasets/utils/worker_init_fn.py +76 -0
  25. cc_vjepa2-0.0.3/vjepa2/datasets/video_dataset.py +384 -0
  26. cc_vjepa2-0.0.3/vjepa2/hub/__init__.py +0 -0
  27. cc_vjepa2-0.0.3/vjepa2/hub/backbones.py +336 -0
  28. cc_vjepa2-0.0.3/vjepa2/masks/__init__.py +0 -0
  29. cc_vjepa2-0.0.3/vjepa2/masks/default.py +18 -0
  30. cc_vjepa2-0.0.3/vjepa2/masks/multiseq_multiblock3d.py +269 -0
  31. cc_vjepa2-0.0.3/vjepa2/masks/utils.py +21 -0
  32. cc_vjepa2-0.0.3/vjepa2/models/__init__.py +0 -0
  33. cc_vjepa2-0.0.3/vjepa2/models/ac_predictor.py +200 -0
  34. cc_vjepa2-0.0.3/vjepa2/models/attentive_pooler.py +137 -0
  35. cc_vjepa2-0.0.3/vjepa2/models/predictor.py +261 -0
  36. cc_vjepa2-0.0.3/vjepa2/models/utils/__init__.py +0 -0
  37. cc_vjepa2-0.0.3/vjepa2/models/utils/modules.py +897 -0
  38. cc_vjepa2-0.0.3/vjepa2/models/utils/patch_embed.py +52 -0
  39. cc_vjepa2-0.0.3/vjepa2/models/utils/pos_embs.py +93 -0
  40. cc_vjepa2-0.0.3/vjepa2/models/vision_transformer.py +504 -0
  41. cc_vjepa2-0.0.3/vjepa2/utils/__init__.py +0 -0
  42. cc_vjepa2-0.0.3/vjepa2/utils/checkpoint_loader.py +37 -0
  43. cc_vjepa2-0.0.3/vjepa2/utils/distributed.py +101 -0
  44. cc_vjepa2-0.0.3/vjepa2/utils/logging.py +108 -0
  45. cc_vjepa2-0.0.3/vjepa2/utils/monitoring.py +171 -0
  46. cc_vjepa2-0.0.3/vjepa2/utils/schedulers.py +112 -0
  47. cc_vjepa2-0.0.3/vjepa2/utils/tensors.py +53 -0
  48. cc_vjepa2-0.0.3/vjepa2/utils/wrappers.py +44 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Meta Platforms, Inc. and affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,613 @@
1
+ Metadata-Version: 2.4
2
+ Name: cc-vjepa2
3
+ Version: 0.0.3
4
+ Summary: V-JEPA 2 — Self-Supervised Video Models (community fork with MPS, ST-A², bugfixes)
5
+ Author: Meta AI (original)
6
+ Author-email: "Cagatay Cali (community maintainer)" <cagataycali@icloud.com>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/cagataycali/vjepa2
9
+ Project-URL: Repository, https://github.com/cagataycali/vjepa2
10
+ Project-URL: Issues, https://github.com/cagataycali/vjepa2/issues
11
+ Keywords: vjepa,vjepa2,video,self-supervised,world-model,robotics
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: torch>=2.0
26
+ Requires-Dist: torchvision
27
+ Requires-Dist: numpy
28
+ Requires-Dist: pyyaml
29
+ Requires-Dist: timm
30
+ Requires-Dist: einops
31
+ Provides-Extra: full
32
+ Requires-Dist: tensorboard; extra == "full"
33
+ Requires-Dist: wandb; extra == "full"
34
+ Requires-Dist: iopath; extra == "full"
35
+ Requires-Dist: opencv-python; extra == "full"
36
+ Requires-Dist: submitit; extra == "full"
37
+ Requires-Dist: braceexpand; extra == "full"
38
+ Requires-Dist: webdataset; extra == "full"
39
+ Requires-Dist: transformers; extra == "full"
40
+ Requires-Dist: peft; extra == "full"
41
+ Requires-Dist: decord2; extra == "full"
42
+ Requires-Dist: pandas; extra == "full"
43
+ Requires-Dist: beartype; extra == "full"
44
+ Requires-Dist: psutil; extra == "full"
45
+ Requires-Dist: h5py; extra == "full"
46
+ Requires-Dist: fire; extra == "full"
47
+ Requires-Dist: python-box; extra == "full"
48
+ Requires-Dist: scikit-image; extra == "full"
49
+ Requires-Dist: ftfy; extra == "full"
50
+ Provides-Extra: dev
51
+ Requires-Dist: pytest>=7.0; extra == "dev"
52
+ Requires-Dist: ruff; extra == "dev"
53
+ Dynamic: license-file
54
+
55
+
56
+ ## 🚀 Install (with community patches: MPS, ST-A², bugfixes)
57
+
58
+ ```bash
59
+ pip install git+https://github.com/cagataycali/vjepa2.git
60
+ ```
61
+
62
+
63
+
64
+ 🆕 **[2026-03-16]:** :fire: V-JEPA 2.1 is released :fire: A new family of models trained with a novel recipe that learns high-quality and temporally consistent dense features!!!
65
+
66
+ **[2025-06-25]:** V-JEPA 2 is released. [[`Blog`](https://ai.meta.com/blog/v-jepa-2-world-model-benchmarks)]
67
+
68
+
69
+ # V-JEPA 2: Self-Supervised Video Models Enable Understanding, Prediction and Planning
70
+
71
+ ### [Meta FAIR](https://ai.meta.com/research/)
72
+
73
+ Mahmoud Assran∗, Adrien Bardes∗, David Fan∗, Quentin Garrido∗, Russell Howes∗, Mojtaba
74
+ Komeili∗, Matthew Muckley∗, Ammar Rizvi∗, Claire Roberts∗, Koustuv Sinha∗, Artem Zholus*,
75
+ Sergio Arnaud*, Abha Gejji*, Ada Martin*, Francois Robert Hogan*, Daniel Dugas*, Piotr
76
+ Bojanowski, Vasil Khalidov, Patrick Labatut, Francisco Massa, Marc Szafraniec, Kapil
77
+ Krishnakumar, Yong Li, Xiaodong Ma, Sarath Chandar, Franziska Meier*, Yann LeCun*, Michael
78
+ Rabbat*, Nicolas Ballas*
79
+
80
+ *Core Team
81
+
82
+ [[`Paper`](https://arxiv.org/abs/2506.09985)] [[`Blog`](https://ai.meta.com/blog/v-jepa-2-world-model-benchmarks)] [[`BibTex`](#Citation)]
83
+
84
+ Official Pytorch codebase for V-JEPA 2, V-JEPA 2-AC, V-JEPA 2.1.
85
+
86
+ V-JEPA 2 is a self-supervised approach to training video encoders, using internet-scale video data, that attains state-of-the-art performance on motion understanding and human action anticipation tasks. V-JEPA 2-AC is a latent action-conditioned world model post-trained from V-JEPA 2 (using a small amount of robot trajectory interaction data) that solves robot manipulation tasks without environment-specific data collection or task-specific training or calibration.
87
+
88
+ <p align="center">
89
+ <img src="assets/flowchart.png" width=100%>
90
+ </p>
91
+
92
+
93
+
94
+ ## V-JEPA 2.1 Pre-training
95
+
96
+ Lorenzo Mur-Labadia, Matthew Muckley, Amir Bar, Mahmoud Assran, Koustuv Sinha, Michael
97
+ Rabbat, Yann LeCun, Nicolas Ballas, Adrien Bardes
98
+
99
+ [[`Paper`](https://arxiv.org/abs/2603.14482)] [[`BibTex`](#Citation)]
100
+
101
+ V-JEPA 2.1 improves the training recipe to focus on learning high-quality and temporally consistent dense features, as highlighted by PCA visualizations:
102
+
103
+ <p align="center">
104
+ <img src="assets/teaser_screenshot_5dice.png" width=100%>
105
+ </p>
106
+
107
+ The V-JEPA 2.1 approach leverages: (1) **Dense Predictive Loss**, a masking-based
108
+ self-supervision objective where all tokens (both visible/context and masked tokens) contribute to the
109
+ self-supervised training loss; (2) **Deep Self-Supervision**, which applies the self-supervised loss at multiple
110
+ intermediate representations of the encoder models; (3) **Multi-Modal Tokenizers** for images and videos;
111
+ and we show that our approach benefits from (4) **Model and data scaling**.
112
+
113
+ <p align="center">
114
+ <img src="assets/architecture_vjepa2_1.jpg" width=100%>
115
+ </p>
116
+
117
+ V-JEPA 2.1 performance across dense and global prediction tasks:
118
+
119
+ <p align="center">
120
+ <img src="assets/bars_teaser_tikz-1.png" width=100%>
121
+ </p>
122
+
123
+
124
+ ## V-JEPA 2 Pre-training
125
+
126
+ **(Top)** The encoder and predictor are pre-trained through self-supervised learning from video using a masked latent feature prediction objective, leveraging abundant natural videos to bootstrap physical world understanding and prediction. **(Bottom)** Performance of V-JEPA 2 on downstream understanding and prediction tasks.
127
+
128
+ <img align="left" src="https://github.com/user-attachments/assets/914942d8-6a1e-409d-86ff-ff856b7346ab" width=65%>&nbsp;
129
+ <table>
130
+ <tr>
131
+ <th colspan="1">Benchmark</th>
132
+ <th colspan="1">V-JEPA 2</th>
133
+ <th colspan="1">Previous Best</th>
134
+ </tr>
135
+ <tr>
136
+ <td>EK100</td>
137
+ <td>39.7%</td>
138
+ <td>27.6% (PlausiVL)</td>
139
+ </tr>
140
+ <tr>
141
+ <td>SSv2 (Probe)</td>
142
+ <td>77.3%</td>
143
+ <td>69.7% (InternVideo2-1B)</td>
144
+ </tr>
145
+ <tr>
146
+ <td>Diving48 (Probe)</td>
147
+ <td>90.2%</td>
148
+ <td>86.4% (InternVideo2-1B)</td>
149
+ </tr>
150
+ <tr>
151
+ <td>MVP (Video QA)</td>
152
+ <td>44.5%</td>
153
+ <td>39.9% (InternVL-2.5)</td>
154
+ </tr>
155
+ <tr>
156
+ <td>TempCompass (Video QA)</td>
157
+ <td>76.9%</td>
158
+ <td>75.3% (Tarsier 2)</td>
159
+ </tr>
160
+ </table>
161
+
162
+ ## V-JEPA 2-AC Post-training
163
+
164
+ **(Top)** After post-training with a small amount of robot data, we can deploy the model on a robot arm in new environments, and tackle foundational tasks like reaching, grasping, and pick-and-place by planning from image goals. **(Bottom)** Performance on robot manipulation tasks using a Franka arm, with input provided through a monocular RGB camera.
165
+
166
+ <img align="left" src="https://github.com/user-attachments/assets/c5d42221-0102-4216-911d-061a4369a805" width=65%>&nbsp;
167
+ <table>
168
+ <tr>
169
+ <th colspan="1"></th>
170
+ <th colspan="1"></th>
171
+ <th colspan="2">Grasp</th>
172
+ <th colspan="2">Pick-and-Place</th>
173
+ </tr>
174
+ <tr>
175
+ <th colspan="1">Method</th>
176
+ <th colspan="1">Reach</th>
177
+ <th colspan="1">Cup</th>
178
+ <th colspan="1">Box</th>
179
+ <th colspan="1">Cup</th>
180
+ <th colspan="1">Box</th>
181
+ </tr>
182
+ <tr>
183
+ <td>Octo</td>
184
+ <td>100%</td>
185
+ <td>10%</td>
186
+ <td>0%</td>
187
+ <td>10%</td>
188
+ <td>10%</td>
189
+ </tr>
190
+ <tr>
191
+ <td>Cosmos</td>
192
+ <td>80%</td>
193
+ <td>0%</td>
194
+ <td>20%</td>
195
+ <td>0%</td>
196
+ <td>0%</td>
197
+ </tr>
198
+ <tr>
199
+ <td>VJEPA 2-AC</td>
200
+ <td>100%</td>
201
+ <td>60%</td>
202
+ <td>20%</td>
203
+ <td>80%</td>
204
+ <td>50%</td>
205
+ </tr>
206
+ </table>
207
+
208
+
209
+
210
+
211
+
212
+ ## Models
213
+
214
+ ### V-JEPA 2 and V-JEPA 2.1
215
+
216
+ #### HuggingFace
217
+
218
+ See our HuggingFace [collection](https://huggingface.co/collections/facebook/v-jepa-2-6841bad8413014e185b497a6) for V-JEPA 2.
219
+
220
+ #### V-JEPA 2 Pretrained Checkpoints
221
+
222
+ <table>
223
+ <tr>
224
+ <th colspan="1">Model</th>
225
+ <th colspan="1">#Parameters</th>
226
+ <th colspan="1">Resolution</th>
227
+ <th colspan="1">Download Link</th>
228
+ <th colspan="1">Pretraining Config</th>
229
+ </tr>
230
+ <tr>
231
+ <td>ViT-L/16</td>
232
+ <td>300M</td>
233
+ <td>256</td>
234
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vitl.pt">checkpoint</a></td>
235
+ <td><a href="configs/train/vitl16">configs</a></td>
236
+ </tr>
237
+ <tr>
238
+ <td>ViT-H/16</td>
239
+ <td>600M</td>
240
+ <td>256</td>
241
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vith.pt">checkpoint</a></td>
242
+ <td><a href="configs/train/vith16/">configs</a></td>
243
+ </tr>
244
+ <tr>
245
+ <td>ViT-g/16</td>
246
+ <td>1B</td>
247
+ <td>256</td>
248
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vitg.pt">checkpoint</a></td>
249
+ <td><a href="configs/train/vitg16">configs</a></td>
250
+ </tr>
251
+ <tr>
252
+ <td>ViT-g/16<sub>384</sub></td>
253
+ <td>1B</td>
254
+ <td>384</td>
255
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vitg-384.pt">checkpoint</a></td>
256
+ <td><a href="configs/train/vitg16">configs</a></td>
257
+ </tr>
258
+ </table>
259
+
260
+ #### V-JEPA 2.1 Pretrained Checkpoints
261
+
262
+ <table>
263
+ <tr>
264
+ <th colspan="1">Model</th>
265
+ <th colspan="1">#Parameters</th>
266
+ <th colspan="1">Resolution</th>
267
+ <th colspan="1">Download Link</th>
268
+ <th colspan="1">Pretraining Config</th>
269
+ </tr>
270
+
271
+ <tr>
272
+ <td>ViT-B/16</td>
273
+ <td>80M</td>
274
+ <td>384</td>
275
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2_1_vitb_dist_vitG_384.pt">checkpoint</a></td>
276
+ <td><a href="configs/train_2_1/vitb16">configs</a></td>
277
+ </tr>
278
+
279
+ <tr>
280
+ <td>ViT-L/16</td>
281
+ <td>300M</td>
282
+ <td>384</td>
283
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2_1_vitl_dist_vitG_384.pt">checkpoint</a></td>
284
+ <td><a href="configs/train_2_1/vitl16">configs</a></td>
285
+ </tr>
286
+
287
+ <tr>
288
+ <td>ViT-g/16</td>
289
+ <td>1B</td>
290
+ <td>384</td>
291
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2_1_vitg_384.pt">checkpoint</a></td>
292
+ <td><a href="configs/train_2_1/vitg16">configs</a></td>
293
+ </tr>
294
+
295
+ <tr>
296
+ <td>ViT-G/16</td>
297
+ <td>2B</td>
298
+ <td>384</td>
299
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2_1_vitG_384.pt">checkpoint</a></td>
300
+ <td><a href="configs/train_2_1/vitG16">configs</a></td>
301
+ </tr>
302
+ </table>
303
+
304
+
305
+ #### Pretrained backbones (via PyTorch Hub)
306
+
307
+ Please install [Pytorch](https://pytorch.org/get-started/locally/), [timm](https://pypi.org/project/timm/) and [einops](https://pypi.org/project/einops/) locally, then run the following to load each model. Installing Pytorch with CUDA support is strongly recommended.
308
+
309
+ ```python
310
+ import torch
311
+
312
+ # preprocessor
313
+ processor = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_preprocessor')
314
+ # models
315
+ # V-JEPA 2
316
+ vjepa2_vit_large = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_large')
317
+ vjepa2_vit_huge = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_huge')
318
+ vjepa2_vit_giant = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant')
319
+ vjepa2_vit_giant_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant_384')
320
+ # V-JEPA 2.1
321
+ vjepa2_1_vit_base_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_1_vit_base_384')
322
+ vjepa2_1_vit_large_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_1_vit_large_384')
323
+ vjepa2_1_vit_giant_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_1_vit_giant_384')
324
+ vjepa2_1_vit_gigantic_384 = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_1_vit_gigantic_384')
325
+
326
+ ```
327
+
328
+ #### Pretrained checkpoints on Huggingface
329
+
330
+ You can also use our pretrained checkpoints on [Huggingface for V-JEPA 2](https://huggingface.co/collections/facebook/v-jepa-2-6841bad8413014e185b497a6).
331
+
332
+ ```python
333
+ from transformers import AutoVideoProcessor, AutoModel
334
+
335
+ hf_repo = "facebook/vjepa2-vitg-fpc64-256"
336
+ # facebook/vjepa2-vitl-fpc64-256
337
+ # facebook/vjepa2-vith-fpc64-256
338
+ # facebook/vjepa2-vitg-fpc64-256
339
+ # facebook/vjepa2-vitg-fpc64-384
340
+
341
+ model = AutoModel.from_pretrained(hf_repo)
342
+ processor = AutoVideoProcessor.from_pretrained(hf_repo)
343
+ ```
344
+
345
+ #### Evaluation Attentive Probes
346
+
347
+ We share the trained attentive probes for two of our visual understanding evals (Something-Something v2 and Diving48) and the action anticipation eval EPIC-KITCHENS-100.
348
+
349
+ <table>
350
+ <tr>
351
+ <th colspan="1">Model</th>
352
+ <th colspan="4">SSv2</th>
353
+ <th colspan="4">Diving48</th>
354
+ <th colspan="4">EK100</th>
355
+ </tr>
356
+ <tr>
357
+ <th colspan="1"></th>
358
+ <th colspan="1">Checkpoint</th>
359
+ <th colspan="1">Training Config</th>
360
+ <th colspan="1">Inference Config</th>
361
+ <th colspan="1">Result</th>
362
+ <th colspan="1">Checkpoint</th>
363
+ <th colspan="1">Training Config</th>
364
+ <th colspan="1">Inference Config</th>
365
+ <th colspan="1">Result</th>
366
+ <th colspan="1">Checkpoint</th>
367
+ <th colspan="1">Training Config</th>
368
+ <th colspan="1">Inference Config</th>
369
+ <th colspan="1">Result</th>
370
+ </tr>
371
+ <tr>
372
+ <td>ViT-L/16</td>
373
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitl-16x2x3.pt">checkpoint</a></td>
374
+ <td><a href="configs/eval/vitl/ssv2.yaml">config</a></td>
375
+ <td><a href="configs/inference/vitl/ssv2.yaml">config</a></td>
376
+ <td>73.7%</td>
377
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/diving48-vitl-256.pt">checkpoint</a></td>
378
+ <td><a href="configs/eval/vitl/diving48.yaml">config</a></td>
379
+ <td><a href="configs/inference/vitl/diving48.yaml">config</a></td>
380
+ <td>89.0%</td>
381
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/ek100-vitl-256.pt">checkpoint</a></td>
382
+ <td><a href="configs/eval/vitl/ek100.yaml">config</a></td>
383
+ <td><a href="configs/inference/vitl/ek100.yaml">config</a></td>
384
+ <td>32.7 R@5</td>
385
+ </tr>
386
+ <tr>
387
+ <td>ViT-g/16<sub>384</sub></td>
388
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitg-384-64x2x3.pt">checkpoint</a></td>
389
+ <td><a href="configs/eval/vitg-384/ssv2.yaml">config</a></td>
390
+ <td><a href="configs/inference/vitg-384/ssv2.yaml">config</a></td>
391
+ <td>77.3%</td>
392
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/diving48-vitg-384-32x4x3.pt">checkpoint</a></td>
393
+ <td><a href="configs/eval/vitg-384/diving48.yaml">config</a></td>
394
+ <td><a href="configs/inference/vitg-384/diving48.yaml">config</a></td>
395
+ <td>90.2%</td>
396
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/evals/ek100-vitg-384.pt">checkpoint</a></td>
397
+ <td><a href="configs/eval/vitg-384/ek100.yaml">config</a></td>
398
+ <td><a href="configs/inference/vitg-384/ek100.yaml">config</a></td>
399
+ <td>39.7 R@5</td>
400
+ </tr>
401
+ </table>
402
+
403
+ ### V-JEPA 2-AC
404
+
405
+ Our action-conditioned checkpoint was trained from the ViT-g encoder.
406
+ <table>
407
+ <tr>
408
+ <th colspan="1">Model</th>
409
+ <th colspan="1">Download Link</th>
410
+ <th colspan="1">Training Config</th>
411
+ </tr>
412
+ <tr>
413
+ <td>ViT-g/16</td>
414
+ <td><a href="https://dl.fbaipublicfiles.com/vjepa2/vjepa2-ac-vitg.pt">checkpoint</a></td>
415
+ <td><a href="configs/train/vitg16/droid-256px-8f.yaml">config</a></td>
416
+ </tr>
417
+ </table>
418
+
419
+ #### Pretrained action-conditioned backbone (via PyTorch Hub)
420
+
421
+ Please install [Pytorch](https://pytorch.org/get-started/locally/), [timm](https://pypi.org/project/timm/) and [einops](https://pypi.org/project/einops/) locally, then run the following to load each model. Installing Pytorch with CUDA support is strongly recommended.
422
+
423
+ ```python
424
+ import torch
425
+
426
+ vjepa2_encoder, vjepa2_ac_predictor = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_ac_vit_giant')
427
+ ```
428
+
429
+
430
+ See [energy_landscape_example.ipynb](notebooks/energy_landscape_example.ipynb) for an example notebook computing the energy landscape of the pretrained action-conditioned backbone using a robot trajectory collected from our lab.
431
+ To run this notebook, you'll need to additionally install [Jupyter](https://jupyter.org/install) and [Scipy](https://scipy.org/install/) in your conda environment.
432
+
433
+
434
+
435
+ ## Getting Started
436
+
437
+ ### Setup
438
+
439
+ ```
440
+ conda create -n vjepa2-312 python=3.12
441
+ conda activate vjepa2-312
442
+ pip install . # or `pip install -e .` for development mode
443
+ ```
444
+
445
+ **Note to macOS users:** V-JEPA 2 relies on [`decord`](https://github.com/dmlc/decord), which does not support macOS (and, unfortunately, is also no longer under development). In order to run the V-JEPA 2 code on macOS, you will need a different `decord` implementation. We do not make specific recommendations, although some users have reported the use of [`eva-decord`](https://github.com/georgia-tech-db/eva-decord) (see [PR 1](https://github.com/facebookresearch/vjepa2/pull/1)) or [`decord2`](https://github.com/johnnynunez/decord2) (see [PR 31](https://github.com/facebookresearch/vjepa2/pull/31)). We leave the selection of the `decord` package up to the user's discretion.
446
+
447
+ ### Usage Demo
448
+
449
+ See [vjepa2_demo.ipynb](notebooks/vjepa2_demo.ipynb) [(Colab Link)](https://colab.research.google.com/github/facebookresearch/vjepa2/blob/main/notebooks/vjepa2_demo.ipynb) or [vjepa2_demo.py](notebooks/vjepa2_demo.py) for an example of how to load both the HuggingFace and PyTorch V-JEPA 2 models and run inference on a sample video to get a sample classification result.
450
+
451
+ The script assumes the presence of downloaded model checkpoints so you will need to download the model weights and update the corresponding paths in the script. E.g.:
452
+ ```
453
+ wget https://dl.fbaipublicfiles.com/vjepa2/vitg-384.pt -P YOUR_DIR
454
+ wget https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitg-384-64x2x3.pt -P YOUR_DIR
455
+
456
+ # Then update your model paths in vjepa2_demo.py.
457
+ pt_model_path = YOUR_DIR/vitg-384.pt
458
+ classifier_model_path = YOUR_DIR/ssv2-vitg-384-64x2x3.pt
459
+
460
+ # Then run the script (assumes your machine has a GPU)
461
+ python -m notebooks.vjepa2_demo
462
+ ```
463
+
464
+ ### Probe-based evaluation
465
+
466
+ Probe-based evaluation consists in training an attentive probe on top of frozen V-JEPA 2 features. We provide training scripts for training your own probes, and checkpoints to run inference directly.
467
+
468
+ #### Training probes
469
+
470
+ Evaluations can be run either locally, or distributed via SLURM. (Running locally is useful for debugging and validation).
471
+ These sample commands launch Something-Something v2 video classification; other evals are launched by specifying the corresponding config.
472
+ Use the provided training configs under "Evaluation Attentive Probes". These configs allow training multiple probes in parallel with various optimization parameters.
473
+ Change filepaths as needed (e.g. `folder`, `checkpoint`, `dataset_train`, `dataset_val`) to match locations of data and downloaded checkpoints on your local filesystem.
474
+ Change \# nodes and local batch size as needed to not exceed available GPU memory.
475
+
476
+ ##### Local
477
+
478
+ To run locally, specify the GPUs to use:
479
+ ```
480
+ python -m evals.main --fname configs/eval/vitl/ssv2.yaml \
481
+ --devices cuda:0 cuda:1
482
+ ```
483
+
484
+ ##### Distributed
485
+
486
+ ```
487
+ python -m evals.main_distributed \
488
+ --fname configs/eval/vitl/ssv2.yaml \
489
+ --time 8600 \
490
+ --account my_account --qos=my_qos
491
+ ```
492
+
493
+ #### Inference from existing probes
494
+
495
+ Use provided inference configs under [Evaluation Attentive Probes](#evaluation-attentive-probes).
496
+ Download the corresponding checkpoint, rename it to 'latest.pt', and create a folder with the checkpoint inside, with the format matching the variables in the config:
497
+ ```
498
+ [folder]/[eval_name]/[tag]/latest.pt
499
+ ```
500
+ Then run inference, locally or distributed, using the same evaluation commands as above, but with configs from `configs/inference`.
501
+
502
+ ### Pretraining
503
+
504
+ Likewise, training can also be run locally or distributed. Pretraining and cooldown training phases are
505
+ run with the same command using different configs.
506
+ These sample commands launch initial training of a ViT-L model. Configs for cooldown (or action-conditioned) training
507
+ can be found in the same directory as the config for initial training.
508
+
509
+ #### Local
510
+
511
+ ```
512
+ python -m app.main --fname configs/train/vitl16/pretrain-256px-16f.yaml \
513
+ --devices cuda:0
514
+ ```
515
+
516
+ #### Distributed
517
+
518
+ ```
519
+ python -m app.main_distributed \
520
+ --fname configs/train/vitl16/pretrain-256px-16f.yaml \
520
+ --time 6000 \
522
+ --account my_account --qos=my_qos
523
+ ```
524
+
525
+ ### Post-training
526
+
527
+ Post-training of the action-conditioned model, starting from the pretrained VJEPA 2 backbone, also follows a similar interface, and can be run locally or distributed using [this config](configs/train/vitg16/droid-256px-8f.yaml).
528
+ We post-train the model starting from the ViT-g/16 backbone.
529
+
530
+ #### Local
531
+
532
+ ```
533
+ python -m app.main --fname configs/train/vitg16/droid-256px-8f.yaml \
534
+ --devices cuda:0
535
+ ```
536
+
537
+ #### Distributed
538
+
539
+ ```
540
+ python -m app.main_distributed \
541
+ --fname configs/train/vitg16/droid-256px-8f.yaml \
541
+ --time 6000 \
543
+ --account my_account --qos=my_qos
544
+ ```
545
+
546
+
547
+ ## Code Structure
548
+
549
+ ```
550
+ .
551
+ ├── app # training loops
552
+ │ ├── vjepa # V-JEPA 2 pre-training
553
+ │ ├── vjepa_2_1 # V-JEPA 2.1 pre-training
554
+ │ ├── vjepa_droid # training the action-conditioned model
555
+ │ ├── main_distributed.py # entrypoint for launching the app on a SLURM cluster
556
+ │ └── main.py # entrypoint for launching the app locally on your machine
557
+ ├── configs # config files with experiment params for training and evaluation
558
+ │ ├── train # pretraining with V-JEPA 2 (phase 1), cooldown (phase 2), and action-conditioned training
559
+ │ ├── train_2_1 # pretraining with V-JEPA 2.1 (phase 1), cooldown (phase 2)
560
+ │ └── eval # frozen evaluations
561
+ │ └── inference # inference only frozen evaluations
562
+ ├── evals # evaluation loops training an attentive probe with frozen backbone...
563
+ │ ├── action_anticipation_frozen # action anticipation
564
+ │ ├── image_classification_frozen # image understanding
565
+ │ ├── video_classification_frozen # video understanding
566
+ │ ├── main_distributed.py # entrypoint for distributed evaluations
567
+ │ └── main.py # entrypoint for locally-run evaluations
568
+ ├── src # the package
569
+ │ ├── datasets # datasets, data loaders, ...
570
+ │ ├── models # model definitions
571
+ │ ├── masks # mask collators, masking utilities, ...
572
+ │ └── utils # shared utilities
573
+ ├── tests # unit tests for some modules in `src`
574
+
575
+ ```
576
+
577
+ ## License
578
+
579
+ The majority of V-JEPA 2 is licensed under MIT, however portions of the project are available under separate license terms:
580
+
581
+ [src/datasets/utils/video/randaugment.py](src/datasets/utils/video/randaugment.py)<br>
582
+ [src/datasets/utils/video/randerase.py](src/datasets/utils/video/randerase.py)<br>
583
+ [src/datasets/utils/worker_init_fn.py](src/datasets/utils/worker_init_fn.py)<br>
584
+
585
+ are licensed under the Apache 2.0 license.
586
+
587
+
588
+ ## Citation
589
+ If you find this repository useful in your research, please consider giving it a star :star: and citing the papers:
590
+
591
+ ```bibtex
592
+ @article{assran2025vjepa2,
593
+ title={V-JEPA~2: Self-Supervised Video Models Enable Understanding, Prediction and Planning},
594
+ author={Assran, Mahmoud and Bardes, Adrien and Fan, David and Garrido, Quentin and Howes, Russell and
595
+ Komeili, Mojtaba and Muckley, Matthew and Rizvi, Ammar and Roberts, Claire and Sinha, Koustuv and Zholus, Artem and
596
+ Arnaud, Sergio and Gejji, Abha and Martin, Ada and Robert Hogan, Francois and Dugas, Daniel and
597
+ Bojanowski, Piotr and Khalidov, Vasil and Labatut, Patrick and Massa, Francisco and Szafraniec, Marc and
598
+ Krishnakumar, Kapil and Li, Yong and Ma, Xiaodong and Chandar, Sarath and Meier, Franziska and LeCun, Yann and
599
+ Rabbat, Michael and Ballas, Nicolas},
600
+ journal={arXiv preprint arXiv:2506.09985},
601
+ year={2025}
602
+ }
603
+ ```
604
+
605
+ ```bibtex
606
+ @article{murlabadia2026vjepa2_1,
607
+ title={V-JEPA 2.1: Unlocking Dense Features in Video Self-Supervised Learning},
608
+ author={Mur-Labadia, Lorenzo and Muckley, Matthew and Bar, Amir and Assran, Mahmoud and
609
+ Sinha, Koustuv and Rabbat, Michael and LeCun, Yann and Ballas, Nicolas and Bardes, Adrien},
610
+ journal={arXiv preprint arXiv:2603.14482},
611
+ year={2026}
612
+ }
613
+ ```