nvidia-vipe 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (317) hide show
  1. nvidia_vipe-0.1.1/LICENSE +13 -0
  2. nvidia_vipe-0.1.1/MANIFEST.in +8 -0
  3. nvidia_vipe-0.1.1/PKG-INFO +227 -0
  4. nvidia_vipe-0.1.1/README.md +175 -0
  5. nvidia_vipe-0.1.1/THIRD_PARTY_LICENSES.md +3189 -0
  6. nvidia_vipe-0.1.1/configs/default.yaml +16 -0
  7. nvidia_vipe-0.1.1/configs/pipeline/dav3.yaml +8 -0
  8. nvidia_vipe-0.1.1/configs/pipeline/default.yaml +47 -0
  9. nvidia_vipe-0.1.1/configs/pipeline/lyra.yaml +8 -0
  10. nvidia_vipe-0.1.1/configs/pipeline/no_vda.yaml +5 -0
  11. nvidia_vipe-0.1.1/configs/pipeline/panorama.yaml +51 -0
  12. nvidia_vipe-0.1.1/configs/pipeline/static_vda.yaml +8 -0
  13. nvidia_vipe-0.1.1/configs/pipeline/wide_angle.yaml +14 -0
  14. nvidia_vipe-0.1.1/configs/slam/default.yaml +63 -0
  15. nvidia_vipe-0.1.1/configs/streams/frame_dir_stream.yaml +7 -0
  16. nvidia_vipe-0.1.1/configs/streams/raw_mp4_stream.yaml +7 -0
  17. nvidia_vipe-0.1.1/csrc/bind.cpp +49 -0
  18. nvidia_vipe-0.1.1/csrc/corr_ext/correlation.cpp +148 -0
  19. nvidia_vipe-0.1.1/csrc/corr_ext/correlation_cuda_kernel.cu +286 -0
  20. nvidia_vipe-0.1.1/csrc/corr_ext/correlation_sampler.cpp +85 -0
  21. nvidia_vipe-0.1.1/csrc/droid_net_ext/altcorr_kernel.cu +321 -0
  22. nvidia_vipe-0.1.1/csrc/droid_net_ext/correlation_kernels.cu +159 -0
  23. nvidia_vipe-0.1.1/csrc/droid_net_ext/droid.cpp +63 -0
  24. nvidia_vipe-0.1.1/csrc/grounding_dino_ext/ms_deform_attn_cuda.cu +136 -0
  25. nvidia_vipe-0.1.1/csrc/grounding_dino_ext/ms_deform_attn_cuda.h +25 -0
  26. nvidia_vipe-0.1.1/csrc/grounding_dino_ext/ms_deform_im2col_cuda.cuh +914 -0
  27. nvidia_vipe-0.1.1/csrc/grounding_dino_ext/vision.cpp +33 -0
  28. nvidia_vipe-0.1.1/csrc/lietorch_ext/common.h +16 -0
  29. nvidia_vipe-0.1.1/csrc/lietorch_ext/dispatch.h +52 -0
  30. nvidia_vipe-0.1.1/csrc/lietorch_ext/lietorch.cpp +336 -0
  31. nvidia_vipe-0.1.1/csrc/lietorch_ext/lietorch_cpu.cpp +610 -0
  32. nvidia_vipe-0.1.1/csrc/lietorch_ext/lietorch_cpu.h +50 -0
  33. nvidia_vipe-0.1.1/csrc/lietorch_ext/lietorch_gpu.cu +562 -0
  34. nvidia_vipe-0.1.1/csrc/lietorch_ext/lietorch_gpu.h +52 -0
  35. nvidia_vipe-0.1.1/csrc/lietorch_ext/rxso3.h +313 -0
  36. nvidia_vipe-0.1.1/csrc/lietorch_ext/se3.h +214 -0
  37. nvidia_vipe-0.1.1/csrc/lietorch_ext/sim3.h +203 -0
  38. nvidia_vipe-0.1.1/csrc/lietorch_ext/so3.h +202 -0
  39. nvidia_vipe-0.1.1/csrc/scatter_ext/cuda/atomics.cuh +300 -0
  40. nvidia_vipe-0.1.1/csrc/scatter_ext/cuda/reducer.cuh +115 -0
  41. nvidia_vipe-0.1.1/csrc/scatter_ext/cuda/scatter_cuda.cu +128 -0
  42. nvidia_vipe-0.1.1/csrc/scatter_ext/cuda/scatter_cuda.h +15 -0
  43. nvidia_vipe-0.1.1/csrc/scatter_ext/cuda/utils.cuh +29 -0
  44. nvidia_vipe-0.1.1/csrc/scatter_ext/scatter.cpp +238 -0
  45. nvidia_vipe-0.1.1/csrc/slam_ext/geom_kernels.cu +1509 -0
  46. nvidia_vipe-0.1.1/csrc/slam_ext/slam.cpp +37 -0
  47. nvidia_vipe-0.1.1/csrc/utils_ext/cuda_kdtree.cu +1361 -0
  48. nvidia_vipe-0.1.1/csrc/utils_ext/cuda_kdtree.cuh +137 -0
  49. nvidia_vipe-0.1.1/csrc/utils_ext/knn.cu +66 -0
  50. nvidia_vipe-0.1.1/csrc/utils_ext/math_util.h +1392 -0
  51. nvidia_vipe-0.1.1/csrc/utils_ext/platform.h +50 -0
  52. nvidia_vipe-0.1.1/csrc/utils_ext/utils_bind.cpp +25 -0
  53. nvidia_vipe-0.1.1/nvidia_vipe.egg-info/PKG-INFO +227 -0
  54. nvidia_vipe-0.1.1/nvidia_vipe.egg-info/SOURCES.txt +315 -0
  55. nvidia_vipe-0.1.1/nvidia_vipe.egg-info/dependency_links.txt +1 -0
  56. nvidia_vipe-0.1.1/nvidia_vipe.egg-info/entry_points.txt +2 -0
  57. nvidia_vipe-0.1.1/nvidia_vipe.egg-info/requires.txt +26 -0
  58. nvidia_vipe-0.1.1/nvidia_vipe.egg-info/top_level.txt +2 -0
  59. nvidia_vipe-0.1.1/pyproject.toml +152 -0
  60. nvidia_vipe-0.1.1/setup.cfg +4 -0
  61. nvidia_vipe-0.1.1/setup.py +48 -0
  62. nvidia_vipe-0.1.1/tests/test_config.py +128 -0
  63. nvidia_vipe-0.1.1/vipe/__init__.py +42 -0
  64. nvidia_vipe-0.1.1/vipe/cli/__init__.py +14 -0
  65. nvidia_vipe-0.1.1/vipe/cli/main.py +110 -0
  66. nvidia_vipe-0.1.1/vipe/config/__init__.py +29 -0
  67. nvidia_vipe-0.1.1/vipe/config/base_schema.py +43 -0
  68. nvidia_vipe-0.1.1/vipe/config/parse.py +67 -0
  69. nvidia_vipe-0.1.1/vipe/config/pipeline.py +70 -0
  70. nvidia_vipe-0.1.1/vipe/config/slam.py +73 -0
  71. nvidia_vipe-0.1.1/vipe/config/streams.py +36 -0
  72. nvidia_vipe-0.1.1/vipe/config/vipe.py +15 -0
  73. nvidia_vipe-0.1.1/vipe/configs/__init__.py +1 -0
  74. nvidia_vipe-0.1.1/vipe/configs/default.yaml +16 -0
  75. nvidia_vipe-0.1.1/vipe/configs/pipeline/__init__.py +1 -0
  76. nvidia_vipe-0.1.1/vipe/configs/pipeline/dav3.yaml +8 -0
  77. nvidia_vipe-0.1.1/vipe/configs/pipeline/default.yaml +47 -0
  78. nvidia_vipe-0.1.1/vipe/configs/pipeline/lyra.yaml +8 -0
  79. nvidia_vipe-0.1.1/vipe/configs/pipeline/no_vda.yaml +5 -0
  80. nvidia_vipe-0.1.1/vipe/configs/pipeline/panorama.yaml +51 -0
  81. nvidia_vipe-0.1.1/vipe/configs/pipeline/static_vda.yaml +8 -0
  82. nvidia_vipe-0.1.1/vipe/configs/pipeline/wide_angle.yaml +14 -0
  83. nvidia_vipe-0.1.1/vipe/configs/slam/__init__.py +1 -0
  84. nvidia_vipe-0.1.1/vipe/configs/slam/default.yaml +63 -0
  85. nvidia_vipe-0.1.1/vipe/configs/streams/__init__.py +1 -0
  86. nvidia_vipe-0.1.1/vipe/configs/streams/frame_dir_stream.yaml +7 -0
  87. nvidia_vipe-0.1.1/vipe/configs/streams/raw_mp4_stream.yaml +7 -0
  88. nvidia_vipe-0.1.1/vipe/ext/__init__.py +45 -0
  89. nvidia_vipe-0.1.1/vipe/ext/corr/__init__.py +5 -0
  90. nvidia_vipe-0.1.1/vipe/ext/corr/spatial_correlation_sampler.py +126 -0
  91. nvidia_vipe-0.1.1/vipe/ext/lietorch/__init__.py +6 -0
  92. nvidia_vipe-0.1.1/vipe/ext/lietorch/broadcasting.py +34 -0
  93. nvidia_vipe-0.1.1/vipe/ext/lietorch/group_ops.py +123 -0
  94. nvidia_vipe-0.1.1/vipe/ext/lietorch/groups.py +330 -0
  95. nvidia_vipe-0.1.1/vipe/ext/scatter.py +205 -0
  96. nvidia_vipe-0.1.1/vipe/ext/specs.py +74 -0
  97. nvidia_vipe-0.1.1/vipe/ext/xformers.py +51 -0
  98. nvidia_vipe-0.1.1/vipe/pipeline/__init__.py +87 -0
  99. nvidia_vipe-0.1.1/vipe/pipeline/default.py +157 -0
  100. nvidia_vipe-0.1.1/vipe/pipeline/panorama.py +318 -0
  101. nvidia_vipe-0.1.1/vipe/pipeline/processors.py +521 -0
  102. nvidia_vipe-0.1.1/vipe/priors/__init__.py +0 -0
  103. nvidia_vipe-0.1.1/vipe/priors/depth/__init__.py +46 -0
  104. nvidia_vipe-0.1.1/vipe/priors/depth/adapter.py +107 -0
  105. nvidia_vipe-0.1.1/vipe/priors/depth/alignment.py +125 -0
  106. nvidia_vipe-0.1.1/vipe/priors/depth/base.py +114 -0
  107. nvidia_vipe-0.1.1/vipe/priors/depth/dap/__init__.py +151 -0
  108. nvidia_vipe-0.1.1/vipe/priors/depth/dap/dino.py +691 -0
  109. nvidia_vipe-0.1.1/vipe/priors/depth/dap/model.py +361 -0
  110. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/__init__.py +105 -0
  111. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2.py +426 -0
  112. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2_layers/__init__.py +11 -0
  113. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2_layers/attention.py +69 -0
  114. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2_layers/block.py +253 -0
  115. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2_layers/drop_path.py +37 -0
  116. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2_layers/layer_scale.py +27 -0
  117. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2_layers/mlp.py +41 -0
  118. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2_layers/patch_embed.py +114 -0
  119. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dinov2_layers/swiglu_ffn.py +33 -0
  120. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/dpt.py +280 -0
  121. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/util/__init__.py +3 -0
  122. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/util/blocks.py +207 -0
  123. nvidia_vipe-0.1.1/vipe/priors/depth/dav2/util/transform.py +167 -0
  124. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/__init__.py +98 -0
  125. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/api.py +489 -0
  126. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/cfg.py +183 -0
  127. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/configs/__init__.py +1 -0
  128. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/configs/da3-giant.yaml +45 -0
  129. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/configs/da3metric-large.yaml +28 -0
  130. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/model/__init__.py +20 -0
  131. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/model/da3.py +361 -0
  132. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/model/heads.py +1102 -0
  133. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/model/utils.py +410 -0
  134. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/registry.py +50 -0
  135. nvidia_vipe-0.1.1/vipe/priors/depth/dav3/utils.py +1264 -0
  136. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/__init__.py +167 -0
  137. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/__init__.py +3 -0
  138. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/backbones/ConvNeXt.py +308 -0
  139. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/backbones/ViT_DINO_reg.py +1355 -0
  140. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/backbones/__init__.py +17 -0
  141. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/configs/__init__.py +1 -0
  142. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/configs/convlarge.0.3_150.py +52 -0
  143. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/configs/convtiny.0.3_150.py +52 -0
  144. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/configs/vit.raft5.giant2.py +59 -0
  145. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/configs/vit.raft5.large.py +59 -0
  146. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/configs/vit.raft5.small.py +59 -0
  147. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/decode_heads/HourGlassDecoder.py +305 -0
  148. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/decode_heads/RAFTDepthNormalDPTDecoder5.py +1322 -0
  149. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/decode_heads/__init__.py +8 -0
  150. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/dense_pipeline.py +64 -0
  151. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model/monodepth_model.py +41 -0
  152. nvidia_vipe-0.1.1/vipe/priors/depth/metric3d/model_fn.py +192 -0
  153. nvidia_vipe-0.1.1/vipe/priors/depth/moge.py +86 -0
  154. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/__init__.py +49 -0
  155. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/dav2/__init__.py +33 -0
  156. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/dav2/dinov2.py +423 -0
  157. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/dav2/dpt.py +272 -0
  158. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/dav2/transform.py +76 -0
  159. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/depth_completion.py +428 -0
  160. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/priorda.py +268 -0
  161. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/sparse_sampler.py +356 -0
  162. nvidia_vipe-0.1.1/vipe/priors/depth/priorda/utils.py +59 -0
  163. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/__init__.py +71 -0
  164. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/layers.py +341 -0
  165. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/models/__init__.py +1 -0
  166. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/models/encoder.py +951 -0
  167. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/models/unidepthv2/__init__.py +1 -0
  168. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/models/unidepthv2/decoder.py +491 -0
  169. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/models/unidepthv2/unidepthv2.py +424 -0
  170. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/utils/__init__.py +1 -0
  171. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/utils/camera.py +1308 -0
  172. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/utils/constants.py +20 -0
  173. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/utils/coordinate.py +30 -0
  174. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/utils/geometric.py +273 -0
  175. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/utils/misc.py +648 -0
  176. nvidia_vipe-0.1.1/vipe/priors/depth/unidepth/utils/positional_embedding.py +250 -0
  177. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/__init__.py +65 -0
  178. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/models/__init__.py +1 -0
  179. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/models/decoder.py +504 -0
  180. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/models/encoder.py +68 -0
  181. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/models/layers.py +848 -0
  182. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/unik3d.py +433 -0
  183. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/utils/__init__.py +1 -0
  184. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/utils/camera.py +1310 -0
  185. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/utils/constants.py +42 -0
  186. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/utils/coordinate.py +25 -0
  187. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/utils/geometric.py +444 -0
  188. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/utils/misc.py +592 -0
  189. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/utils/positional_embedding.py +245 -0
  190. nvidia_vipe-0.1.1/vipe/priors/depth/unik3d/utils/sht.py +1231 -0
  191. nvidia_vipe-0.1.1/vipe/priors/depth/videodepthanything/__init__.py +73 -0
  192. nvidia_vipe-0.1.1/vipe/priors/depth/videodepthanything/dpt_temporal.py +135 -0
  193. nvidia_vipe-0.1.1/vipe/priors/depth/videodepthanything/motion_module/__init__.py +1 -0
  194. nvidia_vipe-0.1.1/vipe/priors/depth/videodepthanything/motion_module/attention.py +435 -0
  195. nvidia_vipe-0.1.1/vipe/priors/depth/videodepthanything/motion_module/motion_module.py +326 -0
  196. nvidia_vipe-0.1.1/vipe/priors/depth/videodepthanything/util.py +76 -0
  197. nvidia_vipe-0.1.1/vipe/priors/depth/videodepthanything/video_depth.py +182 -0
  198. nvidia_vipe-0.1.1/vipe/priors/geocalib/__init__.py +5 -0
  199. nvidia_vipe-0.1.1/vipe/priors/geocalib/camera.py +953 -0
  200. nvidia_vipe-0.1.1/vipe/priors/geocalib/extractor.py +134 -0
  201. nvidia_vipe-0.1.1/vipe/priors/geocalib/geocalib.py +159 -0
  202. nvidia_vipe-0.1.1/vipe/priors/geocalib/gravity.py +131 -0
  203. nvidia_vipe-0.1.1/vipe/priors/geocalib/lm_optimizer.py +627 -0
  204. nvidia_vipe-0.1.1/vipe/priors/geocalib/misc.py +322 -0
  205. nvidia_vipe-0.1.1/vipe/priors/geocalib/modules.py +624 -0
  206. nvidia_vipe-0.1.1/vipe/priors/geocalib/perspective_fields.py +372 -0
  207. nvidia_vipe-0.1.1/vipe/priors/geocalib/utils.py +326 -0
  208. nvidia_vipe-0.1.1/vipe/priors/track_anything/__init__.py +123 -0
  209. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/__init__.py +0 -0
  210. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/config.py +211 -0
  211. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/__init__.py +3 -0
  212. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/decoders/__init__.py +12 -0
  213. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/decoders/fpn.py +73 -0
  214. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/encoders/__init__.py +20 -0
  215. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/encoders/resnet.py +205 -0
  216. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/engines/__init__.py +25 -0
  217. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/engines/aot_engine.py +645 -0
  218. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/engines/deaot_engine.py +115 -0
  219. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/layers/__init__.py +3 -0
  220. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/layers/attention.py +909 -0
  221. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/layers/basic.py +149 -0
  222. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/layers/normalization.py +48 -0
  223. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/layers/position.py +81 -0
  224. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/layers/transformer.py +696 -0
  225. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/models/__init__.py +15 -0
  226. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/models/aot.py +130 -0
  227. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/networks/models/deaot.py +63 -0
  228. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/transforms/__init__.py +3 -0
  229. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/transforms/image_transforms.py +532 -0
  230. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/transforms/video_transforms.py +688 -0
  231. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/utils/__init__.py +3 -0
  232. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/utils/checkpoint.py +158 -0
  233. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/utils/ema.py +94 -0
  234. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/utils/image.py +847 -0
  235. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/utils/learning.py +106 -0
  236. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot/utils/math.py +28 -0
  237. nvidia_vipe-0.1.1/vipe/priors/track_anything/aot_tracker.py +189 -0
  238. nvidia_vipe-0.1.1/vipe/priors/track_anything/detector.py +104 -0
  239. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/__init__.py +0 -0
  240. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/config.py +49 -0
  241. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/datasets/__init__.py +0 -0
  242. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/datasets/transforms.py +300 -0
  243. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/__init__.py +18 -0
  244. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/__init__.py +15 -0
  245. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/backbone/__init__.py +1 -0
  246. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/backbone/backbone.py +240 -0
  247. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/backbone/position_encoding.py +197 -0
  248. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/backbone/swin_transformer.py +798 -0
  249. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/bertwarper.py +293 -0
  250. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/fuse_modules.py +300 -0
  251. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/groundingdino.py +385 -0
  252. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/ms_deform_attn.py +397 -0
  253. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/transformer.py +955 -0
  254. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/transformer_vanilla.py +119 -0
  255. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/main/utils.py +257 -0
  256. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/models/registry.py +66 -0
  257. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/util/__init__.py +1 -0
  258. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/util/box_ops.py +140 -0
  259. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/util/get_tokenlizer.py +31 -0
  260. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/util/inference.py +216 -0
  261. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/util/misc.py +750 -0
  262. nvidia_vipe-0.1.1/vipe/priors/track_anything/groundingdino/util/utils.py +548 -0
  263. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/__init__.py +15 -0
  264. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/automatic_mask_generator.py +382 -0
  265. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/build_sam.py +113 -0
  266. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/modeling/__init__.py +11 -0
  267. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/modeling/common.py +43 -0
  268. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/modeling/image_encoder.py +419 -0
  269. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/modeling/mask_decoder.py +186 -0
  270. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/modeling/prompt_encoder.py +225 -0
  271. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/modeling/sam.py +180 -0
  272. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/modeling/transformer.py +242 -0
  273. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/predictor.py +284 -0
  274. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/utils/__init__.py +5 -0
  275. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/utils/amg.py +346 -0
  276. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/utils/onnx.py +155 -0
  277. nvidia_vipe-0.1.1/vipe/priors/track_anything/sam/utils/transforms.py +112 -0
  278. nvidia_vipe-0.1.1/vipe/priors/track_anything/seg_tracker.py +173 -0
  279. nvidia_vipe-0.1.1/vipe/priors/track_anything/segmentor.py +107 -0
  280. nvidia_vipe-0.1.1/vipe/slam/__init__.py +14 -0
  281. nvidia_vipe-0.1.1/vipe/slam/ba/__init__.py +14 -0
  282. nvidia_vipe-0.1.1/vipe/slam/ba/kernel.py +162 -0
  283. nvidia_vipe-0.1.1/vipe/slam/ba/solver.py +196 -0
  284. nvidia_vipe-0.1.1/vipe/slam/ba/terms.py +432 -0
  285. nvidia_vipe-0.1.1/vipe/slam/components/__init__.py +14 -0
  286. nvidia_vipe-0.1.1/vipe/slam/components/backend.py +123 -0
  287. nvidia_vipe-0.1.1/vipe/slam/components/buffer.py +768 -0
  288. nvidia_vipe-0.1.1/vipe/slam/components/factor_graph.py +493 -0
  289. nvidia_vipe-0.1.1/vipe/slam/components/frontend.py +158 -0
  290. nvidia_vipe-0.1.1/vipe/slam/components/inner_filler.py +137 -0
  291. nvidia_vipe-0.1.1/vipe/slam/components/motion_filter.py +150 -0
  292. nvidia_vipe-0.1.1/vipe/slam/components/sparse_tracks/__init__.py +155 -0
  293. nvidia_vipe-0.1.1/vipe/slam/components/sparse_tracks/cuvslam.py +92 -0
  294. nvidia_vipe-0.1.1/vipe/slam/interface.py +205 -0
  295. nvidia_vipe-0.1.1/vipe/slam/maths/__init__.py +14 -0
  296. nvidia_vipe-0.1.1/vipe/slam/maths/geom.py +353 -0
  297. nvidia_vipe-0.1.1/vipe/slam/maths/matrix.py +514 -0
  298. nvidia_vipe-0.1.1/vipe/slam/maths/retractor.py +64 -0
  299. nvidia_vipe-0.1.1/vipe/slam/maths/vector.py +156 -0
  300. nvidia_vipe-0.1.1/vipe/slam/networks/__init__.py +14 -0
  301. nvidia_vipe-0.1.1/vipe/slam/networks/droid_net.py +553 -0
  302. nvidia_vipe-0.1.1/vipe/slam/networks/superpoint.py +181 -0
  303. nvidia_vipe-0.1.1/vipe/slam/system.py +321 -0
  304. nvidia_vipe-0.1.1/vipe/streams/__init__.py +14 -0
  305. nvidia_vipe-0.1.1/vipe/streams/base.py +539 -0
  306. nvidia_vipe-0.1.1/vipe/streams/frame_dir_stream.py +142 -0
  307. nvidia_vipe-0.1.1/vipe/streams/raw_mp4_stream.py +116 -0
  308. nvidia_vipe-0.1.1/vipe/utils/__init__.py +14 -0
  309. nvidia_vipe-0.1.1/vipe/utils/cameras.py +407 -0
  310. nvidia_vipe-0.1.1/vipe/utils/depth.py +419 -0
  311. nvidia_vipe-0.1.1/vipe/utils/geometry.py +677 -0
  312. nvidia_vipe-0.1.1/vipe/utils/io.py +381 -0
  313. nvidia_vipe-0.1.1/vipe/utils/logging.py +53 -0
  314. nvidia_vipe-0.1.1/vipe/utils/misc.py +26 -0
  315. nvidia_vipe-0.1.1/vipe/utils/morph.py +134 -0
  316. nvidia_vipe-0.1.1/vipe/utils/viser.py +446 -0
  317. nvidia_vipe-0.1.1/vipe/utils/visualization.py +494 -0
@@ -0,0 +1,13 @@
1
+ Copyright 2025 NVIDIA CORPORATION & AFFILIATES.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
@@ -0,0 +1,8 @@
1
+ include LICENSE
2
+ include README.md
3
+ include THIRD_PARTY_LICENSES.md
4
+ recursive-include csrc *
5
+ prune csrc/include
6
+ recursive-include configs *.yaml
7
+ recursive-include vipe/configs *.yaml
8
+ recursive-include vipe/priors *.yaml
@@ -0,0 +1,227 @@
1
+ Metadata-Version: 2.4
2
+ Name: nvidia-vipe
3
+ Version: 0.1.1
4
+ Summary: NVIDIA Video Pose Engine
5
+ Author-email: The ViPE Authors <jiahuih@nvidia.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://research.nvidia.com/labs/toronto-ai/vipe
8
+ Project-URL: Repository, https://github.com/nv-tlabs/vipe
9
+ Project-URL: Paper, https://research.nvidia.com/labs/toronto-ai/vipe/assets/paper.pdf
10
+ Keywords: computer-vision,depth-estimation,nvidia,pose-estimation,video
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Scientific/Engineering :: Image Processing
22
+ Requires-Python: <3.15,>=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: torch==2.9.0
26
+ Requires-Dist: torchvision==0.24.0
27
+ Requires-Dist: click
28
+ Requires-Dist: einops
29
+ Requires-Dist: gdown
30
+ Requires-Dist: huggingface_hub
31
+ Requires-Dist: hydra-core
32
+ Requires-Dist: imageio[ffmpeg]
33
+ Requires-Dist: kornia
34
+ Requires-Dist: matplotlib
35
+ Requires-Dist: ninja
36
+ Requires-Dist: numpy
37
+ Requires-Dist: omegaconf
38
+ Requires-Dist: opencv-python
39
+ Requires-Dist: OpenEXR<3.3.0
40
+ Requires-Dist: pydantic<3,>=2
41
+ Requires-Dist: pillow
42
+ Requires-Dist: python-pycg
43
+ Requires-Dist: ray
44
+ Requires-Dist: rerun-sdk
45
+ Requires-Dist: safetensors
46
+ Requires-Dist: scipy
47
+ Requires-Dist: timm
48
+ Requires-Dist: tqdm
49
+ Requires-Dist: transformers<5,>=4
50
+ Requires-Dist: viser
51
+ Dynamic: license-file
52
+
53
+ # ViPE: Video Pose Engine for Geometric 3D Perception
54
+
55
+ <p align="center">
56
+ <img src="assets/teaser.gif" alt="teaser"/>
57
+ </p>
58
+
59
+ **TL;DR: ViPE is a useful open-source spatial AI tool for annotating camera poses and dense depth maps from raw videos!**
60
+
61
+ **Contributors**: NVIDIA (Spatial Intelligence Lab, Dynamic Vision Lab, NVIDIA Isaac, NVIDIA Research).
62
+
63
+ **Full Abstract**: Accurate 3D geometric perception is an important prerequisite for a wide range of spatial AI systems. While state-of-the-art methods depend on large-scale training data, acquiring consistent and precise 3D annotations from in-the-wild videos remains a key challenge. In this work, we introduce ViPE, a handy and versatile video processing engine designed to bridge this gap. ViPE efficiently estimates camera intrinsics, camera motion, and dense, near-metric depth maps from unconstrained raw videos. It is robust to diverse scenarios, including dynamic selfie videos, cinematic shots, or dashcams, and supports various camera models such as pinhole, wide-angle, and 360° panoramas.
64
+ We use ViPE to annotate a large-scale collection of videos. This collection includes around 100K real-world internet videos, 1M high-quality AI-generated videos, and 2K panoramic videos, totaling approximately 96M frames -- all annotated with accurate camera poses and dense depth maps. We open-source ViPE and the annotated dataset in the hope of accelerating the development of spatial AI systems.
65
+
66
+ **[Technical Whitepaper](https://research.nvidia.com/labs/toronto-ai/vipe/assets/paper.pdf), [Project Page](https://research.nvidia.com/labs/toronto-ai/vipe), [Dataset](#downloading-the-dataset)**
67
+
68
+ ## Installation
69
+
70
+ ### Installing From PyPI
71
+
72
+ ```bash
73
+ pip install nvidia-vipe
74
+ ```
75
+
76
+ This installs the `vipe` Python package and the `vipe` CLI. ViPE builds native CUDA extensions during installation, so the environment still needs a CUDA-enabled PyTorch build and an available CUDA toolkit with `nvcc`.
77
+ If a compatible binary wheel is available for your platform, pip will use it directly and skip the local CUDA build.
78
+
79
+ ### Installing From Source
80
+
81
+ To keep native and Python dependencies separate, we use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) for the CUDA/native toolchain and [uv](https://docs.astral.sh/uv/) for the local Python environment in `.venv`.
82
+
83
+ ```bash
84
+ # Create a conda environment for uv, CUDA, and native build dependencies.
85
+ conda env create -f envs/cu128.yml
86
+ conda activate cu128
87
+
88
+ # Create .venv, install Python runtime dependencies, and build the package.
89
+ uv sync
90
+ ```
91
+
92
+ For development, include the `dev` dependency group:
93
+
94
+ ```bash
95
+ conda activate cu128
96
+ uv sync --dev
97
+
98
+ uv run --dev pre-commit install
99
+ uv run --dev ruff format .
100
+ uv run --dev ruff check .
101
+ uv run --dev mypy
102
+ ```
103
+
104
+ ## Usage
105
+
106
+ ### Using the ViPE CLI
107
+
108
+ Once the Python package is installed, you can use the `vipe` CLI to process raw videos in mp4 format.
109
+
110
+ ```bash
111
+ # Replace YOUR_VIDEO.mp4 with the path to your video. We provide sample videos in assets/examples.
112
+ uv run vipe infer YOUR_VIDEO.mp4
113
+ # Additional options:
114
+ # --output: Output directory (default: vipe_results)
115
+ # --visualize: Enable visualization of intermediate and final results (default: false)
116
+ # --pipeline: Pipeline configuration to use (default: default)
117
+ ```
118
+
119
+ ![vipe-vis](assets/vipe-vis.gif)
120
+
121
+ Currently, we support the following pipeline configurations:
122
+ - `default`: The default pipeline for pinhole cameras.
123
+ - `lyra`: Configuration for results in the [Lyra](https://github.com/nv-tlabs/lyra) paper.
124
+ - `dav3`: Uses the newest Depth-Anything-V3 model as the depth estimation model.
125
+ - `no_vda`: If running video-depth-anything is too memory-consuming for you, this configuration produces depth maps that are less temporally stable (but empirically more 3D-consistent).
126
+ - `wide_angle`: Use this configuration if your video contains wide-angle or fisheye distortion.
127
+ - `panorama`: For 360° videos, add `pipeline.post.depth_align_model=dap` (MIT) or `unik3d` (CC-BY-NC 4.0) to enable depth estimation for panoramas.
128
+
129
+ One can visualize the results that ViPE produces by running (supported by `viser`):
130
+ ```bash
131
+ uv run vipe visualize vipe_results/
132
+ # Please modify the above vipe_results/ path to the output directory of your choice.
133
+ ```
134
+
135
+ ![vipe-viser](assets/vipe-viser.gif)
136
+
137
+ ### Using the `run.py` script
138
+
139
+ The `run.py` script is a more flexible way to run ViPE. Compared to the CLI, the script supports running on multiple videos at once and allows more fine-grained control over the pipeline with `hydra` configs. It also provides an example of using `vipe` as a library in your own project.
140
+
141
+ Example usages:
142
+
143
+ ```bash
144
+ # Running the full pipeline.
145
+ uv run python run.py pipeline=default streams=raw_mp4_stream streams.base_path=YOUR_VIDEO_OR_DIR_PATH
146
+
147
+ # Running the pose-only pipeline without depth estimation.
148
+ uv run python run.py pipeline=default streams=raw_mp4_stream streams.base_path=YOUR_VIDEO_OR_DIR_PATH pipeline.post.depth_align_model=null
149
+ ```
150
+
151
+ ### Converting to COLMAP format
152
+
153
+ You can use the following script to convert the ViPE results to COLMAP format. For example:
154
+ ```bash
155
+ uv run python scripts/vipe_to_colmap.py vipe_results/ --sequence dog_example
156
+ ```
157
+ This will unproject the dense depth maps to create the 3D point cloud.
158
+ Alternatively, for a more lightweight and 3D-consistent point cloud, you can add the `--use_slam_map` flag to the above command. This requires you to run the full pipeline with `pipeline.output.save_slam_map=true` to save the additional information.
159
+
160
+ ## Downloading the Dataset
161
+
162
+ ![dataset](assets/dataset.gif)
163
+
164
+ Together with ViPE we release a large-scale dataset containing ~1M high-quality videos with accurate camera poses and dense depth maps. Specifications of the datasets are listed below:
165
+
166
+ | Dataset Name | # Videos | # Frames | Hugging Face Link | License | Prefix |
167
+ | -------------- | -------- | -------- | ------------------------------------------------------------ | ------------ | ------ |
168
+ | Dynpose-100K++ | 99,501 | 15.8M | [Link](https://huggingface.co/datasets/nvidia/vipe-dynpose-100kpp) | CC-BY-NC 4.0 | `dpsp` |
169
+ | Wild-SDG-1M | 966,448 | 78.2M | [Link](https://huggingface.co/datasets/nvidia/vipe-wild-sdg-1m) | CC-BY-NC 4.0 | `wsdg` |
170
+ | Web360 | 2,114 | 212K | [Link](https://huggingface.co/datasets/nvidia/vipe-web360) | CC-BY 4.0 | `w360` |
171
+
172
+ You can download the datasets using the following utility script:
173
+
174
+ ```bash
175
+ # Replace YOUR_PREFIX with the prefix of the dataset to be downloaded (see prefix column in the table above)
176
+ # You can also use more specific prefixes, e.g. wsdg-003e2c86 to download a specific shard of the dataset.
177
+ uv run python scripts/download_dataset.py --prefix YOUR_PREFIX --output_base YOUR_OUTPUT_DIR --rgb --depth
178
+ ```
179
+
180
+ > Note that the depth component is very large, so you should expect a long download time. For the `rgb` component of the Dynpose-100K++ dataset, we directly retrieve the RGB frames from YouTube. You have to `pip install yt_dlp ffmpeg-python` to use this feature. Please refer to the original [Dynpose-100K dataset](https://huggingface.co/datasets/nvidia/dynpose-100k) for alternative approaches to retrieve the videos.
181
+
182
+ The dataset itself can be visualized using the same visualization script:
183
+ ```bash
184
+ uv run vipe visualize YOUR_OUTPUT_DIR
185
+ ```
186
+
187
+ ## Acknowledgments
188
+
189
+ ViPE is built on top of many great open-source research projects and codebases. Some of these include (not exhaustive):
190
+ - [DROID-SLAM](https://github.com/princeton-vl/DROID-SLAM)
191
+ - [Depth Anything V2](https://github.com/DepthAnything/Depth-Anything-V2)
192
+ - [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3)
193
+ - [Metric3Dv2](https://github.com/YvanYin/Metric3D)
194
+ - [PriorDA](https://github.com/SpatialVision/Prior-Depth-Anything)
195
+ - [UniDepth](https://github.com/lpiccinelli-eth/UniDepth)
196
+ - [UniK3D](https://github.com/lpiccinelli-eth/UniK3D)
197
+ - [VideoDepthAnything](https://github.com/DepthAnything/Video-Depth-Anything)
198
+ - [GeoCalib](https://github.com/cvg/GeoCalib)
199
+ - [Segment and Track Anything](https://github.com/z-x-yang/Segment-and-Track-Anything)
200
+
201
+ Please refer to the [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md) for a full list of projects and their licenses.
202
+
203
+ We thank Aigul Dzhumamuratova, Viktor Kuznetsov, Soha Pouya, and Ming-Yu Liu for useful discussions, and Vishal Kulkarni for release support.
204
+
205
+ ## TODO
206
+
207
+ - [x] Initial code released under Apache 2.0 license.
208
+ - [x] Full dataset uploaded to Hugging Face for download.
209
+ - [x] Add instructions to run inference on wide-angle and 360° videos.
210
+ - [ ] Add instructions for benchmarking.
211
+
212
+ ## Citation
213
+
214
+ If you find ViPE useful in your research or application, please consider citing the following whitepaper:
215
+
216
+ ```
217
+ @inproceedings{huang2025vipe,
218
+ title={ViPE: Video Pose Engine for 3D Geometric Perception},
219
+ author={Huang, Jiahui and Zhou, Qunjie and Rabeti, Hesam and Korovko, Aleksandr and Ling, Huan and Ren, Xuanchi and Shen, Tianchang and Gao, Jun and Slepichev, Dmitry and Lin, Chen-Hsuan and Ren, Jiawei and Xie, Kevin and Biswas, Joydeep and Leal-Taixe, Laura and Fidler, Sanja},
220
+ booktitle={NVIDIA Research Whitepapers arXiv:2508.10934},
221
+ year={2025}
222
+ }
223
+ ```
224
+
225
+ ## License
226
+
227
+ This project will download and install additional third-party **models and software**. Note that these models and software are not distributed by NVIDIA. Review the license terms of these models and projects before use. This source code, **except for the UniK3D part (which is under the BY-NC-SA 4.0 license)**, is released under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
@@ -0,0 +1,175 @@
1
+ # ViPE: Video Pose Engine for 3D Geometric Perception
2
+
3
+ <p align="center">
4
+ <img src="assets/teaser.gif" alt="teaser"/>
5
+ </p>
6
+
7
+ **TL;DR: ViPE is a useful open-source spatial AI tool for annotating camera poses and dense depth maps from raw videos!**
8
+
9
+ **Contributors**: NVIDIA (Spatial Intelligence Lab, Dynamic Vision Lab, NVIDIA Isaac, NVIDIA Research).
10
+
11
+ **Full Abstract**: Accurate 3D geometric perception is an important prerequisite for a wide range of spatial AI systems. While state-of-the-art methods depend on large-scale training data, acquiring consistent and precise 3D annotations from in-the-wild videos remains a key challenge. In this work, we introduce ViPE, a handy and versatile video processing engine designed to bridge this gap. ViPE efficiently estimates camera intrinsics, camera motion, and dense, near-metric depth maps from unconstrained raw videos. It is robust to diverse scenarios, including dynamic selfie videos, cinematic shots, or dashcams, and supports various camera models such as pinhole, wide-angle, and 360° panoramas.
12
+ We use ViPE to annotate a large-scale collection of videos. This collection includes around 100K real-world internet videos, 1M high-quality AI-generated videos, and 2K panoramic videos, totaling approximately 96M frames -- all annotated with accurate camera poses and dense depth maps. We open-source ViPE and the annotated dataset in the hope of accelerating the development of spatial AI systems.
13
+
14
+ **[Technical Whitepaper](https://research.nvidia.com/labs/toronto-ai/vipe/assets/paper.pdf), [Project Page](https://research.nvidia.com/labs/toronto-ai/vipe), [Dataset](#downloading-the-dataset)**
15
+
16
+ ## Installation
17
+
18
+ ### Installing From PyPI
19
+
20
+ ```bash
21
+ pip install nvidia-vipe
22
+ ```
23
+
24
+ This installs the `vipe` Python package and the `vipe` CLI. ViPE builds native CUDA extensions during installation, so the environment still needs a CUDA-enabled PyTorch build and an available CUDA toolkit with `nvcc`.
25
+ If a compatible binary wheel is available for your platform, pip will use it directly and skip the local CUDA build.
26
+
27
+ ### Installing From Source
28
+
29
+ To keep native and Python dependencies separate, we use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) for the CUDA/native toolchain and [uv](https://docs.astral.sh/uv/) for the local Python environment in `.venv`.
30
+
31
+ ```bash
32
+ # Create a conda environment for uv, CUDA, and native build dependencies.
33
+ conda env create -f envs/cu128.yml
34
+ conda activate cu128
35
+
36
+ # Create .venv, install Python runtime dependencies, and build the package.
37
+ uv sync
38
+ ```
39
+
40
+ For development, include the `dev` dependency group:
41
+
42
+ ```bash
43
+ conda activate cu128
44
+ uv sync --dev
45
+
46
+ uv run --dev pre-commit install
47
+ uv run --dev ruff format .
48
+ uv run --dev ruff check .
49
+ uv run --dev mypy
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ ### Using the ViPE CLI
55
+
56
+ Once the Python package is installed, you can use the `vipe` CLI to process raw videos in mp4 format.
57
+
58
+ ```bash
59
+ # Replace YOUR_VIDEO.mp4 with the path to your video. We provide sample videos in assets/examples.
60
+ uv run vipe infer YOUR_VIDEO.mp4
61
+ # Additional options:
62
+ # --output: Output directory (default: vipe_results)
63
+ # --visualize: Enable visualization of intermediate and final results (default: false)
64
+ # --pipeline: Pipeline configuration to use (default: default)
65
+ ```
66
+
67
+ ![vipe-vis](assets/vipe-vis.gif)
68
+
69
+ Currently, we support the following pipeline configurations:
70
+ - `default`: The default pipeline for pinhole cameras.
71
+ - `lyra`: Configuration for results in the [Lyra](https://github.com/nv-tlabs/lyra) paper.
72
+ - `dav3`: Uses the newest Depth Anything 3 model as the depth estimation model.
73
+ - `no_vda`: If running video-depth-anything is too memory-consuming for you, use this configuration instead; it produces depth maps that are less temporally stable (but empirically more 3D-consistent).
74
+ - `wide_angle`: If your video contains some wide-angle or fisheye distortion.
75
+ - `panorama`: For 360° videos, add `pipeline.post.depth_align_model=dap` (MIT) or `unik3d` (CC-BY-NC 4.0) to enable depth estimation for panoramas.
76
+
77
+ One can visualize the results that ViPE produces by running (supported by `viser`):
78
+ ```bash
79
+ uv run vipe visualize vipe_results/
80
+ # Please modify the above vipe_results/ path to the output directory of your choice.
81
+ ```
82
+
83
+ ![vipe-viser](assets/vipe-viser.gif)
84
+
85
+ ### Using the `run.py` script
86
+
87
+ The `run.py` script is a more flexible way to run ViPE. Compared to the CLI, the script supports running on multiple videos at once and allows more fine-grained control over the pipeline with `hydra` configs. It also provides an example of using `vipe` as a library in your own project.
88
+
89
+ Example usages:
90
+
91
+ ```bash
92
+ # Running the full pipeline.
93
+ uv run python run.py pipeline=default streams=raw_mp4_stream streams.base_path=YOUR_VIDEO_OR_DIR_PATH
94
+
95
+ # Running the pose-only pipeline without depth estimation.
96
+ uv run python run.py pipeline=default streams=raw_mp4_stream streams.base_path=YOUR_VIDEO_OR_DIR_PATH pipeline.post.depth_align_model=null
97
+ ```
98
+
99
+ ### Converting to COLMAP format
100
+
101
+ You can use the following script to convert the ViPE results to COLMAP format. For example:
102
+ ```bash
103
+ uv run python scripts/vipe_to_colmap.py vipe_results/ --sequence dog_example
104
+ ```
105
+ This will unproject the dense depth maps to create the 3D point cloud.
106
+ Alternatively, for a more lightweight and 3D-consistent point cloud, you can add the `--use_slam_map` flag to the above command. This requires you to run the full pipeline with `pipeline.output.save_slam_map=true` to save the additional information.
107
+
108
+ ## Downloading the Dataset
109
+
110
+ ![dataset](assets/dataset.gif)
111
+
112
+ Together with ViPE we release a large-scale dataset containing ~1M high-quality videos with accurate camera poses and dense depth maps. Specifications of the datasets are listed below:
113
+
114
+ | Dataset Name | # Videos | # Frames | Hugging Face Link | License | Prefix |
115
+ | -------------- | -------- | -------- | ------------------------------------------------------------ | ------------ | ------ |
116
+ | Dynpose-100K++ | 99,501 | 15.8M | [Link](https://huggingface.co/datasets/nvidia/vipe-dynpose-100kpp) | CC-BY-NC 4.0 | `dpsp` |
117
+ | Wild-SDG-1M | 966,448 | 78.2M | [Link](https://huggingface.co/datasets/nvidia/vipe-wild-sdg-1m) | CC-BY-NC 4.0 | `wsdg` |
118
+ | Web360 | 2,114 | 212K | [Link](https://huggingface.co/datasets/nvidia/vipe-web360) | CC-BY 4.0 | `w360` |
119
+
120
+ You can download the datasets using the following utility script:
121
+
122
+ ```bash
123
+ # Replace YOUR_PREFIX with the prefix of the dataset to be downloaded (see prefix column in the table above)
124
+ # You can also use more specific prefixes, e.g. wsdg-003e2c86 to download a specific shard of the dataset.
125
+ uv run python scripts/download_dataset.py --prefix YOUR_PREFIX --output_base YOUR_OUTPUT_DIR --rgb --depth
126
+ ```
127
+
128
+ > Note that the depth component is very large, so you should expect a long download time. For the `rgb` component of the Dynpose-100K++ dataset, we directly retrieve the RGB frames from YouTube. You have to `pip install yt_dlp ffmpeg-python` to use this feature. Please refer to the original [Dynpose-100K dataset](https://huggingface.co/datasets/nvidia/dynpose-100k) for alternative approaches to retrieve the videos.
129
+
130
+ The dataset itself can be visualized using the same visualization script:
131
+ ```bash
132
+ uv run vipe visualize YOUR_OUTPUT_DIR
133
+ ```
134
+
135
+ ## Acknowledgments
136
+
137
+ ViPE is built on top of many great open-source research projects and codebases. Some of these include (not exhaustive):
138
+ - [DROID-SLAM](https://github.com/princeton-vl/DROID-SLAM)
139
+ - [Depth Anything V2](https://github.com/DepthAnything/Depth-Anything-V2)
140
+ - [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3)
141
+ - [Metric3Dv2](https://github.com/YvanYin/Metric3D)
142
+ - [PriorDA](https://github.com/SpatialVision/Prior-Depth-Anything)
143
+ - [UniDepth](https://github.com/lpiccinelli-eth/UniDepth)
144
+ - [UniK3D](https://github.com/lpiccinelli-eth/UniK3D)
145
+ - [VideoDepthAnything](https://github.com/DepthAnything/Video-Depth-Anything)
146
+ - [GeoCalib](https://github.com/cvg/GeoCalib)
147
+ - [Segment and Track Anything](https://github.com/z-x-yang/Segment-and-Track-Anything)
148
+
149
+ Please refer to the [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md) for a full list of projects and their licenses.
150
+
151
+ We thank Aigul Dzhumamuratova, Viktor Kuznetsov, Soha Pouya, and Ming-Yu Liu for useful discussions, and Vishal Kulkarni for release support.
152
+
153
+ ## TODO
154
+
155
+ - [x] Initial code released under Apache 2.0 license.
156
+ - [x] Full dataset uploaded to Hugging Face for download.
157
+ - [x] Add instructions to run inference on wide-angle and 360° videos.
158
+ - [ ] Add instructions for benchmarking.
159
+
160
+ ## Citation
161
+
162
+ If you find ViPE useful in your research or application, please consider citing the following whitepaper:
163
+
164
+ ```
165
+ @inproceedings{huang2025vipe,
166
+ title={ViPE: Video Pose Engine for 3D Geometric Perception},
167
+ author={Huang, Jiahui and Zhou, Qunjie and Rabeti, Hesam and Korovko, Aleksandr and Ling, Huan and Ren, Xuanchi and Shen, Tianchang and Gao, Jun and Slepichev, Dmitry and Lin, Chen-Hsuan and Ren, Jiawei and Xie, Kevin and Biswas, Joydeep and Leal-Taixe, Laura and Fidler, Sanja},
168
+ booktitle={NVIDIA Research Whitepapers arXiv:2508.10934},
169
+ year={2025}
170
+ }
171
+ ```
172
+
173
+ ## License
174
+
175
+ This project will download and install additional third-party **models and software**. Note that these models and software are not distributed by NVIDIA. Review the license terms of these models and projects before use. This source code, **except for the UniK3D part (which is under the BY-NC-SA 4.0 license)**, is released under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).