PyPI - warp-lang - Versions diffs - 1.6.2__py3-none-macosx_10_13_universal2.whl → 1.7.0__py3-none-macosx_10_13_universal2.whl - Mend

warp-lang 1.6.2__py3-none-macosx_10_13_universal2.whl → 1.7.0__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (179)

warp/__init__.py +7 -1
warp/bin/libwarp-clang.dylib +0 -0
warp/bin/libwarp.dylib +0 -0
warp/build.py +410 -0
warp/build_dll.py +6 -14
warp/builtins.py +452 -362
warp/codegen.py +179 -119
warp/config.py +42 -6
warp/context.py +490 -271
warp/dlpack.py +8 -6
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +2 -2
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_magnetostatics.py +6 -6
warp/examples/fem/utils.py +9 -3
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/tile/example_tile_matmul.py +2 -4
warp/fem/__init__.py +11 -1
warp/fem/adaptivity.py +4 -4
warp/fem/field/nodal_field.py +22 -68
warp/fem/field/virtual.py +62 -23
warp/fem/geometry/adaptive_nanogrid.py +9 -10
warp/fem/geometry/closest_point.py +1 -1
warp/fem/geometry/deformed_geometry.py +5 -2
warp/fem/geometry/geometry.py +5 -0
warp/fem/geometry/grid_2d.py +12 -12
warp/fem/geometry/grid_3d.py +12 -15
warp/fem/geometry/hexmesh.py +5 -7
warp/fem/geometry/nanogrid.py +9 -11
warp/fem/geometry/quadmesh.py +13 -13
warp/fem/geometry/tetmesh.py +3 -4
warp/fem/geometry/trimesh.py +3 -8
warp/fem/integrate.py +262 -93
warp/fem/linalg.py +5 -5
warp/fem/quadrature/pic_quadrature.py +37 -22
warp/fem/quadrature/quadrature.py +194 -25
warp/fem/space/__init__.py +1 -1
warp/fem/space/basis_function_space.py +4 -2
warp/fem/space/basis_space.py +25 -18
warp/fem/space/hexmesh_function_space.py +2 -2
warp/fem/space/partition.py +6 -2
warp/fem/space/quadmesh_function_space.py +8 -8
warp/fem/space/shape/cube_shape_function.py +23 -23
warp/fem/space/shape/square_shape_function.py +12 -12
warp/fem/space/shape/triangle_shape_function.py +1 -1
warp/fem/space/tetmesh_function_space.py +3 -3
warp/fem/space/trimesh_function_space.py +2 -2
warp/fem/utils.py +12 -6
warp/jax.py +14 -1
warp/jax_experimental/__init__.py +16 -0
warp/{jax_experimental.py → jax_experimental/custom_call.py} +14 -27
warp/jax_experimental/ffi.py +698 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +89 -0
warp/native/array.h +13 -0
warp/native/builtin.h +29 -3
warp/native/bvh.cpp +3 -1
warp/native/bvh.cu +42 -14
warp/native/bvh.h +2 -1
warp/native/clang/clang.cpp +30 -3
warp/native/cuda_util.cpp +14 -0
warp/native/cuda_util.h +2 -0
warp/native/exports.h +68 -63
warp/native/intersect.h +26 -26
warp/native/intersect_adj.h +33 -33
warp/native/marching.cu +1 -1
warp/native/mat.h +513 -9
warp/native/mesh.h +10 -10
warp/native/quat.h +99 -11
warp/native/rand.h +6 -0
warp/native/sort.cpp +122 -59
warp/native/sort.cu +152 -15
warp/native/sort.h +8 -1
warp/native/sparse.cpp +43 -22
warp/native/sparse.cu +52 -17
warp/native/svd.h +116 -0
warp/native/tile.h +301 -105
warp/native/tile_reduce.h +46 -3
warp/native/vec.h +68 -7
warp/native/volume.cpp +85 -113
warp/native/volume_builder.cu +25 -10
warp/native/volume_builder.h +6 -0
warp/native/warp.cpp +5 -6
warp/native/warp.cu +99 -10
warp/native/warp.h +19 -10
warp/optim/linear.py +10 -10
warp/sim/articulation.py +4 -4
warp/sim/collide.py +21 -10
warp/sim/import_mjcf.py +449 -155
warp/sim/import_urdf.py +32 -12
warp/sim/integrator_euler.py +5 -5
warp/sim/integrator_featherstone.py +3 -10
warp/sim/integrator_vbd.py +207 -2
warp/sim/integrator_xpbd.py +5 -5
warp/sim/model.py +42 -13
warp/sim/utils.py +2 -2
warp/sparse.py +642 -555
warp/stubs.py +216 -19
warp/tests/__main__.py +0 -15
warp/tests/cuda/__init__.py +0 -0
warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
warp/tests/geometry/__init__.py +0 -0
warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
warp/tests/interop/__init__.py +0 -0
warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
warp/tests/sim/__init__.py +0 -0
warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
warp/tests/{test_collision.py → sim/test_collision.py} +2 -2
warp/tests/{test_model.py → sim/test_model.py} +40 -0
warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
warp/tests/sim/test_vbd.py +597 -0
warp/tests/test_bool.py +1 -1
warp/tests/test_examples.py +28 -36
warp/tests/test_fem.py +23 -4
warp/tests/test_linear_solvers.py +0 -11
warp/tests/test_mat.py +233 -79
warp/tests/test_mat_scalar_ops.py +4 -4
warp/tests/test_overwrite.py +0 -60
warp/tests/test_quat.py +67 -46
warp/tests/test_rand.py +44 -37
warp/tests/test_sparse.py +47 -6
warp/tests/test_spatial.py +75 -0
warp/tests/test_static.py +1 -1
warp/tests/test_utils.py +84 -4
warp/tests/test_vec.py +46 -34
warp/tests/tile/__init__.py +0 -0
warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
warp/tests/{test_tile_load.py → tile/test_tile_load.py} +1 -1
warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
warp/tests/unittest_serial.py +1 -0
warp/tests/unittest_suites.py +45 -59
warp/tests/unittest_utils.py +2 -1
warp/thirdparty/unittest_parallel.py +3 -1
warp/types.py +110 -658
warp/utils.py +137 -72
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/METADATA +29 -7
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/RECORD +172 -162
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
warp/examples/optim/example_walker.py +0 -317
warp/native/cutlass_gemm.cpp +0 -43
warp/native/cutlass_gemm.cu +0 -382
warp/tests/test_matmul.py +0 -511
warp/tests/test_matmul_lite.py +0 -411
warp/tests/test_vbd.py +0 -386
warp/tests/unused_test_misc.py +0 -77
/warp/tests/{test_async.py → cuda/test_async.py} +0 -0
/warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
/warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
/warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
/warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
/warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
/warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
/warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
/warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
/warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
/warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
/warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
/warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
/warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
/warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +0 -0
/warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
/warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
/warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info/licenses}/LICENSE.md +0 -0
{warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0

warp/native/volume_builder.cu CHANGED Viewed

@@ -267,11 +267,21 @@ __device__ std::enable_if_t<nanovdb::BuildTraits<typename Node::BuildType>::is_i
 {
 }
+template <typename T>
+struct alignas(alignof(T)) AlignedProxy
+{
+    char data[sizeof(T)];
+};
 template <typename Tree, typename NodeT>
 __global__ void setInternalBBoxAndBackgroundValue(Tree *tree, const typename Tree::BuildType background_value)
 {
     using BBox = nanovdb::math::BBox<typename NodeT::CoordT>;
-    __shared__ BBox bbox;
+    using BBoxProxy = AlignedProxy<BBox>;
+    __shared__ BBoxProxy bbox_mem;
+    BBox& bbox = reinterpret_cast<BBox&>(bbox_mem);
     const unsigned node_count = tree->mNodeCount[NodeT::LEVEL];
     const unsigned node_id = blockIdx.x;
@@ -281,7 +291,7 @@ __global__ void setInternalBBoxAndBackgroundValue(Tree *tree, const typename Tre
         if (threadIdx.x == 0)
         {
-            bbox = BBox();
+            new(&bbox) BBox();
         }
         __syncthreads();
@@ -313,14 +323,17 @@ __global__ void setRootBBoxAndBackgroundValue(nanovdb::Grid<Tree> *grid,
                                               const typename Tree::BuildType background_value)
 {
     using BBox = typename Tree::RootNodeType::BBoxType;
-    __shared__ BBox bbox;
+    using BBoxProxy = AlignedProxy<BBox>;
+    __shared__ BBoxProxy bbox_mem;
+    BBox& bbox = reinterpret_cast<BBox&>(bbox_mem);
     Tree &tree = grid->tree();
     const unsigned upper_count = tree.mNodeCount[2];
     if (threadIdx.x == 0)
     {
-        bbox = BBox();
+        new(&bbox) BBox();
     }
     __syncthreads();
@@ -450,12 +463,14 @@ void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<BuildT>> *&out_grid,
     grid_handle.buffer().detachDeviceData();
 }
-template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<float>> *&, size_t &, const void *, size_t, bool,
-                                     const BuildGridParams<float> &);
-template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::Vec3f>> *&, size_t &, const void *,
-                                     size_t, bool, const BuildGridParams<nanovdb::Vec3f> &);
-template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<int32_t>> *&, size_t &, const void *, size_t, bool,
-                                     const BuildGridParams<int32_t> &);
+#define EXPAND_BUILDER_TYPE(type) \
+template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<type>> *&, size_t &, const void *, size_t, bool, \
+                                     const BuildGridParams<type> &);
+WP_VOLUME_BUILDER_INSTANTIATE_TYPES
+#undef EXPAND_BUILDER_TYPE
 template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::ValueIndex>> *&, size_t &, const void *,
                                      size_t, bool, const BuildGridParams<nanovdb::ValueIndex> &);
 template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::ValueOnIndex>> *&, size_t &, const void *,

warp/native/volume_builder.h CHANGED Viewed

@@ -19,6 +19,12 @@
 #include <nanovdb/NanoVDB.h>
+#define WP_VOLUME_BUILDER_INSTANTIATE_TYPES                                                                            \
+    EXPAND_BUILDER_TYPE(int32_t)                                                                                       \
+    EXPAND_BUILDER_TYPE(float)                                                                                         \
+    EXPAND_BUILDER_TYPE(nanovdb::Vec3f)                                                                                \
+    EXPAND_BUILDER_TYPE(nanovdb::Vec4f)                                                                                \
 template <typename BuildT> struct BuildGridParams
 {
     nanovdb::Map map;

warp/native/warp.cpp CHANGED Viewed

@@ -151,11 +151,6 @@ int is_cuda_compatibility_enabled()
     return int(WP_ENABLE_CUDA_COMPATIBILITY);
 }
-int is_cutlass_enabled()
-{
-    return int(WP_ENABLE_CUTLASS);
-}
 int is_mathdx_enabled()
 {
     return int(WP_ENABLE_MATHDX);
@@ -1004,6 +999,8 @@ WP_API int cuda_device_is_mempool_supported(int ordinal) { return 0; }
 WP_API int cuda_device_is_ipc_supported(int ordinal) { return 0; }
 WP_API int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold) { return 0; }
 WP_API uint64_t cuda_device_get_mempool_release_threshold(int ordinal) { return 0; }
+WP_API uint64_t cuda_device_get_mempool_used_mem_current(int ordinal) { return 0; }
+WP_API uint64_t cuda_device_get_mempool_used_mem_high(int ordinal) { return 0; }
 WP_API void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem) {}
 WP_API void* cuda_context_get_current() { return NULL; }
@@ -1033,6 +1030,7 @@ WP_API void* cuda_ipc_open_event_handle(void* context, char* handle) { return NU
 WP_API void* cuda_stream_create(void* context, int priority) { return NULL; }
 WP_API void cuda_stream_destroy(void* context, void* stream) {}
+WP_API int cuda_stream_query(void* stream) { return 0; }
 WP_API void cuda_stream_register(void* context, void* stream) {}
 WP_API void cuda_stream_unregister(void* context, void* stream) {}
 WP_API void* cuda_stream_get_current() { return NULL; }
@@ -1045,7 +1043,8 @@ WP_API int cuda_stream_get_priority(void* stream) { return 0; }
 WP_API void* cuda_event_create(void* context, unsigned flags) { return NULL; }
 WP_API void cuda_event_destroy(void* event) {}
-WP_API void cuda_event_record(void* event, void* stream) {}
+WP_API int cuda_event_query(void* event) { return 0; }
+WP_API void cuda_event_record(void* event, void* stream, bool timing) {}
 WP_API void cuda_event_synchronize(void* event) {}
 WP_API float cuda_event_elapsed_time(void* start_event, void* end_event) { return 0.0f; }

warp/native/warp.cu CHANGED Viewed

@@ -1888,6 +1888,62 @@ uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
     return threshold;
 }
+uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
+{
+    if (ordinal < 0 || ordinal > int(g_devices.size()))
+    {
+        fprintf(stderr, "Invalid device ordinal %d\n", ordinal);
+        return 0;
+    }
+    if (!g_devices[ordinal].is_mempool_supported)
+        return 0;
+    cudaMemPool_t pool;
+    if (!check_cuda(cudaDeviceGetDefaultMemPool(&pool, ordinal)))
+    {
+        fprintf(stderr, "Warp error: Failed to get memory pool on device %d\n", ordinal);
+        return 0;
+    }
+    uint64_t mem_used = 0;
+    if (!check_cuda(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &mem_used)))
+    {
+        fprintf(stderr, "Warp error: Failed to get amount of currently used memory from the memory pool on device %d\n", ordinal);
+        return 0;
+    }
+    return mem_used;
+}
+uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
+{
+    if (ordinal < 0 || ordinal > int(g_devices.size()))
+    {
+        fprintf(stderr, "Invalid device ordinal %d\n", ordinal);
+        return 0;
+    }
+    if (!g_devices[ordinal].is_mempool_supported)
+        return 0;
+    cudaMemPool_t pool;
+    if (!check_cuda(cudaDeviceGetDefaultMemPool(&pool, ordinal)))
+    {
+        fprintf(stderr, "Warp error: Failed to get memory pool on device %d\n", ordinal);
+        return 0;
+    }
+    uint64_t mem_high_water_mark = 0;
+    if (!check_cuda(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemHigh, &mem_high_water_mark)))
+    {
+        fprintf(stderr, "Warp error: Failed to get memory usage high water mark from the memory pool on device %d\n", ordinal);
+        return 0;
+    }
+    return mem_high_water_mark;
+}
 void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
 {
     // use temporary storage if user didn't specify pointers
@@ -2371,6 +2427,19 @@ void cuda_stream_destroy(void* context, void* stream)
     check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
 }
+int cuda_stream_query(void* stream)
+{
+    CUresult res =  cuStreamQuery_f(static_cast<CUstream>(stream));
+    if ((res != CUDA_SUCCESS) && (res != CUDA_ERROR_NOT_READY))
+    {
+        // Abnormal, print out error
+        check_cu(res);
+    }
+    return res;
+}
 void cuda_stream_register(void* context, void* stream)
 {
     if (!stream)
@@ -2465,9 +2534,30 @@ void cuda_event_destroy(void* event)
     check_cu(cuEventDestroy_f(static_cast<CUevent>(event)));
 }
-void cuda_event_record(void* event, void* stream)
+int cuda_event_query(void* event)
+{
+    CUresult res = cuEventQuery_f(static_cast<CUevent>(event));
+    if ((res != CUDA_SUCCESS) && (res != CUDA_ERROR_NOT_READY))
+    {
+        // Abnormal, print out error
+        check_cu(res);
+    }
+    return res;
+}
+void cuda_event_record(void* event, void* stream, bool timing)
 {
-    check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(stream)));
+    if (timing && !g_captures.empty() && cuda_stream_is_capturing(stream))
+    {
+        // record timing event during graph capture
+        check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
+    }
+    else
+    {
+        check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(stream)));
+    }
 }
 void cuda_event_synchronize(void* event)
@@ -2814,6 +2904,12 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
         opts.push_back("--define-macro=WP_VERIFY_FP");
     else
         opts.push_back("--undefine-macro=WP_VERIFY_FP");
+#if WP_ENABLE_MATHDX
+    opts.push_back("--define-macro=WP_ENABLE_MATHDX=1");
+#else
+    opts.push_back("--define-macro=WP_ENABLE_MATHDX=0");
+#endif
     if (fast_math)
         opts.push_back("--use_fast_math");
@@ -2823,10 +2919,6 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     else
         opts.push_back("--fmad=false");
-    char include_cutlass[max_path];
-    sprintf(include_cutlass, "--include-path=%s/cutlass/include", include_dir);
-    opts.push_back(include_cutlass);
     std::vector<std::string> cuda_include_opt;
     for(int i = 0; i < num_cuda_include_dirs; i++)
     {
@@ -3182,7 +3274,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
         std::vector<char> lto(lto_size);
         CHECK_CUSOLVER(cusolverGetLTOIR(h, lto.size(), lto.data()));
-        // This fatbin is universal, ie it is the same for any instantations of a cusolver device function
+        // This fatbin is universal, ie it is the same for any instantiations of a cusolver device function
         size_t fatbin_size = 0;
         CHECK_CUSOLVER(cusolverGetUniversalFATBINSize(h, &fatbin_size));
@@ -3539,9 +3631,6 @@ void cuda_timing_end(timing_result_t* results, int size)
 #include "sparse.cu"
 #include "volume.cu"
 #include "volume_builder.cu"
-#if WP_ENABLE_CUTLASS
-    #include "cutlass_gemm.cu"
-#endif
 //#include "spline.inl"
 //#include "volume.inl"

warp/native/warp.h CHANGED Viewed

@@ -41,8 +41,6 @@ extern "C"
     WP_API int is_cuda_enabled();
     // whether Warp was compiled with enhanced CUDA compatibility
     WP_API int is_cuda_compatibility_enabled();
-    // whether Warp was compiled with CUTLASS support
-    WP_API int is_cutlass_enabled();
     // whether Warp was compiled with MathDx support
     WP_API int is_mathdx_enabled();
     // whether Warp was compiled with debug support
@@ -112,10 +110,6 @@ extern "C"
     WP_API void hash_grid_destroy_device(uint64_t id);
     WP_API void hash_grid_update_device(uint64_t id, float cell_width, const wp::array_t<wp::vec3>* points);
-    WP_API bool cutlass_gemm(void* context, int compute_capability, int m, int n, int k, const char* datatype,
-                             const void* a, const void* b, const void* c, void* d, float alpha, float beta,
-                             bool row_major_a, bool row_major_b, bool allow_tf32x3_arith, int batch_count);
     WP_API uint64_t volume_create_host(void* buf, uint64_t size, bool copy, bool owner);
     WP_API void volume_get_tiles_host(uint64_t id, void* buf);
     WP_API void volume_get_voxels_host(uint64_t id, void* buf);
@@ -126,9 +120,7 @@ extern "C"
     WP_API void volume_get_voxels_device(uint64_t id, void* buf);
     WP_API void volume_destroy_device(uint64_t id);
-    WP_API uint64_t volume_f_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space, float bg_value);
-    WP_API uint64_t volume_v_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space, float bg_value[3]);
-    WP_API uint64_t volume_i_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space, int bg_value);
+    WP_API uint64_t volume_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space, const void* bg_value, uint32_t bg_value_size, const char* bg_value_type);
     WP_API uint64_t volume_index_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space);
     WP_API uint64_t volume_from_active_voxels_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space);
@@ -173,6 +165,15 @@ extern "C"
     WP_API void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n);
     WP_API void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n);
+    WP_API void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n);
+    WP_API void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n);
+    WP_API void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments);
+    WP_API void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments);
+    WP_API void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments);
+    WP_API void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments);
     WP_API void runlength_encode_int_host(uint64_t values, uint64_t run_values, uint64_t run_lengths, uint64_t run_count, int n);
     WP_API void runlength_encode_int_device(uint64_t values, uint64_t run_values, uint64_t run_lengths, uint64_t run_count, int n);
@@ -185,6 +186,7 @@ extern "C"
         int* tpl_columns,
         void* tpl_values,
         bool prune_numerical_zeros,
+        bool masked,
         int* bsr_offsets,
         int* bsr_columns,
         void* bsr_values,
@@ -199,6 +201,7 @@ extern "C"
         int* tpl_columns,
         void* tpl_values,
         bool prune_numerical_zeros,
+        bool masked,
         int* bsr_offsets,
         int* bsr_columns,
         void* bsr_values,
@@ -213,6 +216,7 @@ extern "C"
         int* tpl_columns,
         void* tpl_values,
         bool prune_numerical_zeros,
+        bool masked,
         int* bsr_offsets,
         int* bsr_columns,
         void* bsr_values,
@@ -227,6 +231,7 @@ extern "C"
         int* tpl_columns,
         void* tpl_values,
         bool prune_numerical_zeros,
+        bool masked,
         int* bsr_offsets,
         int* bsr_columns,
         void* bsr_values,
@@ -283,6 +288,8 @@ extern "C"
     WP_API int cuda_device_is_ipc_supported(int ordinal);
     WP_API int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold);
     WP_API uint64_t cuda_device_get_mempool_release_threshold(int ordinal);
+    WP_API uint64_t cuda_device_get_mempool_used_mem_current(int ordinal);
+    WP_API uint64_t cuda_device_get_mempool_used_mem_high(int ordinal);
     WP_API void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem);
     WP_API void* cuda_context_get_current();
@@ -318,6 +325,7 @@ extern "C"
     WP_API void* cuda_stream_create(void* context, int priority);
     WP_API void cuda_stream_destroy(void* context, void* stream);
+    WP_API int cuda_stream_query(void* stream);
     WP_API void cuda_stream_register(void* context, void* stream);
     WP_API void cuda_stream_unregister(void* context, void* stream);
     WP_API void* cuda_stream_get_current();
@@ -330,7 +338,8 @@ extern "C"
     WP_API void* cuda_event_create(void* context, unsigned flags);
     WP_API void cuda_event_destroy(void* event);
-    WP_API void cuda_event_record(void* event, void* stream);
+    WP_API int cuda_event_query(void* event);
+    WP_API void cuda_event_record(void* event, void* stream, bool timing=false);
     WP_API void cuda_event_synchronize(void* event);
     WP_API float cuda_event_elapsed_time(void* start_event, void* end_event);

warp/optim/linear.py CHANGED Viewed

@@ -866,7 +866,7 @@ def _diag_mv_vec_kernel(
 def _inverse_diag_coefficient(coeff: Any, use_abs: wp.bool):
     zero = type(coeff)(0.0)
     one = type(coeff)(1.0)
-    return wp.select(coeff == zero, one / wp.select(use_abs, coeff, wp.abs(coeff)), one)
+    return wp.where(coeff == zero, one, one / wp.where(use_abs, wp.abs(coeff), coeff))
 @wp.kernel
@@ -917,7 +917,7 @@ def _cg_kernel_1(
 ):
     i = wp.tid()
-    alpha = wp.select(resid[0] > tol, rz_old.dtype(0.0), rz_old[0] / p_Ap[0])
+    alpha = wp.where(resid[0] > tol, rz_old[0] / p_Ap[0], rz_old.dtype(0.0))
     x[i] = x[i] + alpha * p[i]
     r[i] = r[i] - alpha * Ap[i]
@@ -935,7 +935,7 @@ def _cg_kernel_2(
     #    p = r + (rz_new / rz_old) * p;
     i = wp.tid()
-    beta = wp.select(resid[0] > tol, rz_old.dtype(0.0), rz_new[0] / rz_old[0])
+    beta = wp.where(resid[0] > tol, rz_new[0] / rz_old[0], rz_old.dtype(0.0))
     p[i] = z[i] + beta * p[i]
@@ -955,7 +955,7 @@ def _cr_kernel_1(
 ):
     i = wp.tid()
-    alpha = wp.select(resid[0] > tol and y_Ap[0] > 0.0, zAz_old.dtype(0.0), zAz_old[0] / y_Ap[0])
+    alpha = wp.where(resid[0] > tol and y_Ap[0] > 0.0, zAz_old[0] / y_Ap[0], zAz_old.dtype(0.0))
     x[i] = x[i] + alpha * p[i]
     r[i] = r[i] - alpha * Ap[i]
@@ -976,7 +976,7 @@ def _cr_kernel_2(
     #    p = r + (rz_new / rz_old) * p;
     i = wp.tid()
-    beta = wp.select(resid[0] > tol and zAz_old[0] > 0.0, zAz_old.dtype(0.0), zAz_new[0] / zAz_old[0])
+    beta = wp.where(resid[0] > tol and zAz_old[0] > 0.0, zAz_new[0] / zAz_old[0], zAz_old.dtype(0.0))
     p[i] = z[i] + beta * p[i]
     Ap[i] = Az[i] + beta * Ap[i]
@@ -995,7 +995,7 @@ def _bicgstab_kernel_1(
 ):
     i = wp.tid()
-    alpha = wp.select(resid[0] > tol, rho_old.dtype(0.0), rho_old[0] / r0v[0])
+    alpha = wp.where(resid[0] > tol, rho_old[0] / r0v[0], rho_old.dtype(0.0))
     x[i] += alpha * y[i]
     r[i] -= alpha * v[i]
@@ -1014,7 +1014,7 @@ def _bicgstab_kernel_2(
 ):
     i = wp.tid()
-    omega = wp.select(resid[0] > tol, st.dtype(0.0), st[0] / tt[0])
+    omega = wp.where(resid[0] > tol, st[0] / tt[0], st.dtype(0.0))
     x[i] += omega * z[i]
     r[i] -= omega * t[i]
@@ -1034,8 +1034,8 @@ def _bicgstab_kernel_3(
 ):
     i = wp.tid()
-    beta = wp.select(resid[0] > tol, st.dtype(0.0), rho_new[0] * tt[0] / (r0v[0] * st[0]))
-    beta_omega = wp.select(resid[0] > tol, st.dtype(0.0), rho_new[0] / r0v[0])
+    beta = wp.where(resid[0] > tol, rho_new[0] * tt[0] / (r0v[0] * st[0]), st.dtype(0.0))
+    beta_omega = wp.where(resid[0] > tol, rho_new[0] / r0v[0], st.dtype(0.0))
     p[i] = r[i] + beta * p[i] - beta_omega * v[i]
@@ -1123,7 +1123,7 @@ def _gmres_arnoldi_normalize_kernel(
     alpha: wp.array(dtype=Any),
 ):
     tid = wp.tid()
-    y[tid] = wp.select(alpha[0] == alpha.dtype(0.0), x[tid] / wp.sqrt(alpha[0]), x[tid])
+    y[tid] = wp.where(alpha[0] == alpha.dtype(0.0), x[tid], x[tid] / wp.sqrt(alpha[0]))
 @wp.kernel

warp/sim/articulation.py CHANGED Viewed

@@ -30,7 +30,7 @@ def compute_2d_rotational_dofs(
     """
     Computes the rotation quaternion and 3D angular velocity given the joint axes, coordinates and velocities.
     """
-    q_off = wp.quat_from_matrix(wp.mat33(axis_0, axis_1, wp.cross(axis_0, axis_1)))
+    q_off = wp.quat_from_matrix(wp.matrix_from_cols(axis_0, axis_1, wp.cross(axis_0, axis_1)))
     # body local axes
     local_0 = wp.quat_rotate(q_off, wp.vec3(1.0, 0.0, 0.0))
@@ -60,7 +60,7 @@ def invert_2d_rotational_dofs(
     """
     Computes generalized joint position and velocity coordinates for a 2D rotational joint given the joint axes, relative orientations and angular velocity differences between the two bodies the joint connects.
     """
-    q_off = wp.quat_from_matrix(wp.mat33(axis_0, axis_1, wp.cross(axis_0, axis_1)))
+    q_off = wp.quat_from_matrix(wp.matrix_from_cols(axis_0, axis_1, wp.cross(axis_0, axis_1)))
     q_pc = wp.quat_inverse(q_off) * wp.quat_inverse(q_p) * q_c * q_off
     # decompose to a compound rotation each axis
@@ -106,7 +106,7 @@ def compute_3d_rotational_dofs(
     """
     Computes the rotation quaternion and 3D angular velocity given the joint axes, coordinates and velocities.
     """
-    q_off = wp.quat_from_matrix(wp.mat33(axis_0, axis_1, axis_2))
+    q_off = wp.quat_from_matrix(wp.matrix_from_cols(axis_0, axis_1, axis_2))
     # body local axes
     local_0 = wp.quat_rotate(q_off, wp.vec3(1.0, 0.0, 0.0))
@@ -136,7 +136,7 @@ def invert_3d_rotational_dofs(
     """
     Computes generalized joint position and velocity coordinates for a 3D rotational joint given the joint axes, relative orientations and angular velocity differences between the two bodies the joint connects.
     """
-    q_off = wp.quat_from_matrix(wp.mat33(axis_0, axis_1, axis_2))
+    q_off = wp.quat_from_matrix(wp.matrix_from_cols(axis_0, axis_1, axis_2))
     q_pc = wp.quat_inverse(q_off) * wp.quat_inverse(q_p) * q_c * q_off
     # decompose to a compound rotation each axis

warp/sim/collide.py CHANGED Viewed

@@ -17,10 +17,12 @@
 Collision handling functions and kernels.
 """
+from typing import Optional
 import numpy as np
 import warp as wp
-from warp.sim.model import Model
+from warp.sim.model import Model, State
 from .model import PARTICLE_FLAG_ACTIVE, ModelShapeGeometry
@@ -1556,17 +1558,23 @@ def handle_contact_pairs(
         contact_thickness[index] = thickness
-def collide(model, state, edge_sdf_iter: int = 10, iterate_mesh_vertices: bool = True, requires_grad: bool = None):
-    """
-    Generates contact points for the particles and rigid bodies in the model,
-    to be used in the contact dynamics kernel of the integrator.
+def collide(
+    model: Model,
+    state: State,
+    edge_sdf_iter: int = 10,
+    iterate_mesh_vertices: bool = True,
+    requires_grad: Optional[bool] = None,
+) -> None:
+    """Generate contact points for the particles and rigid bodies in the model for use in contact-dynamics kernels.
     Args:
-        model: the model to be simulated
-        state: the state of the model
-        edge_sdf_iter: number of search iterations for finding closest contact points between edges and SDF
-        iterate_mesh_vertices: whether to iterate over all vertices of a mesh for contact generation (used for capsule/box <> mesh collision)
-        requires_grad: whether to duplicate contact arrays for gradient computation (if None uses model.requires_grad)
+        model: The model to be simulated.
+        state: The state of the model.
+        edge_sdf_iter: Number of search iterations for finding closest contact points between edges and SDF.
+        iterate_mesh_vertices: Whether to iterate over all vertices of a mesh for contact generation
+            (used for capsule/box <> mesh collision).
+        requires_grad: Whether to duplicate contact arrays for gradient computation
+            (if ``None``, uses ``model.requires_grad``).
     """
     if requires_grad is None:
@@ -1685,13 +1693,16 @@ def collide(model, state, edge_sdf_iter: int = 10, iterate_mesh_vertices: bool =
                 model.rigid_contact_tids = wp.zeros_like(model.rigid_contact_tids)
                 model.rigid_contact_shape0 = wp.empty_like(model.rigid_contact_shape0)
                 model.rigid_contact_shape1 = wp.empty_like(model.rigid_contact_shape1)
                 if model.rigid_contact_pairwise_counter is not None:
                     model.rigid_contact_pairwise_counter = wp.zeros_like(model.rigid_contact_pairwise_counter)
             else:
                 model.rigid_contact_count.zero_()
                 model.rigid_contact_tids.zero_()
                 if model.rigid_contact_pairwise_counter is not None:
                     model.rigid_contact_pairwise_counter.zero_()
             model.rigid_contact_shape0.fill_(-1)
             model.rigid_contact_shape1.fill_(-1)