PyPI - warp-lang - Versions diffs - 1.6.2__py3-none-win_amd64.whl → 1.7.1__py3-none-win_amd64.whl - Mend

warp-lang 1.6.2__py3-none-win_amd64.whl → 1.7.1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (191)

warp/__init__.py +7 -1
warp/autograd.py +12 -2
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +410 -0
warp/build_dll.py +6 -14
warp/builtins.py +463 -372
warp/codegen.py +196 -124
warp/config.py +42 -6
warp/context.py +496 -271
warp/dlpack.py +8 -6
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/benchmarks/benchmark_cloth.py +1 -1
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/distributed/example_jacobi_mpi.py +507 -0
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +2 -2
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_magnetostatics.py +6 -6
warp/examples/fem/utils.py +9 -3
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/tile/example_tile_matmul.py +2 -4
warp/fem/__init__.py +11 -1
warp/fem/adaptivity.py +4 -4
warp/fem/field/field.py +11 -1
warp/fem/field/nodal_field.py +56 -88
warp/fem/field/virtual.py +62 -23
warp/fem/geometry/adaptive_nanogrid.py +16 -13
warp/fem/geometry/closest_point.py +1 -1
warp/fem/geometry/deformed_geometry.py +5 -2
warp/fem/geometry/geometry.py +5 -0
warp/fem/geometry/grid_2d.py +12 -12
warp/fem/geometry/grid_3d.py +12 -15
warp/fem/geometry/hexmesh.py +5 -7
warp/fem/geometry/nanogrid.py +9 -11
warp/fem/geometry/quadmesh.py +13 -13
warp/fem/geometry/tetmesh.py +3 -4
warp/fem/geometry/trimesh.py +7 -20
warp/fem/integrate.py +262 -93
warp/fem/linalg.py +5 -5
warp/fem/quadrature/pic_quadrature.py +37 -22
warp/fem/quadrature/quadrature.py +194 -25
warp/fem/space/__init__.py +1 -1
warp/fem/space/basis_function_space.py +4 -2
warp/fem/space/basis_space.py +25 -18
warp/fem/space/hexmesh_function_space.py +2 -2
warp/fem/space/partition.py +6 -2
warp/fem/space/quadmesh_function_space.py +8 -8
warp/fem/space/shape/cube_shape_function.py +23 -23
warp/fem/space/shape/square_shape_function.py +12 -12
warp/fem/space/shape/triangle_shape_function.py +1 -1
warp/fem/space/tetmesh_function_space.py +3 -3
warp/fem/space/trimesh_function_space.py +2 -2
warp/fem/utils.py +12 -6
warp/jax.py +14 -1
warp/jax_experimental/__init__.py +16 -0
warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -29
warp/jax_experimental/ffi.py +702 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +89 -0
warp/native/array.h +13 -0
warp/native/builtin.h +29 -3
warp/native/bvh.cpp +3 -1
warp/native/bvh.cu +42 -14
warp/native/bvh.h +2 -1
warp/native/clang/clang.cpp +30 -3
warp/native/cuda_util.cpp +14 -0
warp/native/cuda_util.h +2 -0
warp/native/exports.h +68 -63
warp/native/intersect.h +26 -26
warp/native/intersect_adj.h +33 -33
warp/native/marching.cu +1 -1
warp/native/mat.h +513 -9
warp/native/mesh.h +10 -10
warp/native/quat.h +99 -11
warp/native/rand.h +6 -0
warp/native/sort.cpp +122 -59
warp/native/sort.cu +152 -15
warp/native/sort.h +8 -1
warp/native/sparse.cpp +43 -22
warp/native/sparse.cu +52 -17
warp/native/svd.h +116 -0
warp/native/tile.h +312 -116
warp/native/tile_reduce.h +46 -3
warp/native/vec.h +68 -7
warp/native/volume.cpp +85 -113
warp/native/volume_builder.cu +25 -10
warp/native/volume_builder.h +6 -0
warp/native/warp.cpp +5 -6
warp/native/warp.cu +100 -11
warp/native/warp.h +19 -10
warp/optim/linear.py +10 -10
warp/render/render_opengl.py +19 -17
warp/render/render_usd.py +93 -3
warp/sim/articulation.py +4 -4
warp/sim/collide.py +32 -19
warp/sim/import_mjcf.py +449 -155
warp/sim/import_urdf.py +32 -12
warp/sim/inertia.py +189 -156
warp/sim/integrator_euler.py +8 -5
warp/sim/integrator_featherstone.py +3 -10
warp/sim/integrator_vbd.py +207 -2
warp/sim/integrator_xpbd.py +8 -5
warp/sim/model.py +71 -25
warp/sim/render.py +4 -0
warp/sim/utils.py +2 -2
warp/sparse.py +642 -555
warp/stubs.py +217 -20
warp/tests/__main__.py +0 -15
warp/tests/assets/torus.usda +1 -1
warp/tests/cuda/__init__.py +0 -0
warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
warp/tests/geometry/__init__.py +0 -0
warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
warp/tests/interop/__init__.py +0 -0
warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
warp/tests/sim/__init__.py +0 -0
warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
warp/tests/{test_collision.py → sim/test_collision.py} +236 -205
warp/tests/sim/test_inertia.py +161 -0
warp/tests/{test_model.py → sim/test_model.py} +40 -0
warp/tests/{flaky_test_sim_grad.py → sim/test_sim_grad.py} +4 -0
warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
warp/tests/sim/test_vbd.py +597 -0
warp/tests/sim/test_xpbd.py +399 -0
warp/tests/test_bool.py +1 -1
warp/tests/test_codegen.py +24 -3
warp/tests/test_examples.py +40 -38
warp/tests/test_fem.py +98 -14
warp/tests/test_linear_solvers.py +0 -11
warp/tests/test_mat.py +577 -156
warp/tests/test_mat_scalar_ops.py +4 -4
warp/tests/test_overwrite.py +0 -60
warp/tests/test_quat.py +356 -151
warp/tests/test_rand.py +44 -37
warp/tests/test_sparse.py +47 -6
warp/tests/test_spatial.py +75 -0
warp/tests/test_static.py +1 -1
warp/tests/test_utils.py +84 -4
warp/tests/test_vec.py +336 -178
warp/tests/tile/__init__.py +0 -0
warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
warp/tests/{test_tile_load.py → tile/test_tile_load.py} +98 -1
warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
warp/tests/unittest_serial.py +1 -0
warp/tests/unittest_suites.py +45 -62
warp/tests/unittest_utils.py +2 -1
warp/thirdparty/unittest_parallel.py +3 -1
warp/types.py +175 -666
warp/utils.py +137 -72
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/METADATA +46 -12
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/RECORD +184 -171
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/WHEEL +1 -1
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info/licenses}/LICENSE.md +0 -26
warp/examples/optim/example_walker.py +0 -317
warp/native/cutlass_gemm.cpp +0 -43
warp/native/cutlass_gemm.cu +0 -382
warp/tests/test_matmul.py +0 -511
warp/tests/test_matmul_lite.py +0 -411
warp/tests/test_vbd.py +0 -386
warp/tests/unused_test_misc.py +0 -77
/warp/tests/{test_async.py → cuda/test_async.py} +0 -0
/warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
/warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
/warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
/warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
/warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
/warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
/warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
/warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
/warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
/warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
/warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
/warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
/warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
/warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
/warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
/warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
{warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/top_level.txt +0 -0

warp/native/volume_builder.cu CHANGED Viewed

@@ -267,11 +267,21 @@ __device__ std::enable_if_t<nanovdb::BuildTraits<typename Node::BuildType>::is_i
 {
 }
+template <typename T>
+struct alignas(alignof(T)) AlignedProxy
+{
+    char data[sizeof(T)];
+};
 template <typename Tree, typename NodeT>
 __global__ void setInternalBBoxAndBackgroundValue(Tree *tree, const typename Tree::BuildType background_value)
 {
     using BBox = nanovdb::math::BBox<typename NodeT::CoordT>;
-    __shared__ BBox bbox;
+    using BBoxProxy = AlignedProxy<BBox>;
+    __shared__ BBoxProxy bbox_mem;
+    BBox& bbox = reinterpret_cast<BBox&>(bbox_mem);
     const unsigned node_count = tree->mNodeCount[NodeT::LEVEL];
     const unsigned node_id = blockIdx.x;
@@ -281,7 +291,7 @@ __global__ void setInternalBBoxAndBackgroundValue(Tree *tree, const typename Tre
         if (threadIdx.x == 0)
         {
-            bbox = BBox();
+            new(&bbox) BBox();
         }
         __syncthreads();
@@ -313,14 +323,17 @@ __global__ void setRootBBoxAndBackgroundValue(nanovdb::Grid<Tree> *grid,
                                               const typename Tree::BuildType background_value)
 {
     using BBox = typename Tree::RootNodeType::BBoxType;
-    __shared__ BBox bbox;
+    using BBoxProxy = AlignedProxy<BBox>;
+    __shared__ BBoxProxy bbox_mem;
+    BBox& bbox = reinterpret_cast<BBox&>(bbox_mem);
     Tree &tree = grid->tree();
     const unsigned upper_count = tree.mNodeCount[2];
     if (threadIdx.x == 0)
     {
-        bbox = BBox();
+        new(&bbox) BBox();
     }
     __syncthreads();
@@ -450,12 +463,14 @@ void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<BuildT>> *&out_grid,
     grid_handle.buffer().detachDeviceData();
 }
-template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<float>> *&, size_t &, const void *, size_t, bool,
-                                     const BuildGridParams<float> &);
-template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::Vec3f>> *&, size_t &, const void *,
-                                     size_t, bool, const BuildGridParams<nanovdb::Vec3f> &);
-template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<int32_t>> *&, size_t &, const void *, size_t, bool,
-                                     const BuildGridParams<int32_t> &);
+#define EXPAND_BUILDER_TYPE(type) \
+template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<type>> *&, size_t &, const void *, size_t, bool, \
+                                     const BuildGridParams<type> &);
+WP_VOLUME_BUILDER_INSTANTIATE_TYPES
+#undef EXPAND_BUILDER_TYPE
 template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::ValueIndex>> *&, size_t &, const void *,
                                      size_t, bool, const BuildGridParams<nanovdb::ValueIndex> &);
 template void build_grid_from_points(nanovdb::Grid<nanovdb::NanoTree<nanovdb::ValueOnIndex>> *&, size_t &, const void *,

warp/native/volume_builder.h CHANGED Viewed

@@ -19,6 +19,12 @@
 #include <nanovdb/NanoVDB.h>
+#define WP_VOLUME_BUILDER_INSTANTIATE_TYPES                                                                            \
+    EXPAND_BUILDER_TYPE(int32_t)                                                                                       \
+    EXPAND_BUILDER_TYPE(float)                                                                                         \
+    EXPAND_BUILDER_TYPE(nanovdb::Vec3f)                                                                                \
+    EXPAND_BUILDER_TYPE(nanovdb::Vec4f)                                                                                \
 template <typename BuildT> struct BuildGridParams
 {
     nanovdb::Map map;

warp/native/warp.cpp CHANGED Viewed

@@ -151,11 +151,6 @@ int is_cuda_compatibility_enabled()
     return int(WP_ENABLE_CUDA_COMPATIBILITY);
 }
-int is_cutlass_enabled()
-{
-    return int(WP_ENABLE_CUTLASS);
-}
 int is_mathdx_enabled()
 {
     return int(WP_ENABLE_MATHDX);
@@ -1004,6 +999,8 @@ WP_API int cuda_device_is_mempool_supported(int ordinal) { return 0; }
 WP_API int cuda_device_is_ipc_supported(int ordinal) { return 0; }
 WP_API int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold) { return 0; }
 WP_API uint64_t cuda_device_get_mempool_release_threshold(int ordinal) { return 0; }
+WP_API uint64_t cuda_device_get_mempool_used_mem_current(int ordinal) { return 0; }
+WP_API uint64_t cuda_device_get_mempool_used_mem_high(int ordinal) { return 0; }
 WP_API void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem) {}
 WP_API void* cuda_context_get_current() { return NULL; }
@@ -1033,6 +1030,7 @@ WP_API void* cuda_ipc_open_event_handle(void* context, char* handle) { return NU
 WP_API void* cuda_stream_create(void* context, int priority) { return NULL; }
 WP_API void cuda_stream_destroy(void* context, void* stream) {}
+WP_API int cuda_stream_query(void* stream) { return 0; }
 WP_API void cuda_stream_register(void* context, void* stream) {}
 WP_API void cuda_stream_unregister(void* context, void* stream) {}
 WP_API void* cuda_stream_get_current() { return NULL; }
@@ -1045,7 +1043,8 @@ WP_API int cuda_stream_get_priority(void* stream) { return 0; }
 WP_API void* cuda_event_create(void* context, unsigned flags) { return NULL; }
 WP_API void cuda_event_destroy(void* event) {}
-WP_API void cuda_event_record(void* event, void* stream) {}
+WP_API int cuda_event_query(void* event) { return 0; }
+WP_API void cuda_event_record(void* event, void* stream, bool timing) {}
 WP_API void cuda_event_synchronize(void* event) {}
 WP_API float cuda_event_elapsed_time(void* start_event, void* end_event) { return 0.0f; }

warp/native/warp.cu CHANGED Viewed

@@ -1888,6 +1888,62 @@ uint64_t cuda_device_get_mempool_release_threshold(int ordinal)
     return threshold;
 }
+uint64_t cuda_device_get_mempool_used_mem_current(int ordinal)
+{
+    if (ordinal < 0 || ordinal > int(g_devices.size()))
+    {
+        fprintf(stderr, "Invalid device ordinal %d\n", ordinal);
+        return 0;
+    }
+    if (!g_devices[ordinal].is_mempool_supported)
+        return 0;
+    cudaMemPool_t pool;
+    if (!check_cuda(cudaDeviceGetDefaultMemPool(&pool, ordinal)))
+    {
+        fprintf(stderr, "Warp error: Failed to get memory pool on device %d\n", ordinal);
+        return 0;
+    }
+    uint64_t mem_used = 0;
+    if (!check_cuda(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemCurrent, &mem_used)))
+    {
+        fprintf(stderr, "Warp error: Failed to get amount of currently used memory from the memory pool on device %d\n", ordinal);
+        return 0;
+    }
+    return mem_used;
+}
+uint64_t cuda_device_get_mempool_used_mem_high(int ordinal)
+{
+    if (ordinal < 0 || ordinal > int(g_devices.size()))
+    {
+        fprintf(stderr, "Invalid device ordinal %d\n", ordinal);
+        return 0;
+    }
+    if (!g_devices[ordinal].is_mempool_supported)
+        return 0;
+    cudaMemPool_t pool;
+    if (!check_cuda(cudaDeviceGetDefaultMemPool(&pool, ordinal)))
+    {
+        fprintf(stderr, "Warp error: Failed to get memory pool on device %d\n", ordinal);
+        return 0;
+    }
+    uint64_t mem_high_water_mark = 0;
+    if (!check_cuda(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrUsedMemHigh, &mem_high_water_mark)))
+    {
+        fprintf(stderr, "Warp error: Failed to get memory usage high water mark from the memory pool on device %d\n", ordinal);
+        return 0;
+    }
+    return mem_high_water_mark;
+}
 void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem)
 {
     // use temporary storage if user didn't specify pointers
@@ -2371,6 +2427,19 @@ void cuda_stream_destroy(void* context, void* stream)
     check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
 }
+int cuda_stream_query(void* stream)
+{
+    CUresult res =  cuStreamQuery_f(static_cast<CUstream>(stream));
+    if ((res != CUDA_SUCCESS) && (res != CUDA_ERROR_NOT_READY))
+    {
+        // Abnormal, print out error
+        check_cu(res);
+    }
+    return res;
+}
 void cuda_stream_register(void* context, void* stream)
 {
     if (!stream)
@@ -2465,9 +2534,30 @@ void cuda_event_destroy(void* event)
     check_cu(cuEventDestroy_f(static_cast<CUevent>(event)));
 }
-void cuda_event_record(void* event, void* stream)
+int cuda_event_query(void* event)
+{
+    CUresult res = cuEventQuery_f(static_cast<CUevent>(event));
+    if ((res != CUDA_SUCCESS) && (res != CUDA_ERROR_NOT_READY))
+    {
+        // Abnormal, print out error
+        check_cu(res);
+    }
+    return res;
+}
+void cuda_event_record(void* event, void* stream, bool timing)
 {
-    check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(stream)));
+    if (timing && !g_captures.empty() && cuda_stream_is_capturing(stream))
+    {
+        // record timing event during graph capture
+        check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
+    }
+    else
+    {
+        check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(stream)));
+    }
 }
 void cuda_event_synchronize(void* event)
@@ -2814,6 +2904,12 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
         opts.push_back("--define-macro=WP_VERIFY_FP");
     else
         opts.push_back("--undefine-macro=WP_VERIFY_FP");
+#if WP_ENABLE_MATHDX
+    opts.push_back("--define-macro=WP_ENABLE_MATHDX=1");
+#else
+    opts.push_back("--define-macro=WP_ENABLE_MATHDX=0");
+#endif
     if (fast_math)
         opts.push_back("--use_fast_math");
@@ -2823,10 +2919,6 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
     else
         opts.push_back("--fmad=false");
-    char include_cutlass[max_path];
-    sprintf(include_cutlass, "--include-path=%s/cutlass/include", include_dir);
-    opts.push_back(include_cutlass);
     std::vector<std::string> cuda_include_opt;
     for(int i = 0; i < num_cuda_include_dirs; i++)
     {
@@ -2935,7 +3027,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
                     fprintf(stderr, "Warp error: num_ltoirs > 0 but ltoir_input_types, ltoirs or ltoir_sizes are NULL\n");
                     return size_t(-1);
                 }
-                nvJitLinkHandle handle;
+                nvJitLinkHandle handle = nullptr;
                 std::vector<const char *> lopts = {"-dlto", arch_opt_lto};
                 if (use_ptx) {
                     lopts.push_back("-ptx");
@@ -3182,7 +3274,7 @@ size_t cuda_compile_program(const char* cuda_src, const char* program_name, int
         std::vector<char> lto(lto_size);
         CHECK_CUSOLVER(cusolverGetLTOIR(h, lto.size(), lto.data()));
-        // This fatbin is universal, ie it is the same for any instantations of a cusolver device function
+        // This fatbin is universal, ie it is the same for any instantiations of a cusolver device function
         size_t fatbin_size = 0;
         CHECK_CUSOLVER(cusolverGetUniversalFATBINSize(h, &fatbin_size));
@@ -3539,9 +3631,6 @@ void cuda_timing_end(timing_result_t* results, int size)
 #include "sparse.cu"
 #include "volume.cu"
 #include "volume_builder.cu"
-#if WP_ENABLE_CUTLASS
-    #include "cutlass_gemm.cu"
-#endif
 //#include "spline.inl"
 //#include "volume.inl"

warp/native/warp.h CHANGED Viewed

@@ -41,8 +41,6 @@ extern "C"
     WP_API int is_cuda_enabled();
     // whether Warp was compiled with enhanced CUDA compatibility
     WP_API int is_cuda_compatibility_enabled();
-    // whether Warp was compiled with CUTLASS support
-    WP_API int is_cutlass_enabled();
     // whether Warp was compiled with MathDx support
     WP_API int is_mathdx_enabled();
     // whether Warp was compiled with debug support
@@ -112,10 +110,6 @@ extern "C"
     WP_API void hash_grid_destroy_device(uint64_t id);
     WP_API void hash_grid_update_device(uint64_t id, float cell_width, const wp::array_t<wp::vec3>* points);
-    WP_API bool cutlass_gemm(void* context, int compute_capability, int m, int n, int k, const char* datatype,
-                             const void* a, const void* b, const void* c, void* d, float alpha, float beta,
-                             bool row_major_a, bool row_major_b, bool allow_tf32x3_arith, int batch_count);
     WP_API uint64_t volume_create_host(void* buf, uint64_t size, bool copy, bool owner);
     WP_API void volume_get_tiles_host(uint64_t id, void* buf);
     WP_API void volume_get_voxels_host(uint64_t id, void* buf);
@@ -126,9 +120,7 @@ extern "C"
     WP_API void volume_get_voxels_device(uint64_t id, void* buf);
     WP_API void volume_destroy_device(uint64_t id);
-    WP_API uint64_t volume_f_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space, float bg_value);
-    WP_API uint64_t volume_v_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space, float bg_value[3]);
-    WP_API uint64_t volume_i_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space, int bg_value);
+    WP_API uint64_t volume_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space, const void* bg_value, uint32_t bg_value_size, const char* bg_value_type);
     WP_API uint64_t volume_index_from_tiles_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space);
     WP_API uint64_t volume_from_active_voxels_device(void* context, void* points, int num_points, float transform[9], float translation[3], bool points_in_world_space);
@@ -173,6 +165,15 @@ extern "C"
     WP_API void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n);
     WP_API void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n);
+    WP_API void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n);
+    WP_API void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n);
+    WP_API void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments);
+    WP_API void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments);
+    WP_API void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments);
+    WP_API void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments);
     WP_API void runlength_encode_int_host(uint64_t values, uint64_t run_values, uint64_t run_lengths, uint64_t run_count, int n);
     WP_API void runlength_encode_int_device(uint64_t values, uint64_t run_values, uint64_t run_lengths, uint64_t run_count, int n);
@@ -185,6 +186,7 @@ extern "C"
         int* tpl_columns,
         void* tpl_values,
         bool prune_numerical_zeros,
+        bool masked,
         int* bsr_offsets,
         int* bsr_columns,
         void* bsr_values,
@@ -199,6 +201,7 @@ extern "C"
         int* tpl_columns,
         void* tpl_values,
         bool prune_numerical_zeros,
+        bool masked,
         int* bsr_offsets,
         int* bsr_columns,
         void* bsr_values,
@@ -213,6 +216,7 @@ extern "C"
         int* tpl_columns,
         void* tpl_values,
         bool prune_numerical_zeros,
+        bool masked,
         int* bsr_offsets,
         int* bsr_columns,
         void* bsr_values,
@@ -227,6 +231,7 @@ extern "C"
         int* tpl_columns,
         void* tpl_values,
         bool prune_numerical_zeros,
+        bool masked,
         int* bsr_offsets,
         int* bsr_columns,
         void* bsr_values,
@@ -283,6 +288,8 @@ extern "C"
     WP_API int cuda_device_is_ipc_supported(int ordinal);
     WP_API int cuda_device_set_mempool_release_threshold(int ordinal, uint64_t threshold);
     WP_API uint64_t cuda_device_get_mempool_release_threshold(int ordinal);
+    WP_API uint64_t cuda_device_get_mempool_used_mem_current(int ordinal);
+    WP_API uint64_t cuda_device_get_mempool_used_mem_high(int ordinal);
     WP_API void cuda_device_get_memory_info(int ordinal, size_t* free_mem, size_t* total_mem);
     WP_API void* cuda_context_get_current();
@@ -318,6 +325,7 @@ extern "C"
     WP_API void* cuda_stream_create(void* context, int priority);
     WP_API void cuda_stream_destroy(void* context, void* stream);
+    WP_API int cuda_stream_query(void* stream);
     WP_API void cuda_stream_register(void* context, void* stream);
     WP_API void cuda_stream_unregister(void* context, void* stream);
     WP_API void* cuda_stream_get_current();
@@ -330,7 +338,8 @@ extern "C"
     WP_API void* cuda_event_create(void* context, unsigned flags);
     WP_API void cuda_event_destroy(void* event);
-    WP_API void cuda_event_record(void* event, void* stream);
+    WP_API int cuda_event_query(void* event);
+    WP_API void cuda_event_record(void* event, void* stream, bool timing=false);
     WP_API void cuda_event_synchronize(void* event);
     WP_API float cuda_event_elapsed_time(void* start_event, void* end_event);

warp/optim/linear.py CHANGED Viewed

@@ -866,7 +866,7 @@ def _diag_mv_vec_kernel(
 def _inverse_diag_coefficient(coeff: Any, use_abs: wp.bool):
     zero = type(coeff)(0.0)
     one = type(coeff)(1.0)
-    return wp.select(coeff == zero, one / wp.select(use_abs, coeff, wp.abs(coeff)), one)
+    return wp.where(coeff == zero, one, one / wp.where(use_abs, wp.abs(coeff), coeff))
 @wp.kernel
@@ -917,7 +917,7 @@ def _cg_kernel_1(
 ):
     i = wp.tid()
-    alpha = wp.select(resid[0] > tol, rz_old.dtype(0.0), rz_old[0] / p_Ap[0])
+    alpha = wp.where(resid[0] > tol, rz_old[0] / p_Ap[0], rz_old.dtype(0.0))
     x[i] = x[i] + alpha * p[i]
     r[i] = r[i] - alpha * Ap[i]
@@ -935,7 +935,7 @@ def _cg_kernel_2(
     #    p = r + (rz_new / rz_old) * p;
     i = wp.tid()
-    beta = wp.select(resid[0] > tol, rz_old.dtype(0.0), rz_new[0] / rz_old[0])
+    beta = wp.where(resid[0] > tol, rz_new[0] / rz_old[0], rz_old.dtype(0.0))
     p[i] = z[i] + beta * p[i]
@@ -955,7 +955,7 @@ def _cr_kernel_1(
 ):
     i = wp.tid()
-    alpha = wp.select(resid[0] > tol and y_Ap[0] > 0.0, zAz_old.dtype(0.0), zAz_old[0] / y_Ap[0])
+    alpha = wp.where(resid[0] > tol and y_Ap[0] > 0.0, zAz_old[0] / y_Ap[0], zAz_old.dtype(0.0))
     x[i] = x[i] + alpha * p[i]
     r[i] = r[i] - alpha * Ap[i]
@@ -976,7 +976,7 @@ def _cr_kernel_2(
     #    p = r + (rz_new / rz_old) * p;
     i = wp.tid()
-    beta = wp.select(resid[0] > tol and zAz_old[0] > 0.0, zAz_old.dtype(0.0), zAz_new[0] / zAz_old[0])
+    beta = wp.where(resid[0] > tol and zAz_old[0] > 0.0, zAz_new[0] / zAz_old[0], zAz_old.dtype(0.0))
     p[i] = z[i] + beta * p[i]
     Ap[i] = Az[i] + beta * Ap[i]
@@ -995,7 +995,7 @@ def _bicgstab_kernel_1(
 ):
     i = wp.tid()
-    alpha = wp.select(resid[0] > tol, rho_old.dtype(0.0), rho_old[0] / r0v[0])
+    alpha = wp.where(resid[0] > tol, rho_old[0] / r0v[0], rho_old.dtype(0.0))
     x[i] += alpha * y[i]
     r[i] -= alpha * v[i]
@@ -1014,7 +1014,7 @@ def _bicgstab_kernel_2(
 ):
     i = wp.tid()
-    omega = wp.select(resid[0] > tol, st.dtype(0.0), st[0] / tt[0])
+    omega = wp.where(resid[0] > tol, st[0] / tt[0], st.dtype(0.0))
     x[i] += omega * z[i]
     r[i] -= omega * t[i]
@@ -1034,8 +1034,8 @@ def _bicgstab_kernel_3(
 ):
     i = wp.tid()
-    beta = wp.select(resid[0] > tol, st.dtype(0.0), rho_new[0] * tt[0] / (r0v[0] * st[0]))
-    beta_omega = wp.select(resid[0] > tol, st.dtype(0.0), rho_new[0] / r0v[0])
+    beta = wp.where(resid[0] > tol, rho_new[0] * tt[0] / (r0v[0] * st[0]), st.dtype(0.0))
+    beta_omega = wp.where(resid[0] > tol, rho_new[0] / r0v[0], st.dtype(0.0))
     p[i] = r[i] + beta * p[i] - beta_omega * v[i]
@@ -1123,7 +1123,7 @@ def _gmres_arnoldi_normalize_kernel(
     alpha: wp.array(dtype=Any),
 ):
     tid = wp.tid()
-    y[tid] = wp.select(alpha[0] == alpha.dtype(0.0), x[tid] / wp.sqrt(alpha[0]), x[tid])
+    y[tid] = wp.where(alpha[0] == alpha.dtype(0.0), x[tid], x[tid] / wp.sqrt(alpha[0]))
 @wp.kernel

warp/render/render_opengl.py CHANGED Viewed

@@ -13,11 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 import ctypes
 import sys
 import time
 from collections import defaultdict
-from typing import List, Optional, Tuple, Union
+from typing import List, Union
 import numpy as np
@@ -1500,16 +1502,16 @@ class OpenGLRenderer:
     def setup_tiled_rendering(
         self,
-        instances: List[List[int]],
+        instances: list[list[int]],
         rescale_window: bool = False,
-        tile_width: Optional[int] = None,
-        tile_height: Optional[int] = None,
-        tile_ncols: Optional[int] = None,
-        tile_nrows: Optional[int] = None,
-        tile_positions: Optional[List[Tuple[int]]] = None,
-        tile_sizes: Optional[List[Tuple[int]]] = None,
-        projection_matrices: Optional[List[Mat44]] = None,
-        view_matrices: Optional[List[Mat44]] = None,
+        tile_width: int | None = None,
+        tile_height: int | None = None,
+        tile_ncols: int | None = None,
+        tile_nrows: int | None = None,
+        tile_positions: list[tuple[int]] | None = None,
+        tile_sizes: list[tuple[int]] | None = None,
+        projection_matrices: list[Mat44] | None = None,
+        view_matrices: list[Mat44] | None = None,
     ):
         """
         Set up tiled rendering where the render buffer is split into multiple tiles that can visualize
@@ -1602,11 +1604,11 @@ class OpenGLRenderer:
     def update_tile(
         self,
         tile_id,
-        instances: Optional[List[int]] = None,
-        projection_matrix: Optional[Mat44] = None,
-        view_matrix: Optional[Mat44] = None,
-        tile_size: Optional[Tuple[int]] = None,
-        tile_position: Optional[Tuple[int]] = None,
+        instances: list[int] | None = None,
+        projection_matrix: Mat44 | None = None,
+        view_matrix: Mat44 | None = None,
+        tile_size: tuple[int] | None = None,
+        tile_position: tuple[int] | None = None,
     ):
         """
         Update the shape instances, projection matrix, view matrix, tile size, or tile position
@@ -1806,7 +1808,7 @@ class OpenGLRenderer:
         return np.array((scaling, 0, 0, 0, 0, scaling, 0, 0, 0, 0, scaling, 0, 0, 0, 0, 1), dtype=np.float32)
-    def update_model_matrix(self, model_matrix: Optional[Mat44] = None):
+    def update_model_matrix(self, model_matrix: Mat44 | None = None):
         gl = OpenGLRenderer.gl
         self._switch_context()
@@ -3092,7 +3094,7 @@ Instances: {len(self._instances)}"""
         parent_body: str = None,
         is_template: bool = False,
         up_axis: int = 1,
-        color: Tuple[float, float, float] = None,
+        color: tuple[float, float, float] = None,
     ):
         """Add a arrow for visualization

warp/render/render_usd.py CHANGED Viewed

@@ -13,10 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 import numpy as np
 import warp as wp
+# Axis names indexed by an integer up-axis value (0 -> "X", 1 -> "Y", 2 -> "Z").
+UP_AXIS_TOKEN = ("X", "Y", "Z")
+# Unit vectors along X, Y and Z, stored in the same index order as UP_AXIS_TOKEN.
+UP_AXIS_VEC = (
+    np.array((1.0, 0.0, 0.0), dtype=float),
+    np.array((0.0, 1.0, 0.0), dtype=float),
+    np.array((0.0, 0.0, 1.0), dtype=float),
+)
 def _usd_add_xform(prim):
     from pxr import UsdGeom
@@ -29,7 +38,13 @@ def _usd_add_xform(prim):
     prim.AddScaleOp()
-def _usd_set_xform(xform, pos: tuple, rot: tuple, scale: tuple, time):
+def _usd_set_xform(
+    xform,
+    pos: tuple | None = None,
+    rot: tuple | None = None,
+    scale: tuple | None = None,
+    time: float = 0.0,
+):
     from pxr import Gf, UsdGeom
     xform = UsdGeom.Xform(xform)
@@ -108,7 +123,7 @@ class UsdRenderer:
         self.stage.SetDefaultPrim(self.root.GetPrim())
         self.stage.SetStartTimeCode(0.0)
         self.stage.SetEndTimeCode(0.0)
-        self.stage.SetTimeCodesPerSecond(self.fps)
+        self.stage.SetFramesPerSecond(self.fps)
         if up_axis == "X":
             UsdGeom.SetStageUpAxis(self.stage, UsdGeom.Tokens.x)
@@ -622,7 +637,82 @@ class UsdRenderer:
         return prim_path
-    def render_line_list(self, name, vertices, indices, color, radius):
+    def render_arrow(
+        self,
+        name: str,
+        pos: tuple,
+        rot: tuple,
+        base_radius: float,
+        base_height: float,
+        cap_radius: float = None,
+        cap_height: float = None,
+        parent_body: str = None,
+        is_template: bool = False,
+        up_axis: int = 1,
+        color: tuple[float, float, float] = None,
+        visible: bool = True,
+    ):
+        from pxr import Gf, Sdf, UsdGeom
+        if is_template:
+            prim_path = self._resolve_path(name, parent_body, is_template)
+            blueprint = UsdGeom.Scope.Define(self.stage, prim_path)
+            blueprint_prim = blueprint.GetPrim()
+            blueprint_prim.SetInstanceable(True)
+            blueprint_prim.SetSpecifier(Sdf.SpecifierClass)
+            arrow_path = prim_path.AppendChild("arrow")
+        else:
+            arrow_path = self._resolve_path(name, parent_body)
+            prim_path = arrow_path
+        arrow = UsdGeom.Xform.Get(self.stage, arrow_path)
+        if not arrow:
+            arrow = UsdGeom.Xform.Define(self.stage, arrow_path)
+            _usd_add_xform(arrow)
+        base_path = arrow_path.AppendChild("base")
+        base = UsdGeom.Xform.Get(self.stage, base_path)
+        if not base:
+            base = UsdGeom.Cylinder.Define(self.stage, base_path)
+            _usd_add_xform(base)
+        base.GetRadiusAttr().Set(float(base_radius))
+        base.GetHeightAttr().Set(float(base_height))
+        base.GetAxisAttr().Set(UP_AXIS_TOKEN[up_axis])
+        _usd_set_xform(base, UP_AXIS_VEC[up_axis] * base_height * 0.5)
+        cap_path = arrow_path.AppendChild("cap")
+        cap = UsdGeom.Xform.Get(self.stage, cap_path)
+        if not cap:
+            cap = UsdGeom.Cone.Define(self.stage, arrow_path.AppendChild("cap"))
+            _usd_add_xform(cap)
+        cap.GetRadiusAttr().Set(float(cap_radius))
+        cap.GetHeightAttr().Set(float(cap_height))
+        cap.GetAxisAttr().Set(UP_AXIS_TOKEN[up_axis])
+        _usd_set_xform(cap, UP_AXIS_VEC[up_axis] * (base_height + cap_height * 0.5))
+        if color is not None:
+            base.GetDisplayColorAttr().Set([Gf.Vec3f(color)], self.time)
+            cap.GetDisplayColorAttr().Set([Gf.Vec3f(color)], self.time)
+        self._shape_constructors[name] = UsdGeom.Xform
+        if not is_template:
+            _usd_set_xform(arrow, pos, rot, (1.0, 1.0, 1.0), self.time)
+        arrow.GetVisibilityAttr().Set("inherited" if visible else "invisible", self.time)
+        return prim_path
+    def render_line_list(
+        self,
+        name: str,
+        vertices,
+        indices,
+        color: tuple = None,
+        radius: float = 0.01,
+        visible: bool = True,
+    ):
         """Debug helper to add a line list as a set of capsules
         Args: