PyPI - warp-lang - Versions diffs - 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.8.1__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.8.1__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (59)

warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build_dll.py +5 -0
warp/codegen.py +15 -3
warp/config.py +1 -1
warp/context.py +122 -24
warp/examples/interop/example_jax_callable.py +34 -4
warp/examples/interop/example_jax_kernel.py +27 -1
warp/fem/field/virtual.py +2 -0
warp/fem/integrate.py +78 -47
warp/jax_experimental/ffi.py +201 -53
warp/native/array.h +4 -4
warp/native/builtin.h +8 -4
warp/native/coloring.cpp +5 -1
warp/native/cuda_util.cpp +1 -1
warp/native/intersect.h +2 -2
warp/native/mat.h +3 -3
warp/native/mesh.h +1 -1
warp/native/quat.h +6 -2
warp/native/rand.h +7 -7
warp/native/sparse.cu +1 -1
warp/native/svd.h +23 -8
warp/native/tile.h +20 -1
warp/native/tile_radix_sort.h +5 -1
warp/native/tile_reduce.h +16 -25
warp/native/tuple.h +2 -2
warp/native/vec.h +4 -4
warp/native/warp.cpp +1 -1
warp/native/warp.cu +15 -2
warp/native/warp.h +1 -1
warp/render/render_opengl.py +52 -51
warp/render/render_usd.py +0 -1
warp/sim/collide.py +1 -2
warp/sim/integrator_vbd.py +10 -2
warp/sparse.py +1 -1
warp/tape.py +2 -0
warp/tests/sim/test_cloth.py +89 -6
warp/tests/sim/test_coloring.py +76 -1
warp/tests/test_assert.py +53 -0
warp/tests/test_atomic_cas.py +127 -114
warp/tests/test_mat.py +22 -0
warp/tests/test_quat.py +22 -0
warp/tests/test_sparse.py +32 -0
warp/tests/test_static.py +48 -0
warp/tests/test_tape.py +38 -0
warp/tests/test_vec.py +38 -408
warp/tests/test_vec_constructors.py +325 -0
warp/tests/tile/test_tile.py +31 -143
warp/tests/tile/test_tile_mathdx.py +2 -2
warp/tests/tile/test_tile_matmul.py +179 -0
warp/tests/tile/test_tile_reduce.py +100 -11
warp/tests/tile/test_tile_shared_memory.py +12 -12
warp/tests/tile/test_tile_sort.py +59 -55
warp/tests/unittest_suites.py +10 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/METADATA +4 -4
{warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/RECORD +59 -57
{warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0

warp/native/quat.h CHANGED Viewed

@@ -904,8 +904,12 @@ inline CUDA_CALLABLE void adj_div(quat_t<Type> a, Type s, quat_t<Type>& adj_a, T
 template<typename Type>
 inline CUDA_CALLABLE void adj_div(Type s, quat_t<Type> a, Type& adj_s, quat_t<Type>& adj_a, const quat_t<Type>& adj_ret)
 {
-    adj_s -= dot(a, adj_ret)/ (s * s); // - a / s^2
-    adj_a += s / adj_ret;
+    for (unsigned i=0; i < 4; ++i)
+    {
+        Type inv = Type(1) / a[i];
+        adj_a[i] -= s * adj_ret[i] * inv * inv;
+        adj_s += adj_ret[i] * inv;
+    }
 }
 template<typename Type>

warp/native/rand.h CHANGED Viewed

@@ -71,14 +71,14 @@ inline CUDA_CALLABLE float randf(uint32& state, float min, float max) { return (
 // Box-Muller method
 inline CUDA_CALLABLE float randn(uint32& state) { return sqrt(-2.f * log(randf(state) + RANDN_EPSILON)) * cos(2.f * M_PI_F * randf(state)); }
-inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed, float adj_ret) {}
-inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset, float adj_ret) {}
+inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed, uint32 adj_ret) {}
+inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset, uint32 adj_ret) {}
-inline CUDA_CALLABLE void adj_randi(uint32& state, uint32& adj_state, float adj_ret) {}
-inline CUDA_CALLABLE void adj_randi(uint32& state, int min, int max, uint32& adj_state, int& adj_min, int& adj_max, float adj_ret) {}
+inline CUDA_CALLABLE void adj_randi(uint32& state, uint32& adj_state, int adj_ret) {}
+inline CUDA_CALLABLE void adj_randi(uint32& state, int min, int max, uint32& adj_state, int& adj_min, int& adj_max, int adj_ret) {}
-inline CUDA_CALLABLE void adj_randu(uint32& state, uint32& adj_state, float adj_ret) {}
-inline CUDA_CALLABLE void adj_randu(uint32& state, uint32 min, uint32 max, uint32& adj_state, uint32& adj_min, uint32& adj_max, float adj_ret) {}
+inline CUDA_CALLABLE void adj_randu(uint32& state, uint32& adj_state, uint32 adj_ret) {}
+inline CUDA_CALLABLE void adj_randu(uint32& state, uint32 min, uint32 max, uint32& adj_state, uint32& adj_min, uint32& adj_max, uint32 adj_ret) {}
 inline CUDA_CALLABLE void adj_randf(uint32& state, uint32& adj_state, float adj_ret) {}
 inline CUDA_CALLABLE void adj_randf(uint32& state, float min, float max, uint32& adj_state, float& adj_min, float& adj_max, float adj_ret) {}
@@ -195,7 +195,7 @@ inline CUDA_CALLABLE void adj_sample_unit_hemisphere_surface(uint32& state, uint
 inline CUDA_CALLABLE void adj_sample_unit_hemisphere(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
 inline CUDA_CALLABLE void adj_sample_unit_square(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
 inline CUDA_CALLABLE void adj_sample_unit_cube(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
-inline CUDA_CALLABLE void adj_sample_unit_hypercube(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
+inline CUDA_CALLABLE void adj_sample_unit_hypercube(uint32& state, uint32& adj_state, const vec4& adj_ret) {}
 /*
  * log-gamma function to support some of these distributions. The

warp/native/sparse.cu CHANGED Viewed

@@ -334,7 +334,7 @@ WP_API void bsr_matrix_from_triplets_device(
         // Ensures the sorted keys are available in summed_block_indices if needed
         if(return_summed_blocks && d_keys.Current() != tpl_block_indices)
         {
-            check_cuda(cudaMemcpy(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice));
+            check_cuda(cudaMemcpyAsync(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice, stream));
         }
     }

warp/native/svd.h CHANGED Viewed

@@ -50,12 +50,14 @@ namespace wp
 template<typename Type>
 struct _svd_config {
+    static constexpr float SVD_EPSILON = 1.e-6f;
     static constexpr float QR_GIVENS_EPSILON = 1.e-6f;
     static constexpr int JACOBI_ITERATIONS = 4;
 };
 template<>
 struct _svd_config<double> {
+    static constexpr double SVD_EPSILON = 1.e-12;
     static constexpr double QR_GIVENS_EPSILON = 1.e-12;
     static constexpr int JACOBI_ITERATIONS = 8;
 };
@@ -528,13 +530,15 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
                                   const mat_t<3,3,Type>& adj_U,
                                   const vec_t<3,Type>& adj_sigma,
                                   const mat_t<3,3,Type>& adj_V) {
+  const Type epsilon = _svd_config<Type>::SVD_EPSILON;
   Type sx2 = sigma[0] * sigma[0];
   Type sy2 = sigma[1] * sigma[1];
   Type sz2 = sigma[2] * sigma[2];
-  Type F01 = Type(1) / min(sy2 - sx2, Type(-1e-6f));
-  Type F02 = Type(1) / min(sz2 - sx2, Type(-1e-6f));
-  Type F12 = Type(1) / min(sz2 - sy2, Type(-1e-6f));
+  Type F01 = Type(1) / min(sy2 - sx2, Type(-epsilon));
+  Type F02 = Type(1) / min(sz2 - sx2, Type(-epsilon));
+  Type F12 = Type(1) / min(sz2 - sy2, Type(-epsilon));
   mat_t<3,3,Type> F = mat_t<3,3,Type>(0, F01, F02,
                   -F01, 0, F12,
@@ -553,8 +557,13 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
   mat_t<3,3,Type> sigma_term = mul(U, mul(adj_sigma_mat, VT));
-  mat_t<3,3,Type> u_term = mul(mul(U, mul(cw_mul(F, (mul(UT, adj_U) - mul(transpose(adj_U), U))), s_mat)), VT);
-  mat_t<3,3,Type> v_term = mul(U, mul(s_mat, mul(cw_mul(F, (mul(VT, adj_V) - mul(transpose(adj_V), V))), VT)));
+  mat_t<3,3,Type> skew_u = cw_mul(F, mul(UT, adj_U) - mul(transpose(adj_U), U));
+  mat_t<3,3,Type> block_u = mul(skew_u, s_mat);
+  mat_t<3,3,Type> u_term = mul(mul(U, block_u), VT);
+  mat_t<3,3,Type> skew_v = cw_mul(F, mul(VT, adj_V) - mul(transpose(adj_V), V));
+  mat_t<3,3,Type> block_v = mul(skew_v, VT);
+  mat_t<3,3,Type> v_term = mul(U, mul(s_mat, block_v));
   adj_A = adj_A + (u_term + v_term + sigma_term);
 }
@@ -583,11 +592,13 @@ inline CUDA_CALLABLE void adj_svd2(const mat_t<2,2,Type>& A,
                                    const mat_t<2,2,Type>& adj_U,
                                    const vec_t<2,Type>& adj_sigma,
                                    const mat_t<2,2,Type>& adj_V) {
+    const Type epsilon = _svd_config<Type>::SVD_EPSILON;
     Type s1_squared = sigma[0] * sigma[0];
     Type s2_squared = sigma[1] * sigma[1];
     // Compute inverse of (s1^2 - s2^2) if possible, use small epsilon to prevent division by zero
-    Type F01 = Type(1) / min(s2_squared - s1_squared, Type(-1e-6f));
+    Type F01 = Type(1) / min(s2_squared - s1_squared, Type(-epsilon));
     // Construct the matrix F for the adjoint
     mat_t<2,2,Type> F = mat_t<2,2,Type>(0.0, F01,
@@ -609,10 +620,14 @@ inline CUDA_CALLABLE void adj_svd2(const mat_t<2,2,Type>& A,
     mat_t<2,2,Type> sigma_term = mul(U, mul(adj_sigma_mat, VT));
     // Compute the adjoint contributions for U (left singular vectors)
-    mat_t<2,2,Type> u_term = mul(mul(U, mul(cw_mul(F, (mul(UT, adj_U) - mul(transpose(adj_U), U))), s_mat)), VT);
+    mat_t<2,2,Type> skew_u = cw_mul(F, mul(UT, adj_U) - mul(transpose(adj_U), U));
+    mat_t<2,2,Type> block_u = mul(skew_u, s_mat);
+    mat_t<2,2,Type> u_term = mul(mul(U, block_u), VT);
     // Compute the adjoint contributions for V (right singular vectors)
-    mat_t<2,2,Type> v_term = mul(U, mul(s_mat, mul(cw_mul(F, (mul(VT, adj_V) - mul(transpose(adj_V), V))), VT)));
+    mat_t<2,2,Type> skew_v = cw_mul(F, mul(VT, adj_V) - mul(transpose(adj_V), V));
+    mat_t<2,2,Type> block_v = mul(skew_v, VT);
+    mat_t<2,2,Type> v_term = mul(U, mul(s_mat, block_v));
     // Combine the terms to compute the adjoint of A
     adj_A = adj_A + (u_term + v_term + sigma_term);

warp/native/tile.h CHANGED Viewed

@@ -3015,21 +3015,41 @@ inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, int k, int l, const
 template <typename TileA, typename AdjTileA, typename Scalar>
 inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, const Scalar& src, AdjTileA& adj_dest, int adj_i, Scalar& adj_src)
 {
+    if (dest.grad.ptr == nullptr)
+    {
+        return;
+    }
     adj_src += dest.grad(tile_coord(i));
 }
 template <typename TileA, typename AdjTileA, typename Scalar>
 inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, Scalar& adj_src)
 {
+    if (dest.grad.ptr == nullptr)
+    {
+        return;
+    }
     adj_src += dest.grad(tile_coord(i, j));
 }
 template <typename TileA, typename AdjTileA, typename Scalar>
 inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, Scalar& adj_src)
 {
+    if (dest.grad.ptr == nullptr)
+    {
+        return;
+    }
     adj_src += dest.grad(tile_coord(i, j, k));
 }
 template <typename TileA, typename AdjTileA, typename Scalar>
 inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, int l, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, int adj_l, Scalar& adj_src)
 {
+    if (dest.grad.ptr == nullptr)
+    {
+        return;
+    }
     adj_src += dest.grad(tile_coord(i, j, k, l));
 }
@@ -3112,7 +3132,6 @@ inline CUDA_CALLABLE TileC& tile_diag_add(TileA& a, TileB& b, TileC& c)
 template <typename TileA, typename TileB, typename TileC, typename AdjTileA, typename AdjTileB, typename AdjTileC>
 inline CUDA_CALLABLE void adj_tile_diag_add(TileA& a, TileB& b, TileC& c, AdjTileA& adj_a, AdjTileB& adj_b, AdjTileC& adj_c, AdjTileC& adj_ret)
 {
-    assert(false);
 }

warp/native/tile_radix_sort.h CHANGED Viewed

@@ -122,7 +122,7 @@ inline CUDA_CALLABLE void bitonic_sort_single_stage_full_thread_block(int k, uns
         int thread_id2 = loop_id * WP_TILE_BLOCK_DIM + thread_id;
         key_register[loop_id] = thread_id2 < length ? key_sh_mem[thread_id2] : max_key_value;
-        val_register[loop_id] = thread_id2 < length ? val_sh_mem[thread_id2] : 0;
+        val_register[loop_id] = thread_id2 < length ? val_sh_mem[thread_id2] : static_cast<V>(0);
     }
     __syncthreads();
@@ -342,7 +342,11 @@ inline CUDA_CALLABLE void bitonic_sort_thread_block_shared_mem(
                 values_shared_mem[i] = values_input[i];
             }
             else
+            {
+                // Note that these values may end up in the output If enough NaN or Inf values are present in keys_input
                 keys_shared_mem[i] = key_max_possible_value;
+                values_shared_mem[i] = static_cast<V>(0);
+            }
         }
         __syncthreads();

warp/native/tile_reduce.h CHANGED Viewed

@@ -83,19 +83,7 @@ inline CUDA_CALLABLE wp::vec_t<Length, T> warp_shuffle_down(wp::vec_t<Length, T>
     wp::vec_t<Length, T> result;
     for (unsigned i=0; i < Length; ++i)
-        result.data[i] = __shfl_down_sync(mask, val.data[i], offset, WP_TILE_WARP_SIZE);
-    return result;
-}
-// Quaternion overload
-template <typename T>
-inline CUDA_CALLABLE wp::quat_t<T> warp_shuffle_down(wp::quat_t<T> val, int offset, int mask)
-{
-    wp::quat_t<T> result;
-    for (unsigned i=0; i < 4; ++i)
-        result.data[i] = __shfl_down_sync(mask, val.data[i], offset, WP_TILE_WARP_SIZE);
+        result[i] = __shfl_down_sync(mask, val[i], offset, WP_TILE_WARP_SIZE);
     return result;
 }
@@ -218,6 +206,7 @@ auto tile_reduce_impl(Op f, Tile& t)
     // ensure that only threads with at least one valid item participate in the reduction
     unsigned int mask = __ballot_sync(__activemask(), Layout::valid(Layout::linear_from_register(0)));
+    bool warp_is_active = mask != 0;
     // warp reduction
     T warp_sum = warp_reduce(thread_sum, f, mask);
@@ -233,7 +222,7 @@ auto tile_reduce_impl(Op f, Tile& t)
     // ensure active_warps is initialized
     WP_TILE_SYNC();
-    if (lane_index == 0)
+    if (lane_index == 0 && warp_is_active)
     {
         partials[warp_index] = warp_sum;
         atomicAdd(&active_warps, 1);
@@ -291,6 +280,7 @@ auto tile_arg_reduce_impl(Op f, OpTrack track, Tile& t)
     // ensure that only threads with at least one valid item participate in the reduction
     unsigned int mask = __ballot_sync(__activemask(), Layout::valid(Layout::linear_from_register(0)));
+    bool warp_is_active = mask != 0;
     // warp reduction
     ValueAndIndex<T> warp_sum = warp_reduce_tracked(thread_sum, champion_index, f, track, mask);
@@ -307,7 +297,7 @@ auto tile_arg_reduce_impl(Op f, OpTrack track, Tile& t)
     // ensure active_warps is initialized
     WP_TILE_SYNC();
-    if (lane_index == 0)
+    if (lane_index == 0 && warp_is_active)
     {
         partials[warp_index] = warp_sum.value;
         partials_idx[warp_index] = warp_sum.index;
@@ -422,25 +412,26 @@ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret)
 {
     using T = typename Tile::Type;
-#if !defined(__CUDA_ARCH__)
-    for (int i=0; i < Tile::Layout::Size; ++i)
-    {
-        adj_t(i) += adj_ret.data[0];
+    auto adj_reg = adj_ret.grad_to_register();
-    }
+#if !defined(__CUDA_ARCH__)
+    T scratch = adj_reg.data[0];
 #else
     // broadcast incoming adjoint to block
     WP_TILE_SHARED T scratch;
     if (WP_TILE_THREAD_IDX == 0)
-        scratch = adj_ret.data[0];
+        scratch = adj_reg.data[0];
     WP_TILE_SYNC();
+#endif
-    // broadcast scalar across input dimensions (note zero strides)
-    auto adj_ret_reg = tile_shared_t<T, tile_layout_strided_t<typename Tile::Layout::Shape, tile_stride_t<0, 0>>, false>(&scratch, nullptr).copy_to_register();
+    auto adj_ret_reg = tile_register_like<Tile>();
+    using Layout = typename decltype(adj_ret_reg)::Layout;
+    for (int i=0; i < Layout::NumRegs; ++i)
+    {
+        adj_ret_reg.data[i] += scratch;
+    }
     adj_t.grad_add(adj_ret_reg);
-#endif
 }
 template <typename Tile>

warp/native/tuple.h CHANGED Viewed

@@ -182,8 +182,8 @@ adj_add(
     const tuple_t<Head, Tail...>& adj_ret
 )
 {
-    adj_add(a.head, b.head, adj_ret.head);
-    adj_add(a.tail, b.tail, adj_ret.tail);
+    adj_add(a.head, b.head, adj_a.head, adj_b.head, adj_ret.head);
+    adj_add(a.tail, b.tail, adj_a.tail, adj_b.tail, adj_ret.tail);
 }
 } // namespace wp

warp/native/vec.h CHANGED Viewed

@@ -969,11 +969,11 @@ template<unsigned Length, typename Type>
 inline CUDA_CALLABLE void adj_div(Type s, vec_t<Length, Type> a, Type& adj_s, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
 {
-    adj_s -= dot(a , adj_ret)/ (s * s); // - a / s^2
-    for( unsigned i=0; i < Length; ++i )
+    for (unsigned i=0; i < Length; ++i)
     {
-        adj_a[i] += s / adj_ret[i];
+        Type inv = Type(1) / a[i];
+        adj_a[i] -= s * adj_ret[i] * inv * inv;
+        adj_s += adj_ret[i] * inv;
     }
 #if FP_CHECK

warp/native/warp.cpp CHANGED Viewed

@@ -1072,7 +1072,7 @@ WP_API float cuda_event_elapsed_time(void* start_event, void* end_event) { retur
 WP_API bool cuda_graph_begin_capture(void* context, void* stream, int external) { return false; }
 WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret) { return false; }
-WP_API bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret) { return false; }
+WP_API bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret) { return false; }
 WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; }
 WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; }
 WP_API bool cuda_graph_exec_destroy(void* context, void* graph_exec) { return false; }

warp/native/warp.cu CHANGED Viewed

@@ -309,7 +309,13 @@ int cuda_init()
                 check_cu(cuDeviceGetAttribute_f(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
                 check_cu(cuDeviceGetAttribute_f(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
                 g_devices[i].arch = 10 * major + minor;
+#ifdef CUDA_VERSION
+#if CUDA_VERSION  < 13000
+                if (g_devices[i].arch == 110) {
+                    g_devices[i].arch = 101;  // Thor SM change
+                }
+#endif
+#endif
                 g_device_map[device] = &g_devices[i];
             }
             else
@@ -2781,7 +2787,7 @@ bool capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
     return true;
 }
-bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
+bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
 {
     ContextGuard guard(context);
@@ -2789,6 +2795,13 @@ bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
     if (!check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, (cudaGraph_t)graph, cudaGraphInstantiateFlagAutoFreeOnLaunch)))
         return false;
+    // Usually uploading the graph explicitly is optional, but when updating graph nodes (e.g., indirect dispatch)
+    // then the upload is required because otherwise the graph nodes that get updated might not yet be uploaded, which
+    // results in undefined behavior.
+    CUstream cuda_stream = static_cast<CUstream>(stream);
+    if (!check_cuda(cudaGraphUpload(graph_exec, cuda_stream)))
+         return false;
     if (graph_exec_ret)
         *graph_exec_ret = graph_exec;

warp/native/warp.h CHANGED Viewed

@@ -308,7 +308,7 @@ extern "C"
     WP_API bool cuda_graph_begin_capture(void* context, void* stream, int external);
     WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret);
-    WP_API bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret);
+    WP_API bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret);
     WP_API bool cuda_graph_launch(void* graph, void* stream);
     WP_API bool cuda_graph_destroy(void* context, void* graph);
     WP_API bool cuda_graph_exec_destroy(void* context, void* graph_exec);

warp/render/render_opengl.py CHANGED Viewed

@@ -320,15 +320,14 @@ def update_vbo_transforms(
 @wp.kernel
 def update_vbo_vertices(
     points: wp.array(dtype=wp.vec3),
-    scale: wp.vec3,
     # outputs
     vbo_vertices: wp.array(dtype=float, ndim=2),
 ):
     tid = wp.tid()
     p = points[tid]
-    vbo_vertices[tid, 0] = p[0] * scale[0]
-    vbo_vertices[tid, 1] = p[1] * scale[1]
-    vbo_vertices[tid, 2] = p[2] * scale[2]
+    vbo_vertices[tid, 0] = p[0]
+    vbo_vertices[tid, 1] = p[1]
+    vbo_vertices[tid, 2] = p[2]
 @wp.kernel
@@ -422,7 +421,6 @@ def compute_gfx_vertices(
 def compute_average_normals(
     indices: wp.array(dtype=int, ndim=2),
     vertices: wp.array(dtype=wp.vec3),
-    scale: wp.vec3,
     # outputs
     normals: wp.array(dtype=wp.vec3),
     faces_per_vertex: wp.array(dtype=int),
@@ -431,9 +429,9 @@ def compute_average_normals(
     i = indices[tid, 0]
     j = indices[tid, 1]
     k = indices[tid, 2]
-    v0 = vertices[i] * scale[0]
-    v1 = vertices[j] * scale[1]
-    v2 = vertices[k] * scale[2]
+    v0 = vertices[i]
+    v1 = vertices[j]
+    v2 = vertices[k]
     n = wp.normalize(wp.cross(v1 - v0, v2 - v0))
     wp.atomic_add(normals, i, n)
     wp.atomic_add(faces_per_vertex, i, 1)
@@ -448,16 +446,15 @@ def assemble_gfx_vertices(
     vertices: wp.array(dtype=wp.vec3, ndim=1),
     normals: wp.array(dtype=wp.vec3),
     faces_per_vertex: wp.array(dtype=int),
-    scale: wp.vec3,
     # outputs
     gfx_vertices: wp.array(dtype=float, ndim=2),
 ):
     tid = wp.tid()
     v = vertices[tid]
     n = normals[tid] / float(faces_per_vertex[tid])
-    gfx_vertices[tid, 0] = v[0] * scale[0]
-    gfx_vertices[tid, 1] = v[1] * scale[1]
-    gfx_vertices[tid, 2] = v[2] * scale[2]
+    gfx_vertices[tid, 0] = v[0]
+    gfx_vertices[tid, 1] = v[1]
+    gfx_vertices[tid, 2] = v[2]
     gfx_vertices[tid, 3] = n[0]
     gfx_vertices[tid, 4] = n[1]
     gfx_vertices[tid, 5] = n[2]
@@ -2445,7 +2442,7 @@ Instances: {len(self._instances)}"""
         gl.glBindVertexArray(0)
-    def update_shape_instance(self, name, pos=None, rot=None, color1=None, color2=None, visible=None):
+    def update_shape_instance(self, name, pos=None, rot=None, color1=None, color2=None, scale=None, visible=None):
         """Update the instance properties of the shape
         Args:
@@ -2461,7 +2458,7 @@ Instances: {len(self._instances)}"""
         self._switch_context()
         if name in self._instances:
-            i, body, shape, tf, scale, old_color1, old_color2, v = self._instances[name]
+            i, body, shape, tf, old_scale, old_color1, old_color2, v = self._instances[name]
             if visible is None:
                 visible = v
             new_tf = np.copy(tf)
@@ -2474,7 +2471,7 @@ Instances: {len(self._instances)}"""
                 body,
                 shape,
                 new_tf,
-                scale,
+                old_scale if scale is None else scale,
                 old_color1 if color1 is None else color1,
                 old_color2 if color2 is None else color2,
                 visible,
@@ -2968,7 +2965,7 @@ Instances: {len(self._instances)}"""
         geo_hash = hash(("box", tuple(extents)))
         if geo_hash in self._shape_geo_hash:
             shape = self._shape_geo_hash[geo_hash]
-            if self.update_shape_instance(name, pos, rot):
+            if self.update_shape_instance(name, pos, rot, color1=color, color2=color):
                 return shape
         else:
             vertices, indices = self._create_box_mesh(extents)
@@ -3031,50 +3028,54 @@ Instances: {len(self._instances)}"""
         if not update_topology:
             if name in self._instances:
                 # Update the instance's transform.
-                self.update_shape_instance(name, pos, rot, color1=colors)
+                self.update_shape_instance(name, pos, rot, color1=colors, color2=colors, scale=scale, visible=visible)
             if shape is not None:
                 # Update the shape's point positions.
-                self.update_shape_vertices(shape, points, scale)
+                self.update_shape_vertices(shape, points)
                 if not is_template and name not in self._instances:
                     # Create a new instance.
                     body = self._resolve_body_id(parent_body)
-                    self.add_shape_instance(name, shape, body, pos, rot, color1=colors)
+                    self.add_shape_instance(name, shape, body, pos, rot, color1=colors, scale=scale)
                 return shape
         # No existing shape for the given mesh was found, or its topology may have changed,
         # so we need to define a new one either way.
-        if smooth_shading:
-            normals = wp.zeros(point_count, dtype=wp.vec3)
-            vertices = wp.array(points, dtype=wp.vec3)
-            faces_per_vertex = wp.zeros(point_count, dtype=int)
-            wp.launch(
-                compute_average_normals,
-                dim=idx_count,
-                inputs=[wp.array(indices, dtype=int), vertices, scale],
-                outputs=[normals, faces_per_vertex],
-            )
-            gfx_vertices = wp.zeros((point_count, 8), dtype=float)
-            wp.launch(
-                assemble_gfx_vertices,
-                dim=point_count,
-                inputs=[vertices, normals, faces_per_vertex, scale],
-                outputs=[gfx_vertices],
-            )
-            gfx_vertices = gfx_vertices.numpy()
-            gfx_indices = indices.flatten()
-        else:
-            gfx_vertices = wp.zeros((idx_count * 3, 8), dtype=float)
-            wp.launch(
-                compute_gfx_vertices,
-                dim=idx_count,
-                inputs=[wp.array(indices, dtype=int), wp.array(points, dtype=wp.vec3), scale],
-                outputs=[gfx_vertices],
-            )
-            gfx_vertices = gfx_vertices.numpy()
-            gfx_indices = np.arange(idx_count * 3)
+        with wp.ScopedDevice(self._device):
+            if smooth_shading:
+                normals = wp.zeros(point_count, dtype=wp.vec3)
+                vertices = wp.array(points, dtype=wp.vec3)
+                faces_per_vertex = wp.zeros(point_count, dtype=int)
+                wp.launch(
+                    compute_average_normals,
+                    dim=idx_count,
+                    inputs=[wp.array(indices, dtype=int), vertices],
+                    outputs=[normals, faces_per_vertex],
+                    record_tape=False,
+                )
+                gfx_vertices = wp.zeros((point_count, 8), dtype=float)
+                wp.launch(
+                    assemble_gfx_vertices,
+                    dim=point_count,
+                    inputs=[vertices, normals, faces_per_vertex],
+                    outputs=[gfx_vertices],
+                    record_tape=False,
+                )
+                gfx_vertices = gfx_vertices.numpy()
+                gfx_indices = indices.flatten()
+            else:
+                gfx_vertices = wp.zeros((idx_count * 3, 8), dtype=float)
+                wp.launch(
+                    compute_gfx_vertices,
+                    dim=idx_count,
+                    inputs=[wp.array(indices, dtype=int), wp.array(points, dtype=wp.vec3)],
+                    outputs=[gfx_vertices],
+                    record_tape=False,
+                )
+                gfx_vertices = gfx_vertices.numpy()
+                gfx_indices = np.arange(idx_count * 3)
         # If there was a shape for the given mesh, clean it up.
         if shape is not None:
@@ -3090,7 +3091,7 @@ Instances: {len(self._instances)}"""
         if not is_template:
             # Create a new instance if necessary.
             body = self._resolve_body_id(parent_body)
-            self.add_shape_instance(name, shape, body, pos, rot, color1=colors)
+            self.add_shape_instance(name, shape, body, pos, rot, color1=colors, scale=scale)
         return shape
@@ -3278,7 +3279,7 @@ Instances: {len(self._instances)}"""
         lines = np.array(lines)
         self._render_lines(name, lines, color, radius)
-    def update_shape_vertices(self, shape, points, scale):
+    def update_shape_vertices(self, shape, points):
         if isinstance(points, wp.array):
             wp_points = points.to(self._device)
         else:
@@ -3291,7 +3292,7 @@ Instances: {len(self._instances)}"""
         wp.launch(
             update_vbo_vertices,
             dim=vertices_shape[0],
-            inputs=[wp_points, scale],
+            inputs=[wp_points],
             outputs=[vbo_vertices],
             device=self._device,
         )

warp/render/render_usd.py CHANGED Viewed

@@ -647,7 +647,6 @@ class UsdRenderer:
             mesh.GetDisplayColorAttr().Set(colors, self.time)
         self._shape_constructors[name] = UsdGeom.Mesh
-        self._shape_custom_scale[name] = scale
         if not is_template:
             _usd_set_xform(mesh, pos, rot, scale, self.time)

warp/sim/collide.py CHANGED Viewed

@@ -1236,8 +1236,7 @@ def handle_contact_pairs(
         p_b_body = closest_point_box(geo_scale_b, query_b)
         p_b_world = wp.transform_point(X_ws_b, p_b_body)
         diff = p_a_world - p_b_world
-        # use center of box A to query normal to make sure we are not inside B
-        query_b = wp.transform_point(X_sw_b, wp.transform_get_translation(X_ws_a))
         normal = wp.transform_vector(X_ws_b, box_sdf_grad(geo_scale_b, query_b))
         distance = wp.dot(diff, normal)

warp/sim/integrator_vbd.py CHANGED Viewed

@@ -1379,6 +1379,8 @@ def VBD_solve_trimesh_no_self_contact(
     edge_rest_length: wp.array(dtype=float),
     edge_bending_properties: wp.array(dtype=float, ndim=2),
     adjacency: ForceElementAdjacencyInfo,
+    particle_forces: wp.array(dtype=wp.vec3),
+    particle_hessians: wp.array(dtype=wp.mat33),
     # contact info
     soft_contact_ke: float,
     soft_contact_kd: float,
@@ -1493,9 +1495,11 @@ def VBD_solve_trimesh_no_self_contact(
             dt,
         )
-        f = f + ground_contact_force
-        h = h + ground_contact_hessian
+        f += ground_contact_force
+        h += ground_contact_hessian
+    f += particle_forces[particle_index]
+    h += particle_hessians[particle_index]
     if abs(wp.determinant(h)) > 1e-5:
         hInv = wp.inverse(h)
         pos_new[particle_index] = particle_pos + hInv * f
@@ -2138,6 +2142,8 @@ class VBDIntegrator(Integrator):
         )
         for _iter in range(self.iterations):
+            self.particle_forces.zero_()
+            self.particle_hessians.zero_()
             for color in range(len(self.model.particle_color_groups)):
                 wp.launch(
                     kernel=VBD_accumulate_contact_force_and_hessian_no_self_contact,
@@ -2191,6 +2197,8 @@ class VBDIntegrator(Integrator):
                         self.model.edge_rest_length,
                         self.model.edge_bending_properties,
                         self.adjacency,
+                        self.particle_forces,
+                        self.particle_hessians,
                         self.model.soft_contact_ke,
                         self.model.soft_contact_kd,
                         self.model.soft_contact_mu,