warp-lang 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.8.1__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (59) hide show
  1. warp/bin/warp-clang.so +0 -0
  2. warp/bin/warp.so +0 -0
  3. warp/build_dll.py +5 -0
  4. warp/codegen.py +15 -3
  5. warp/config.py +1 -1
  6. warp/context.py +122 -24
  7. warp/examples/interop/example_jax_callable.py +34 -4
  8. warp/examples/interop/example_jax_kernel.py +27 -1
  9. warp/fem/field/virtual.py +2 -0
  10. warp/fem/integrate.py +78 -47
  11. warp/jax_experimental/ffi.py +201 -53
  12. warp/native/array.h +4 -4
  13. warp/native/builtin.h +8 -4
  14. warp/native/coloring.cpp +5 -1
  15. warp/native/cuda_util.cpp +1 -1
  16. warp/native/intersect.h +2 -2
  17. warp/native/mat.h +3 -3
  18. warp/native/mesh.h +1 -1
  19. warp/native/quat.h +6 -2
  20. warp/native/rand.h +7 -7
  21. warp/native/sparse.cu +1 -1
  22. warp/native/svd.h +23 -8
  23. warp/native/tile.h +20 -1
  24. warp/native/tile_radix_sort.h +5 -1
  25. warp/native/tile_reduce.h +16 -25
  26. warp/native/tuple.h +2 -2
  27. warp/native/vec.h +4 -4
  28. warp/native/warp.cpp +1 -1
  29. warp/native/warp.cu +15 -2
  30. warp/native/warp.h +1 -1
  31. warp/render/render_opengl.py +52 -51
  32. warp/render/render_usd.py +0 -1
  33. warp/sim/collide.py +1 -2
  34. warp/sim/integrator_vbd.py +10 -2
  35. warp/sparse.py +1 -1
  36. warp/tape.py +2 -0
  37. warp/tests/sim/test_cloth.py +89 -6
  38. warp/tests/sim/test_coloring.py +76 -1
  39. warp/tests/test_assert.py +53 -0
  40. warp/tests/test_atomic_cas.py +127 -114
  41. warp/tests/test_mat.py +22 -0
  42. warp/tests/test_quat.py +22 -0
  43. warp/tests/test_sparse.py +32 -0
  44. warp/tests/test_static.py +48 -0
  45. warp/tests/test_tape.py +38 -0
  46. warp/tests/test_vec.py +38 -408
  47. warp/tests/test_vec_constructors.py +325 -0
  48. warp/tests/tile/test_tile.py +31 -143
  49. warp/tests/tile/test_tile_mathdx.py +2 -2
  50. warp/tests/tile/test_tile_matmul.py +179 -0
  51. warp/tests/tile/test_tile_reduce.py +100 -11
  52. warp/tests/tile/test_tile_shared_memory.py +12 -12
  53. warp/tests/tile/test_tile_sort.py +59 -55
  54. warp/tests/unittest_suites.py +10 -0
  55. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/METADATA +4 -4
  56. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/RECORD +59 -57
  57. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
  58. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
  59. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0
warp/native/quat.h CHANGED
@@ -904,8 +904,12 @@ inline CUDA_CALLABLE void adj_div(quat_t<Type> a, Type s, quat_t<Type>& adj_a, T
904
904
  template<typename Type>
905
905
  inline CUDA_CALLABLE void adj_div(Type s, quat_t<Type> a, Type& adj_s, quat_t<Type>& adj_a, const quat_t<Type>& adj_ret)
906
906
  {
907
- adj_s -= dot(a, adj_ret)/ (s * s); // - a / s^2
908
- adj_a += s / adj_ret;
907
+ for (unsigned i=0; i < 4; ++i)
908
+ {
909
+ Type inv = Type(1) / a[i];
910
+ adj_a[i] -= s * adj_ret[i] * inv * inv;
911
+ adj_s += adj_ret[i] * inv;
912
+ }
909
913
  }
910
914
 
911
915
  template<typename Type>
warp/native/rand.h CHANGED
@@ -71,14 +71,14 @@ inline CUDA_CALLABLE float randf(uint32& state, float min, float max) { return (
71
71
  // Box-Muller method
72
72
  inline CUDA_CALLABLE float randn(uint32& state) { return sqrt(-2.f * log(randf(state) + RANDN_EPSILON)) * cos(2.f * M_PI_F * randf(state)); }
73
73
 
74
- inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed, float adj_ret) {}
75
- inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset, float adj_ret) {}
74
+ inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed, uint32 adj_ret) {}
75
+ inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset, uint32 adj_ret) {}
76
76
 
77
- inline CUDA_CALLABLE void adj_randi(uint32& state, uint32& adj_state, float adj_ret) {}
78
- inline CUDA_CALLABLE void adj_randi(uint32& state, int min, int max, uint32& adj_state, int& adj_min, int& adj_max, float adj_ret) {}
77
+ inline CUDA_CALLABLE void adj_randi(uint32& state, uint32& adj_state, int adj_ret) {}
78
+ inline CUDA_CALLABLE void adj_randi(uint32& state, int min, int max, uint32& adj_state, int& adj_min, int& adj_max, int adj_ret) {}
79
79
 
80
- inline CUDA_CALLABLE void adj_randu(uint32& state, uint32& adj_state, float adj_ret) {}
81
- inline CUDA_CALLABLE void adj_randu(uint32& state, uint32 min, uint32 max, uint32& adj_state, uint32& adj_min, uint32& adj_max, float adj_ret) {}
80
+ inline CUDA_CALLABLE void adj_randu(uint32& state, uint32& adj_state, uint32 adj_ret) {}
81
+ inline CUDA_CALLABLE void adj_randu(uint32& state, uint32 min, uint32 max, uint32& adj_state, uint32& adj_min, uint32& adj_max, uint32 adj_ret) {}
82
82
 
83
83
  inline CUDA_CALLABLE void adj_randf(uint32& state, uint32& adj_state, float adj_ret) {}
84
84
  inline CUDA_CALLABLE void adj_randf(uint32& state, float min, float max, uint32& adj_state, float& adj_min, float& adj_max, float adj_ret) {}
@@ -195,7 +195,7 @@ inline CUDA_CALLABLE void adj_sample_unit_hemisphere_surface(uint32& state, uint
195
195
  inline CUDA_CALLABLE void adj_sample_unit_hemisphere(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
196
196
  inline CUDA_CALLABLE void adj_sample_unit_square(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
197
197
  inline CUDA_CALLABLE void adj_sample_unit_cube(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
198
- inline CUDA_CALLABLE void adj_sample_unit_hypercube(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
198
+ inline CUDA_CALLABLE void adj_sample_unit_hypercube(uint32& state, uint32& adj_state, const vec4& adj_ret) {}
199
199
 
200
200
  /*
201
201
  * log-gamma function to support some of these distributions. The
warp/native/sparse.cu CHANGED
@@ -334,7 +334,7 @@ WP_API void bsr_matrix_from_triplets_device(
334
334
  // Ensures the sorted keys are available in summed_block_indices if needed
335
335
  if(return_summed_blocks && d_keys.Current() != tpl_block_indices)
336
336
  {
337
- check_cuda(cudaMemcpy(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice));
337
+ check_cuda(cudaMemcpyAsync(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice, stream));
338
338
  }
339
339
  }
340
340
 
warp/native/svd.h CHANGED
@@ -50,12 +50,14 @@ namespace wp
50
50
 
51
51
  template<typename Type>
52
52
  struct _svd_config {
53
+ static constexpr float SVD_EPSILON = 1.e-6f;
53
54
  static constexpr float QR_GIVENS_EPSILON = 1.e-6f;
54
55
  static constexpr int JACOBI_ITERATIONS = 4;
55
56
  };
56
57
 
57
58
  template<>
58
59
  struct _svd_config<double> {
60
+ static constexpr double SVD_EPSILON = 1.e-12;
59
61
  static constexpr double QR_GIVENS_EPSILON = 1.e-12;
60
62
  static constexpr int JACOBI_ITERATIONS = 8;
61
63
  };
@@ -528,13 +530,15 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
528
530
  const mat_t<3,3,Type>& adj_U,
529
531
  const vec_t<3,Type>& adj_sigma,
530
532
  const mat_t<3,3,Type>& adj_V) {
533
+ const Type epsilon = _svd_config<Type>::SVD_EPSILON;
534
+
531
535
  Type sx2 = sigma[0] * sigma[0];
532
536
  Type sy2 = sigma[1] * sigma[1];
533
537
  Type sz2 = sigma[2] * sigma[2];
534
538
 
535
- Type F01 = Type(1) / min(sy2 - sx2, Type(-1e-6f));
536
- Type F02 = Type(1) / min(sz2 - sx2, Type(-1e-6f));
537
- Type F12 = Type(1) / min(sz2 - sy2, Type(-1e-6f));
539
+ Type F01 = Type(1) / min(sy2 - sx2, Type(-epsilon));
540
+ Type F02 = Type(1) / min(sz2 - sx2, Type(-epsilon));
541
+ Type F12 = Type(1) / min(sz2 - sy2, Type(-epsilon));
538
542
 
539
543
  mat_t<3,3,Type> F = mat_t<3,3,Type>(0, F01, F02,
540
544
  -F01, 0, F12,
@@ -553,8 +557,13 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
553
557
 
554
558
  mat_t<3,3,Type> sigma_term = mul(U, mul(adj_sigma_mat, VT));
555
559
 
556
- mat_t<3,3,Type> u_term = mul(mul(U, mul(cw_mul(F, (mul(UT, adj_U) - mul(transpose(adj_U), U))), s_mat)), VT);
557
- mat_t<3,3,Type> v_term = mul(U, mul(s_mat, mul(cw_mul(F, (mul(VT, adj_V) - mul(transpose(adj_V), V))), VT)));
560
+ mat_t<3,3,Type> skew_u = cw_mul(F, mul(UT, adj_U) - mul(transpose(adj_U), U));
561
+ mat_t<3,3,Type> block_u = mul(skew_u, s_mat);
562
+ mat_t<3,3,Type> u_term = mul(mul(U, block_u), VT);
563
+
564
+ mat_t<3,3,Type> skew_v = cw_mul(F, mul(VT, adj_V) - mul(transpose(adj_V), V));
565
+ mat_t<3,3,Type> block_v = mul(skew_v, VT);
566
+ mat_t<3,3,Type> v_term = mul(U, mul(s_mat, block_v));
558
567
 
559
568
  adj_A = adj_A + (u_term + v_term + sigma_term);
560
569
  }
@@ -583,11 +592,13 @@ inline CUDA_CALLABLE void adj_svd2(const mat_t<2,2,Type>& A,
583
592
  const mat_t<2,2,Type>& adj_U,
584
593
  const vec_t<2,Type>& adj_sigma,
585
594
  const mat_t<2,2,Type>& adj_V) {
595
+ const Type epsilon = _svd_config<Type>::SVD_EPSILON;
596
+
586
597
  Type s1_squared = sigma[0] * sigma[0];
587
598
  Type s2_squared = sigma[1] * sigma[1];
588
599
 
589
600
  // Compute inverse of (s1^2 - s2^2) if possible, use small epsilon to prevent division by zero
590
- Type F01 = Type(1) / min(s2_squared - s1_squared, Type(-1e-6f));
601
+ Type F01 = Type(1) / min(s2_squared - s1_squared, Type(-epsilon));
591
602
 
592
603
  // Construct the matrix F for the adjoint
593
604
  mat_t<2,2,Type> F = mat_t<2,2,Type>(0.0, F01,
@@ -609,10 +620,14 @@ inline CUDA_CALLABLE void adj_svd2(const mat_t<2,2,Type>& A,
609
620
  mat_t<2,2,Type> sigma_term = mul(U, mul(adj_sigma_mat, VT));
610
621
 
611
622
  // Compute the adjoint contributions for U (left singular vectors)
612
- mat_t<2,2,Type> u_term = mul(mul(U, mul(cw_mul(F, (mul(UT, adj_U) - mul(transpose(adj_U), U))), s_mat)), VT);
623
+ mat_t<2,2,Type> skew_u = cw_mul(F, mul(UT, adj_U) - mul(transpose(adj_U), U));
624
+ mat_t<2,2,Type> block_u = mul(skew_u, s_mat);
625
+ mat_t<2,2,Type> u_term = mul(mul(U, block_u), VT);
613
626
 
614
627
  // Compute the adjoint contributions for V (right singular vectors)
615
- mat_t<2,2,Type> v_term = mul(U, mul(s_mat, mul(cw_mul(F, (mul(VT, adj_V) - mul(transpose(adj_V), V))), VT)));
628
+ mat_t<2,2,Type> skew_v = cw_mul(F, mul(VT, adj_V) - mul(transpose(adj_V), V));
629
+ mat_t<2,2,Type> block_v = mul(skew_v, VT);
630
+ mat_t<2,2,Type> v_term = mul(U, mul(s_mat, block_v));
616
631
 
617
632
  // Combine the terms to compute the adjoint of A
618
633
  adj_A = adj_A + (u_term + v_term + sigma_term);
warp/native/tile.h CHANGED
@@ -3015,21 +3015,41 @@ inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, int k, int l, const
3015
3015
  template <typename TileA, typename AdjTileA, typename Scalar>
3016
3016
  inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, const Scalar& src, AdjTileA& adj_dest, int adj_i, Scalar& adj_src)
3017
3017
  {
3018
+ if (dest.grad.ptr == nullptr)
3019
+ {
3020
+ return;
3021
+ }
3022
+
3018
3023
  adj_src += dest.grad(tile_coord(i));
3019
3024
  }
3020
3025
  template <typename TileA, typename AdjTileA, typename Scalar>
3021
3026
  inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, Scalar& adj_src)
3022
3027
  {
3028
+ if (dest.grad.ptr == nullptr)
3029
+ {
3030
+ return;
3031
+ }
3032
+
3023
3033
  adj_src += dest.grad(tile_coord(i, j));
3024
3034
  }
3025
3035
  template <typename TileA, typename AdjTileA, typename Scalar>
3026
3036
  inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, Scalar& adj_src)
3027
3037
  {
3038
+ if (dest.grad.ptr == nullptr)
3039
+ {
3040
+ return;
3041
+ }
3042
+
3028
3043
  adj_src += dest.grad(tile_coord(i, j, k));
3029
3044
  }
3030
3045
  template <typename TileA, typename AdjTileA, typename Scalar>
3031
3046
  inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, int l, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, int adj_l, Scalar& adj_src)
3032
3047
  {
3048
+ if (dest.grad.ptr == nullptr)
3049
+ {
3050
+ return;
3051
+ }
3052
+
3033
3053
  adj_src += dest.grad(tile_coord(i, j, k, l));
3034
3054
  }
3035
3055
 
@@ -3112,7 +3132,6 @@ inline CUDA_CALLABLE TileC& tile_diag_add(TileA& a, TileB& b, TileC& c)
3112
3132
  template <typename TileA, typename TileB, typename TileC, typename AdjTileA, typename AdjTileB, typename AdjTileC>
3113
3133
  inline CUDA_CALLABLE void adj_tile_diag_add(TileA& a, TileB& b, TileC& c, AdjTileA& adj_a, AdjTileB& adj_b, AdjTileC& adj_c, AdjTileC& adj_ret)
3114
3134
  {
3115
- assert(false);
3116
3135
  }
3117
3136
 
3118
3137
 
warp/native/tile_radix_sort.h CHANGED
@@ -122,7 +122,7 @@ inline CUDA_CALLABLE void bitonic_sort_single_stage_full_thread_block(int k, uns
122
122
  int thread_id2 = loop_id * WP_TILE_BLOCK_DIM + thread_id;
123
123
 
124
124
  key_register[loop_id] = thread_id2 < length ? key_sh_mem[thread_id2] : max_key_value;
125
- val_register[loop_id] = thread_id2 < length ? val_sh_mem[thread_id2] : 0;
125
+ val_register[loop_id] = thread_id2 < length ? val_sh_mem[thread_id2] : static_cast<V>(0);
126
126
  }
127
127
 
128
128
  __syncthreads();
@@ -342,7 +342,11 @@ inline CUDA_CALLABLE void bitonic_sort_thread_block_shared_mem(
342
342
  values_shared_mem[i] = values_input[i];
343
343
  }
344
344
  else
345
+ {
346
+ // Note that these values may end up in the output if enough NaN or Inf values are present in keys_input
345
347
  keys_shared_mem[i] = key_max_possible_value;
348
+ values_shared_mem[i] = static_cast<V>(0);
349
+ }
346
350
  }
347
351
  __syncthreads();
348
352
 
warp/native/tile_reduce.h CHANGED
@@ -83,19 +83,7 @@ inline CUDA_CALLABLE wp::vec_t<Length, T> warp_shuffle_down(wp::vec_t<Length, T>
83
83
  wp::vec_t<Length, T> result;
84
84
 
85
85
  for (unsigned i=0; i < Length; ++i)
86
- result.data[i] = __shfl_down_sync(mask, val.data[i], offset, WP_TILE_WARP_SIZE);
87
-
88
- return result;
89
- }
90
-
91
- // Quaternion overload
92
- template <typename T>
93
- inline CUDA_CALLABLE wp::quat_t<T> warp_shuffle_down(wp::quat_t<T> val, int offset, int mask)
94
- {
95
- wp::quat_t<T> result;
96
-
97
- for (unsigned i=0; i < 4; ++i)
98
- result.data[i] = __shfl_down_sync(mask, val.data[i], offset, WP_TILE_WARP_SIZE);
86
+ result[i] = __shfl_down_sync(mask, val[i], offset, WP_TILE_WARP_SIZE);
99
87
 
100
88
  return result;
101
89
  }
@@ -218,6 +206,7 @@ auto tile_reduce_impl(Op f, Tile& t)
218
206
 
219
207
  // ensure that only threads with at least one valid item participate in the reduction
220
208
  unsigned int mask = __ballot_sync(__activemask(), Layout::valid(Layout::linear_from_register(0)));
209
+ bool warp_is_active = mask != 0;
221
210
 
222
211
  // warp reduction
223
212
  T warp_sum = warp_reduce(thread_sum, f, mask);
@@ -233,7 +222,7 @@ auto tile_reduce_impl(Op f, Tile& t)
233
222
  // ensure active_warps is initialized
234
223
  WP_TILE_SYNC();
235
224
 
236
- if (lane_index == 0)
225
+ if (lane_index == 0 && warp_is_active)
237
226
  {
238
227
  partials[warp_index] = warp_sum;
239
228
  atomicAdd(&active_warps, 1);
@@ -291,6 +280,7 @@ auto tile_arg_reduce_impl(Op f, OpTrack track, Tile& t)
291
280
 
292
281
  // ensure that only threads with at least one valid item participate in the reduction
293
282
  unsigned int mask = __ballot_sync(__activemask(), Layout::valid(Layout::linear_from_register(0)));
283
+ bool warp_is_active = mask != 0;
294
284
 
295
285
  // warp reduction
296
286
  ValueAndIndex<T> warp_sum = warp_reduce_tracked(thread_sum, champion_index, f, track, mask);
@@ -307,7 +297,7 @@ auto tile_arg_reduce_impl(Op f, OpTrack track, Tile& t)
307
297
  // ensure active_warps is initialized
308
298
  WP_TILE_SYNC();
309
299
 
310
- if (lane_index == 0)
300
+ if (lane_index == 0 && warp_is_active)
311
301
  {
312
302
  partials[warp_index] = warp_sum.value;
313
303
  partials_idx[warp_index] = warp_sum.index;
@@ -422,25 +412,26 @@ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret)
422
412
  {
423
413
  using T = typename Tile::Type;
424
414
 
425
- #if !defined(__CUDA_ARCH__)
426
-
427
- for (int i=0; i < Tile::Layout::Size; ++i)
428
- {
429
- adj_t(i) += adj_ret.data[0];
415
+ auto adj_reg = adj_ret.grad_to_register();
430
416
 
431
- }
417
+ #if !defined(__CUDA_ARCH__)
418
+ T scratch = adj_reg.data[0];
432
419
  #else
433
420
  // broadcast incoming adjoint to block
434
421
  WP_TILE_SHARED T scratch;
435
422
  if (WP_TILE_THREAD_IDX == 0)
436
- scratch = adj_ret.data[0];
423
+ scratch = adj_reg.data[0];
437
424
 
438
425
  WP_TILE_SYNC();
426
+ #endif
439
427
 
440
- // broadcast scalar across input dimensions (note zero strides)
441
- auto adj_ret_reg = tile_shared_t<T, tile_layout_strided_t<typename Tile::Layout::Shape, tile_stride_t<0, 0>>, false>(&scratch, nullptr).copy_to_register();
428
+ auto adj_ret_reg = tile_register_like<Tile>();
429
+ using Layout = typename decltype(adj_ret_reg)::Layout;
430
+ for (int i=0; i < Layout::NumRegs; ++i)
431
+ {
432
+ adj_ret_reg.data[i] += scratch;
433
+ }
442
434
  adj_t.grad_add(adj_ret_reg);
443
- #endif
444
435
  }
445
436
 
446
437
  template <typename Tile>
warp/native/tuple.h CHANGED
@@ -182,8 +182,8 @@ adj_add(
182
182
  const tuple_t<Head, Tail...>& adj_ret
183
183
  )
184
184
  {
185
- adj_add(a.head, b.head, adj_ret.head);
186
- adj_add(a.tail, b.tail, adj_ret.tail);
185
+ adj_add(a.head, b.head, adj_a.head, adj_b.head, adj_ret.head);
186
+ adj_add(a.tail, b.tail, adj_a.tail, adj_b.tail, adj_ret.tail);
187
187
  }
188
188
 
189
189
  } // namespace wp
warp/native/vec.h CHANGED
@@ -969,11 +969,11 @@ template<unsigned Length, typename Type>
969
969
  inline CUDA_CALLABLE void adj_div(Type s, vec_t<Length, Type> a, Type& adj_s, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
970
970
  {
971
971
 
972
- adj_s -= dot(a , adj_ret)/ (s * s); // - a / s^2
973
-
974
- for( unsigned i=0; i < Length; ++i )
972
+ for (unsigned i=0; i < Length; ++i)
975
973
  {
976
- adj_a[i] += s / adj_ret[i];
974
+ Type inv = Type(1) / a[i];
975
+ adj_a[i] -= s * adj_ret[i] * inv * inv;
976
+ adj_s += adj_ret[i] * inv;
977
977
  }
978
978
 
979
979
  #if FP_CHECK
warp/native/warp.cpp CHANGED
@@ -1072,7 +1072,7 @@ WP_API float cuda_event_elapsed_time(void* start_event, void* end_event) { retur
1072
1072
 
1073
1073
  WP_API bool cuda_graph_begin_capture(void* context, void* stream, int external) { return false; }
1074
1074
  WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret) { return false; }
1075
- WP_API bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret) { return false; }
1075
+ WP_API bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret) { return false; }
1076
1076
  WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; }
1077
1077
  WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; }
1078
1078
  WP_API bool cuda_graph_exec_destroy(void* context, void* graph_exec) { return false; }
warp/native/warp.cu CHANGED
@@ -309,7 +309,13 @@ int cuda_init()
309
309
  check_cu(cuDeviceGetAttribute_f(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
310
310
  check_cu(cuDeviceGetAttribute_f(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
311
311
  g_devices[i].arch = 10 * major + minor;
312
-
312
+ #ifdef CUDA_VERSION
313
+ #if CUDA_VERSION < 13000
314
+ if (g_devices[i].arch == 110) {
315
+ g_devices[i].arch = 101; // Thor SM change
316
+ }
317
+ #endif
318
+ #endif
313
319
  g_device_map[device] = &g_devices[i];
314
320
  }
315
321
  else
@@ -2781,7 +2787,7 @@ bool capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
2781
2787
  return true;
2782
2788
  }
2783
2789
 
2784
- bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
2790
+ bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
2785
2791
  {
2786
2792
  ContextGuard guard(context);
2787
2793
 
@@ -2789,6 +2795,13 @@ bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
2789
2795
  if (!check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, (cudaGraph_t)graph, cudaGraphInstantiateFlagAutoFreeOnLaunch)))
2790
2796
  return false;
2791
2797
 
2798
+ // Usually uploading the graph explicitly is optional, but when updating graph nodes (e.g., indirect dispatch)
2799
+ // then the upload is required because otherwise the graph nodes that get updated might not yet be uploaded, which
2800
+ // results in undefined behavior.
2801
+ CUstream cuda_stream = static_cast<CUstream>(stream);
2802
+ if (!check_cuda(cudaGraphUpload(graph_exec, cuda_stream)))
2803
+ return false;
2804
+
2792
2805
  if (graph_exec_ret)
2793
2806
  *graph_exec_ret = graph_exec;
2794
2807
 
warp/native/warp.h CHANGED
@@ -308,7 +308,7 @@ extern "C"
308
308
 
309
309
  WP_API bool cuda_graph_begin_capture(void* context, void* stream, int external);
310
310
  WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret);
311
- WP_API bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret);
311
+ WP_API bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret);
312
312
  WP_API bool cuda_graph_launch(void* graph, void* stream);
313
313
  WP_API bool cuda_graph_destroy(void* context, void* graph);
314
314
  WP_API bool cuda_graph_exec_destroy(void* context, void* graph_exec);
warp/render/render_opengl.py CHANGED
@@ -320,15 +320,14 @@ def update_vbo_transforms(
320
320
  @wp.kernel
321
321
  def update_vbo_vertices(
322
322
  points: wp.array(dtype=wp.vec3),
323
- scale: wp.vec3,
324
323
  # outputs
325
324
  vbo_vertices: wp.array(dtype=float, ndim=2),
326
325
  ):
327
326
  tid = wp.tid()
328
327
  p = points[tid]
329
- vbo_vertices[tid, 0] = p[0] * scale[0]
330
- vbo_vertices[tid, 1] = p[1] * scale[1]
331
- vbo_vertices[tid, 2] = p[2] * scale[2]
328
+ vbo_vertices[tid, 0] = p[0]
329
+ vbo_vertices[tid, 1] = p[1]
330
+ vbo_vertices[tid, 2] = p[2]
332
331
 
333
332
 
334
333
  @wp.kernel
@@ -422,7 +421,6 @@ def compute_gfx_vertices(
422
421
  def compute_average_normals(
423
422
  indices: wp.array(dtype=int, ndim=2),
424
423
  vertices: wp.array(dtype=wp.vec3),
425
- scale: wp.vec3,
426
424
  # outputs
427
425
  normals: wp.array(dtype=wp.vec3),
428
426
  faces_per_vertex: wp.array(dtype=int),
@@ -431,9 +429,9 @@ def compute_average_normals(
431
429
  i = indices[tid, 0]
432
430
  j = indices[tid, 1]
433
431
  k = indices[tid, 2]
434
- v0 = vertices[i] * scale[0]
435
- v1 = vertices[j] * scale[1]
436
- v2 = vertices[k] * scale[2]
432
+ v0 = vertices[i]
433
+ v1 = vertices[j]
434
+ v2 = vertices[k]
437
435
  n = wp.normalize(wp.cross(v1 - v0, v2 - v0))
438
436
  wp.atomic_add(normals, i, n)
439
437
  wp.atomic_add(faces_per_vertex, i, 1)
@@ -448,16 +446,15 @@ def assemble_gfx_vertices(
448
446
  vertices: wp.array(dtype=wp.vec3, ndim=1),
449
447
  normals: wp.array(dtype=wp.vec3),
450
448
  faces_per_vertex: wp.array(dtype=int),
451
- scale: wp.vec3,
452
449
  # outputs
453
450
  gfx_vertices: wp.array(dtype=float, ndim=2),
454
451
  ):
455
452
  tid = wp.tid()
456
453
  v = vertices[tid]
457
454
  n = normals[tid] / float(faces_per_vertex[tid])
458
- gfx_vertices[tid, 0] = v[0] * scale[0]
459
- gfx_vertices[tid, 1] = v[1] * scale[1]
460
- gfx_vertices[tid, 2] = v[2] * scale[2]
455
+ gfx_vertices[tid, 0] = v[0]
456
+ gfx_vertices[tid, 1] = v[1]
457
+ gfx_vertices[tid, 2] = v[2]
461
458
  gfx_vertices[tid, 3] = n[0]
462
459
  gfx_vertices[tid, 4] = n[1]
463
460
  gfx_vertices[tid, 5] = n[2]
@@ -2445,7 +2442,7 @@ Instances: {len(self._instances)}"""
2445
2442
 
2446
2443
  gl.glBindVertexArray(0)
2447
2444
 
2448
- def update_shape_instance(self, name, pos=None, rot=None, color1=None, color2=None, visible=None):
2445
+ def update_shape_instance(self, name, pos=None, rot=None, color1=None, color2=None, scale=None, visible=None):
2449
2446
  """Update the instance properties of the shape
2450
2447
 
2451
2448
  Args:
@@ -2461,7 +2458,7 @@ Instances: {len(self._instances)}"""
2461
2458
  self._switch_context()
2462
2459
 
2463
2460
  if name in self._instances:
2464
- i, body, shape, tf, scale, old_color1, old_color2, v = self._instances[name]
2461
+ i, body, shape, tf, old_scale, old_color1, old_color2, v = self._instances[name]
2465
2462
  if visible is None:
2466
2463
  visible = v
2467
2464
  new_tf = np.copy(tf)
@@ -2474,7 +2471,7 @@ Instances: {len(self._instances)}"""
2474
2471
  body,
2475
2472
  shape,
2476
2473
  new_tf,
2477
- scale,
2474
+ old_scale if scale is None else scale,
2478
2475
  old_color1 if color1 is None else color1,
2479
2476
  old_color2 if color2 is None else color2,
2480
2477
  visible,
@@ -2968,7 +2965,7 @@ Instances: {len(self._instances)}"""
2968
2965
  geo_hash = hash(("box", tuple(extents)))
2969
2966
  if geo_hash in self._shape_geo_hash:
2970
2967
  shape = self._shape_geo_hash[geo_hash]
2971
- if self.update_shape_instance(name, pos, rot):
2968
+ if self.update_shape_instance(name, pos, rot, color1=color, color2=color):
2972
2969
  return shape
2973
2970
  else:
2974
2971
  vertices, indices = self._create_box_mesh(extents)
@@ -3031,50 +3028,54 @@ Instances: {len(self._instances)}"""
3031
3028
  if not update_topology:
3032
3029
  if name in self._instances:
3033
3030
  # Update the instance's transform.
3034
- self.update_shape_instance(name, pos, rot, color1=colors)
3031
+ self.update_shape_instance(name, pos, rot, color1=colors, color2=colors, scale=scale, visible=visible)
3035
3032
 
3036
3033
  if shape is not None:
3037
3034
  # Update the shape's point positions.
3038
- self.update_shape_vertices(shape, points, scale)
3035
+ self.update_shape_vertices(shape, points)
3039
3036
 
3040
3037
  if not is_template and name not in self._instances:
3041
3038
  # Create a new instance.
3042
3039
  body = self._resolve_body_id(parent_body)
3043
- self.add_shape_instance(name, shape, body, pos, rot, color1=colors)
3040
+ self.add_shape_instance(name, shape, body, pos, rot, color1=colors, scale=scale)
3044
3041
 
3045
3042
  return shape
3046
3043
 
3047
3044
  # No existing shape for the given mesh was found, or its topology may have changed,
3048
3045
  # so we need to define a new one either way.
3049
- if smooth_shading:
3050
- normals = wp.zeros(point_count, dtype=wp.vec3)
3051
- vertices = wp.array(points, dtype=wp.vec3)
3052
- faces_per_vertex = wp.zeros(point_count, dtype=int)
3053
- wp.launch(
3054
- compute_average_normals,
3055
- dim=idx_count,
3056
- inputs=[wp.array(indices, dtype=int), vertices, scale],
3057
- outputs=[normals, faces_per_vertex],
3058
- )
3059
- gfx_vertices = wp.zeros((point_count, 8), dtype=float)
3060
- wp.launch(
3061
- assemble_gfx_vertices,
3062
- dim=point_count,
3063
- inputs=[vertices, normals, faces_per_vertex, scale],
3064
- outputs=[gfx_vertices],
3065
- )
3066
- gfx_vertices = gfx_vertices.numpy()
3067
- gfx_indices = indices.flatten()
3068
- else:
3069
- gfx_vertices = wp.zeros((idx_count * 3, 8), dtype=float)
3070
- wp.launch(
3071
- compute_gfx_vertices,
3072
- dim=idx_count,
3073
- inputs=[wp.array(indices, dtype=int), wp.array(points, dtype=wp.vec3), scale],
3074
- outputs=[gfx_vertices],
3075
- )
3076
- gfx_vertices = gfx_vertices.numpy()
3077
- gfx_indices = np.arange(idx_count * 3)
3046
+ with wp.ScopedDevice(self._device):
3047
+ if smooth_shading:
3048
+ normals = wp.zeros(point_count, dtype=wp.vec3)
3049
+ vertices = wp.array(points, dtype=wp.vec3)
3050
+ faces_per_vertex = wp.zeros(point_count, dtype=int)
3051
+ wp.launch(
3052
+ compute_average_normals,
3053
+ dim=idx_count,
3054
+ inputs=[wp.array(indices, dtype=int), vertices],
3055
+ outputs=[normals, faces_per_vertex],
3056
+ record_tape=False,
3057
+ )
3058
+ gfx_vertices = wp.zeros((point_count, 8), dtype=float)
3059
+ wp.launch(
3060
+ assemble_gfx_vertices,
3061
+ dim=point_count,
3062
+ inputs=[vertices, normals, faces_per_vertex],
3063
+ outputs=[gfx_vertices],
3064
+ record_tape=False,
3065
+ )
3066
+ gfx_vertices = gfx_vertices.numpy()
3067
+ gfx_indices = indices.flatten()
3068
+ else:
3069
+ gfx_vertices = wp.zeros((idx_count * 3, 8), dtype=float)
3070
+ wp.launch(
3071
+ compute_gfx_vertices,
3072
+ dim=idx_count,
3073
+ inputs=[wp.array(indices, dtype=int), wp.array(points, dtype=wp.vec3)],
3074
+ outputs=[gfx_vertices],
3075
+ record_tape=False,
3076
+ )
3077
+ gfx_vertices = gfx_vertices.numpy()
3078
+ gfx_indices = np.arange(idx_count * 3)
3078
3079
 
3079
3080
  # If there was a shape for the given mesh, clean it up.
3080
3081
  if shape is not None:
@@ -3090,7 +3091,7 @@ Instances: {len(self._instances)}"""
3090
3091
  if not is_template:
3091
3092
  # Create a new instance if necessary.
3092
3093
  body = self._resolve_body_id(parent_body)
3093
- self.add_shape_instance(name, shape, body, pos, rot, color1=colors)
3094
+ self.add_shape_instance(name, shape, body, pos, rot, color1=colors, scale=scale)
3094
3095
 
3095
3096
  return shape
3096
3097
 
@@ -3278,7 +3279,7 @@ Instances: {len(self._instances)}"""
3278
3279
  lines = np.array(lines)
3279
3280
  self._render_lines(name, lines, color, radius)
3280
3281
 
3281
- def update_shape_vertices(self, shape, points, scale):
3282
+ def update_shape_vertices(self, shape, points):
3282
3283
  if isinstance(points, wp.array):
3283
3284
  wp_points = points.to(self._device)
3284
3285
  else:
@@ -3291,7 +3292,7 @@ Instances: {len(self._instances)}"""
3291
3292
  wp.launch(
3292
3293
  update_vbo_vertices,
3293
3294
  dim=vertices_shape[0],
3294
- inputs=[wp_points, scale],
3295
+ inputs=[wp_points],
3295
3296
  outputs=[vbo_vertices],
3296
3297
  device=self._device,
3297
3298
  )
warp/render/render_usd.py CHANGED
@@ -647,7 +647,6 @@ class UsdRenderer:
647
647
  mesh.GetDisplayColorAttr().Set(colors, self.time)
648
648
 
649
649
  self._shape_constructors[name] = UsdGeom.Mesh
650
- self._shape_custom_scale[name] = scale
651
650
 
652
651
  if not is_template:
653
652
  _usd_set_xform(mesh, pos, rot, scale, self.time)
warp/sim/collide.py CHANGED
@@ -1236,8 +1236,7 @@ def handle_contact_pairs(
1236
1236
  p_b_body = closest_point_box(geo_scale_b, query_b)
1237
1237
  p_b_world = wp.transform_point(X_ws_b, p_b_body)
1238
1238
  diff = p_a_world - p_b_world
1239
- # use center of box A to query normal to make sure we are not inside B
1240
- query_b = wp.transform_point(X_sw_b, wp.transform_get_translation(X_ws_a))
1239
+
1241
1240
  normal = wp.transform_vector(X_ws_b, box_sdf_grad(geo_scale_b, query_b))
1242
1241
  distance = wp.dot(diff, normal)
1243
1242
 
warp/sim/integrator_vbd.py CHANGED
@@ -1379,6 +1379,8 @@ def VBD_solve_trimesh_no_self_contact(
1379
1379
  edge_rest_length: wp.array(dtype=float),
1380
1380
  edge_bending_properties: wp.array(dtype=float, ndim=2),
1381
1381
  adjacency: ForceElementAdjacencyInfo,
1382
+ particle_forces: wp.array(dtype=wp.vec3),
1383
+ particle_hessians: wp.array(dtype=wp.mat33),
1382
1384
  # contact info
1383
1385
  soft_contact_ke: float,
1384
1386
  soft_contact_kd: float,
@@ -1493,9 +1495,11 @@ def VBD_solve_trimesh_no_self_contact(
1493
1495
  dt,
1494
1496
  )
1495
1497
 
1496
- f = f + ground_contact_force
1497
- h = h + ground_contact_hessian
1498
+ f += ground_contact_force
1499
+ h += ground_contact_hessian
1498
1500
 
1501
+ f += particle_forces[particle_index]
1502
+ h += particle_hessians[particle_index]
1499
1503
  if abs(wp.determinant(h)) > 1e-5:
1500
1504
  hInv = wp.inverse(h)
1501
1505
  pos_new[particle_index] = particle_pos + hInv * f
@@ -2138,6 +2142,8 @@ class VBDIntegrator(Integrator):
2138
2142
  )
2139
2143
 
2140
2144
  for _iter in range(self.iterations):
2145
+ self.particle_forces.zero_()
2146
+ self.particle_hessians.zero_()
2141
2147
  for color in range(len(self.model.particle_color_groups)):
2142
2148
  wp.launch(
2143
2149
  kernel=VBD_accumulate_contact_force_and_hessian_no_self_contact,
@@ -2191,6 +2197,8 @@ class VBDIntegrator(Integrator):
2191
2197
  self.model.edge_rest_length,
2192
2198
  self.model.edge_bending_properties,
2193
2199
  self.adjacency,
2200
+ self.particle_forces,
2201
+ self.particle_hessians,
2194
2202
  self.model.soft_contact_ke,
2195
2203
  self.model.soft_contact_kd,
2196
2204
  self.model.soft_contact_mu,