warp-lang 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.8.1__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build_dll.py +5 -0
- warp/codegen.py +15 -3
- warp/config.py +1 -1
- warp/context.py +122 -24
- warp/examples/interop/example_jax_callable.py +34 -4
- warp/examples/interop/example_jax_kernel.py +27 -1
- warp/fem/field/virtual.py +2 -0
- warp/fem/integrate.py +78 -47
- warp/jax_experimental/ffi.py +201 -53
- warp/native/array.h +4 -4
- warp/native/builtin.h +8 -4
- warp/native/coloring.cpp +5 -1
- warp/native/cuda_util.cpp +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +3 -3
- warp/native/mesh.h +1 -1
- warp/native/quat.h +6 -2
- warp/native/rand.h +7 -7
- warp/native/sparse.cu +1 -1
- warp/native/svd.h +23 -8
- warp/native/tile.h +20 -1
- warp/native/tile_radix_sort.h +5 -1
- warp/native/tile_reduce.h +16 -25
- warp/native/tuple.h +2 -2
- warp/native/vec.h +4 -4
- warp/native/warp.cpp +1 -1
- warp/native/warp.cu +15 -2
- warp/native/warp.h +1 -1
- warp/render/render_opengl.py +52 -51
- warp/render/render_usd.py +0 -1
- warp/sim/collide.py +1 -2
- warp/sim/integrator_vbd.py +10 -2
- warp/sparse.py +1 -1
- warp/tape.py +2 -0
- warp/tests/sim/test_cloth.py +89 -6
- warp/tests/sim/test_coloring.py +76 -1
- warp/tests/test_assert.py +53 -0
- warp/tests/test_atomic_cas.py +127 -114
- warp/tests/test_mat.py +22 -0
- warp/tests/test_quat.py +22 -0
- warp/tests/test_sparse.py +32 -0
- warp/tests/test_static.py +48 -0
- warp/tests/test_tape.py +38 -0
- warp/tests/test_vec.py +38 -408
- warp/tests/test_vec_constructors.py +325 -0
- warp/tests/tile/test_tile.py +31 -143
- warp/tests/tile/test_tile_mathdx.py +2 -2
- warp/tests/tile/test_tile_matmul.py +179 -0
- warp/tests/tile/test_tile_reduce.py +100 -11
- warp/tests/tile/test_tile_shared_memory.py +12 -12
- warp/tests/tile/test_tile_sort.py +59 -55
- warp/tests/unittest_suites.py +10 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/METADATA +4 -4
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/RECORD +59 -57
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0
warp/native/quat.h
CHANGED
|
@@ -904,8 +904,12 @@ inline CUDA_CALLABLE void adj_div(quat_t<Type> a, Type s, quat_t<Type>& adj_a, T
|
|
|
904
904
|
template<typename Type>
|
|
905
905
|
inline CUDA_CALLABLE void adj_div(Type s, quat_t<Type> a, Type& adj_s, quat_t<Type>& adj_a, const quat_t<Type>& adj_ret)
|
|
906
906
|
{
|
|
907
|
-
|
|
908
|
-
|
|
907
|
+
for (unsigned i=0; i < 4; ++i)
|
|
908
|
+
{
|
|
909
|
+
Type inv = Type(1) / a[i];
|
|
910
|
+
adj_a[i] -= s * adj_ret[i] * inv * inv;
|
|
911
|
+
adj_s += adj_ret[i] * inv;
|
|
912
|
+
}
|
|
909
913
|
}
|
|
910
914
|
|
|
911
915
|
template<typename Type>
|
warp/native/rand.h
CHANGED
|
@@ -71,14 +71,14 @@ inline CUDA_CALLABLE float randf(uint32& state, float min, float max) { return (
|
|
|
71
71
|
// Box-Muller method
|
|
72
72
|
inline CUDA_CALLABLE float randn(uint32& state) { return sqrt(-2.f * log(randf(state) + RANDN_EPSILON)) * cos(2.f * M_PI_F * randf(state)); }
|
|
73
73
|
|
|
74
|
-
inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed,
|
|
75
|
-
inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset,
|
|
74
|
+
inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed, uint32 adj_ret) {}
|
|
75
|
+
inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset, uint32 adj_ret) {}
|
|
76
76
|
|
|
77
|
-
inline CUDA_CALLABLE void adj_randi(uint32& state, uint32& adj_state,
|
|
78
|
-
inline CUDA_CALLABLE void adj_randi(uint32& state, int min, int max, uint32& adj_state, int& adj_min, int& adj_max,
|
|
77
|
+
inline CUDA_CALLABLE void adj_randi(uint32& state, uint32& adj_state, int adj_ret) {}
|
|
78
|
+
inline CUDA_CALLABLE void adj_randi(uint32& state, int min, int max, uint32& adj_state, int& adj_min, int& adj_max, int adj_ret) {}
|
|
79
79
|
|
|
80
|
-
inline CUDA_CALLABLE void adj_randu(uint32& state, uint32& adj_state,
|
|
81
|
-
inline CUDA_CALLABLE void adj_randu(uint32& state, uint32 min, uint32 max, uint32& adj_state, uint32& adj_min, uint32& adj_max,
|
|
80
|
+
inline CUDA_CALLABLE void adj_randu(uint32& state, uint32& adj_state, uint32 adj_ret) {}
|
|
81
|
+
inline CUDA_CALLABLE void adj_randu(uint32& state, uint32 min, uint32 max, uint32& adj_state, uint32& adj_min, uint32& adj_max, uint32 adj_ret) {}
|
|
82
82
|
|
|
83
83
|
inline CUDA_CALLABLE void adj_randf(uint32& state, uint32& adj_state, float adj_ret) {}
|
|
84
84
|
inline CUDA_CALLABLE void adj_randf(uint32& state, float min, float max, uint32& adj_state, float& adj_min, float& adj_max, float adj_ret) {}
|
|
@@ -195,7 +195,7 @@ inline CUDA_CALLABLE void adj_sample_unit_hemisphere_surface(uint32& state, uint
|
|
|
195
195
|
inline CUDA_CALLABLE void adj_sample_unit_hemisphere(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
|
|
196
196
|
inline CUDA_CALLABLE void adj_sample_unit_square(uint32& state, uint32& adj_state, const vec2& adj_ret) {}
|
|
197
197
|
inline CUDA_CALLABLE void adj_sample_unit_cube(uint32& state, uint32& adj_state, const vec3& adj_ret) {}
|
|
198
|
-
inline CUDA_CALLABLE void adj_sample_unit_hypercube(uint32& state, uint32& adj_state, const
|
|
198
|
+
inline CUDA_CALLABLE void adj_sample_unit_hypercube(uint32& state, uint32& adj_state, const vec4& adj_ret) {}
|
|
199
199
|
|
|
200
200
|
/*
|
|
201
201
|
* log-gamma function to support some of these distributions. The
|
warp/native/sparse.cu
CHANGED
|
@@ -334,7 +334,7 @@ WP_API void bsr_matrix_from_triplets_device(
|
|
|
334
334
|
// Ensures the sorted keys are available in summed_block_indices if needed
|
|
335
335
|
if(return_summed_blocks && d_keys.Current() != tpl_block_indices)
|
|
336
336
|
{
|
|
337
|
-
check_cuda(
|
|
337
|
+
check_cuda(cudaMemcpyAsync(tpl_block_indices, d_keys.Current(), nnz * sizeof(int), cudaMemcpyDeviceToDevice, stream));
|
|
338
338
|
}
|
|
339
339
|
}
|
|
340
340
|
|
warp/native/svd.h
CHANGED
|
@@ -50,12 +50,14 @@ namespace wp
|
|
|
50
50
|
|
|
51
51
|
template<typename Type>
|
|
52
52
|
struct _svd_config {
|
|
53
|
+
static constexpr float SVD_EPSILON = 1.e-6f;
|
|
53
54
|
static constexpr float QR_GIVENS_EPSILON = 1.e-6f;
|
|
54
55
|
static constexpr int JACOBI_ITERATIONS = 4;
|
|
55
56
|
};
|
|
56
57
|
|
|
57
58
|
template<>
|
|
58
59
|
struct _svd_config<double> {
|
|
60
|
+
static constexpr double SVD_EPSILON = 1.e-12;
|
|
59
61
|
static constexpr double QR_GIVENS_EPSILON = 1.e-12;
|
|
60
62
|
static constexpr int JACOBI_ITERATIONS = 8;
|
|
61
63
|
};
|
|
@@ -528,13 +530,15 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
|
|
|
528
530
|
const mat_t<3,3,Type>& adj_U,
|
|
529
531
|
const vec_t<3,Type>& adj_sigma,
|
|
530
532
|
const mat_t<3,3,Type>& adj_V) {
|
|
533
|
+
const Type epsilon = _svd_config<Type>::SVD_EPSILON;
|
|
534
|
+
|
|
531
535
|
Type sx2 = sigma[0] * sigma[0];
|
|
532
536
|
Type sy2 = sigma[1] * sigma[1];
|
|
533
537
|
Type sz2 = sigma[2] * sigma[2];
|
|
534
538
|
|
|
535
|
-
Type F01 = Type(1) / min(sy2 - sx2, Type(-
|
|
536
|
-
Type F02 = Type(1) / min(sz2 - sx2, Type(-
|
|
537
|
-
Type F12 = Type(1) / min(sz2 - sy2, Type(-
|
|
539
|
+
Type F01 = Type(1) / min(sy2 - sx2, Type(-epsilon));
|
|
540
|
+
Type F02 = Type(1) / min(sz2 - sx2, Type(-epsilon));
|
|
541
|
+
Type F12 = Type(1) / min(sz2 - sy2, Type(-epsilon));
|
|
538
542
|
|
|
539
543
|
mat_t<3,3,Type> F = mat_t<3,3,Type>(0, F01, F02,
|
|
540
544
|
-F01, 0, F12,
|
|
@@ -553,8 +557,13 @@ inline CUDA_CALLABLE void adj_svd3(const mat_t<3,3,Type>& A,
|
|
|
553
557
|
|
|
554
558
|
mat_t<3,3,Type> sigma_term = mul(U, mul(adj_sigma_mat, VT));
|
|
555
559
|
|
|
556
|
-
mat_t<3,3,Type>
|
|
557
|
-
mat_t<3,3,Type>
|
|
560
|
+
mat_t<3,3,Type> skew_u = cw_mul(F, mul(UT, adj_U) - mul(transpose(adj_U), U));
|
|
561
|
+
mat_t<3,3,Type> block_u = mul(skew_u, s_mat);
|
|
562
|
+
mat_t<3,3,Type> u_term = mul(mul(U, block_u), VT);
|
|
563
|
+
|
|
564
|
+
mat_t<3,3,Type> skew_v = cw_mul(F, mul(VT, adj_V) - mul(transpose(adj_V), V));
|
|
565
|
+
mat_t<3,3,Type> block_v = mul(skew_v, VT);
|
|
566
|
+
mat_t<3,3,Type> v_term = mul(U, mul(s_mat, block_v));
|
|
558
567
|
|
|
559
568
|
adj_A = adj_A + (u_term + v_term + sigma_term);
|
|
560
569
|
}
|
|
@@ -583,11 +592,13 @@ inline CUDA_CALLABLE void adj_svd2(const mat_t<2,2,Type>& A,
|
|
|
583
592
|
const mat_t<2,2,Type>& adj_U,
|
|
584
593
|
const vec_t<2,Type>& adj_sigma,
|
|
585
594
|
const mat_t<2,2,Type>& adj_V) {
|
|
595
|
+
const Type epsilon = _svd_config<Type>::SVD_EPSILON;
|
|
596
|
+
|
|
586
597
|
Type s1_squared = sigma[0] * sigma[0];
|
|
587
598
|
Type s2_squared = sigma[1] * sigma[1];
|
|
588
599
|
|
|
589
600
|
// Compute inverse of (s1^2 - s2^2) if possible, use small epsilon to prevent division by zero
|
|
590
|
-
Type F01 = Type(1) / min(s2_squared - s1_squared, Type(-
|
|
601
|
+
Type F01 = Type(1) / min(s2_squared - s1_squared, Type(-epsilon));
|
|
591
602
|
|
|
592
603
|
// Construct the matrix F for the adjoint
|
|
593
604
|
mat_t<2,2,Type> F = mat_t<2,2,Type>(0.0, F01,
|
|
@@ -609,10 +620,14 @@ inline CUDA_CALLABLE void adj_svd2(const mat_t<2,2,Type>& A,
|
|
|
609
620
|
mat_t<2,2,Type> sigma_term = mul(U, mul(adj_sigma_mat, VT));
|
|
610
621
|
|
|
611
622
|
// Compute the adjoint contributions for U (left singular vectors)
|
|
612
|
-
mat_t<2,2,Type>
|
|
623
|
+
mat_t<2,2,Type> skew_u = cw_mul(F, mul(UT, adj_U) - mul(transpose(adj_U), U));
|
|
624
|
+
mat_t<2,2,Type> block_u = mul(skew_u, s_mat);
|
|
625
|
+
mat_t<2,2,Type> u_term = mul(mul(U, block_u), VT);
|
|
613
626
|
|
|
614
627
|
// Compute the adjoint contributions for V (right singular vectors)
|
|
615
|
-
mat_t<2,2,Type>
|
|
628
|
+
mat_t<2,2,Type> skew_v = cw_mul(F, mul(VT, adj_V) - mul(transpose(adj_V), V));
|
|
629
|
+
mat_t<2,2,Type> block_v = mul(skew_v, VT);
|
|
630
|
+
mat_t<2,2,Type> v_term = mul(U, mul(s_mat, block_v));
|
|
616
631
|
|
|
617
632
|
// Combine the terms to compute the adjoint of A
|
|
618
633
|
adj_A = adj_A + (u_term + v_term + sigma_term);
|
warp/native/tile.h
CHANGED
|
@@ -3015,21 +3015,41 @@ inline CUDA_CALLABLE void assign(TileA& dest, int i, int j, int k, int l, const
|
|
|
3015
3015
|
template <typename TileA, typename AdjTileA, typename Scalar>
|
|
3016
3016
|
inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, const Scalar& src, AdjTileA& adj_dest, int adj_i, Scalar& adj_src)
|
|
3017
3017
|
{
|
|
3018
|
+
if (dest.grad.ptr == nullptr)
|
|
3019
|
+
{
|
|
3020
|
+
return;
|
|
3021
|
+
}
|
|
3022
|
+
|
|
3018
3023
|
adj_src += dest.grad(tile_coord(i));
|
|
3019
3024
|
}
|
|
3020
3025
|
template <typename TileA, typename AdjTileA, typename Scalar>
|
|
3021
3026
|
inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, Scalar& adj_src)
|
|
3022
3027
|
{
|
|
3028
|
+
if (dest.grad.ptr == nullptr)
|
|
3029
|
+
{
|
|
3030
|
+
return;
|
|
3031
|
+
}
|
|
3032
|
+
|
|
3023
3033
|
adj_src += dest.grad(tile_coord(i, j));
|
|
3024
3034
|
}
|
|
3025
3035
|
template <typename TileA, typename AdjTileA, typename Scalar>
|
|
3026
3036
|
inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, Scalar& adj_src)
|
|
3027
3037
|
{
|
|
3038
|
+
if (dest.grad.ptr == nullptr)
|
|
3039
|
+
{
|
|
3040
|
+
return;
|
|
3041
|
+
}
|
|
3042
|
+
|
|
3028
3043
|
adj_src += dest.grad(tile_coord(i, j, k));
|
|
3029
3044
|
}
|
|
3030
3045
|
template <typename TileA, typename AdjTileA, typename Scalar>
|
|
3031
3046
|
inline CUDA_CALLABLE void adj_assign(TileA& dest, int i, int j, int k, int l, const Scalar& src, AdjTileA& adj_dest, int adj_i, int adj_j, int adj_k, int adj_l, Scalar& adj_src)
|
|
3032
3047
|
{
|
|
3048
|
+
if (dest.grad.ptr == nullptr)
|
|
3049
|
+
{
|
|
3050
|
+
return;
|
|
3051
|
+
}
|
|
3052
|
+
|
|
3033
3053
|
adj_src += dest.grad(tile_coord(i, j, k, l));
|
|
3034
3054
|
}
|
|
3035
3055
|
|
|
@@ -3112,7 +3132,6 @@ inline CUDA_CALLABLE TileC& tile_diag_add(TileA& a, TileB& b, TileC& c)
|
|
|
3112
3132
|
template <typename TileA, typename TileB, typename TileC, typename AdjTileA, typename AdjTileB, typename AdjTileC>
|
|
3113
3133
|
inline CUDA_CALLABLE void adj_tile_diag_add(TileA& a, TileB& b, TileC& c, AdjTileA& adj_a, AdjTileB& adj_b, AdjTileC& adj_c, AdjTileC& adj_ret)
|
|
3114
3134
|
{
|
|
3115
|
-
assert(false);
|
|
3116
3135
|
}
|
|
3117
3136
|
|
|
3118
3137
|
|
warp/native/tile_radix_sort.h
CHANGED
|
@@ -122,7 +122,7 @@ inline CUDA_CALLABLE void bitonic_sort_single_stage_full_thread_block(int k, uns
|
|
|
122
122
|
int thread_id2 = loop_id * WP_TILE_BLOCK_DIM + thread_id;
|
|
123
123
|
|
|
124
124
|
key_register[loop_id] = thread_id2 < length ? key_sh_mem[thread_id2] : max_key_value;
|
|
125
|
-
val_register[loop_id] = thread_id2 < length ? val_sh_mem[thread_id2] : 0;
|
|
125
|
+
val_register[loop_id] = thread_id2 < length ? val_sh_mem[thread_id2] : static_cast<V>(0);
|
|
126
126
|
}
|
|
127
127
|
|
|
128
128
|
__syncthreads();
|
|
@@ -342,7 +342,11 @@ inline CUDA_CALLABLE void bitonic_sort_thread_block_shared_mem(
|
|
|
342
342
|
values_shared_mem[i] = values_input[i];
|
|
343
343
|
}
|
|
344
344
|
else
|
|
345
|
+
{
|
|
346
|
+
// Note that these values may end up in the output If enough NaN or Inf values are present in keys_input
|
|
345
347
|
keys_shared_mem[i] = key_max_possible_value;
|
|
348
|
+
values_shared_mem[i] = static_cast<V>(0);
|
|
349
|
+
}
|
|
346
350
|
}
|
|
347
351
|
__syncthreads();
|
|
348
352
|
|
warp/native/tile_reduce.h
CHANGED
|
@@ -83,19 +83,7 @@ inline CUDA_CALLABLE wp::vec_t<Length, T> warp_shuffle_down(wp::vec_t<Length, T>
|
|
|
83
83
|
wp::vec_t<Length, T> result;
|
|
84
84
|
|
|
85
85
|
for (unsigned i=0; i < Length; ++i)
|
|
86
|
-
result
|
|
87
|
-
|
|
88
|
-
return result;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
// Quaternion overload
|
|
92
|
-
template <typename T>
|
|
93
|
-
inline CUDA_CALLABLE wp::quat_t<T> warp_shuffle_down(wp::quat_t<T> val, int offset, int mask)
|
|
94
|
-
{
|
|
95
|
-
wp::quat_t<T> result;
|
|
96
|
-
|
|
97
|
-
for (unsigned i=0; i < 4; ++i)
|
|
98
|
-
result.data[i] = __shfl_down_sync(mask, val.data[i], offset, WP_TILE_WARP_SIZE);
|
|
86
|
+
result[i] = __shfl_down_sync(mask, val[i], offset, WP_TILE_WARP_SIZE);
|
|
99
87
|
|
|
100
88
|
return result;
|
|
101
89
|
}
|
|
@@ -218,6 +206,7 @@ auto tile_reduce_impl(Op f, Tile& t)
|
|
|
218
206
|
|
|
219
207
|
// ensure that only threads with at least one valid item participate in the reduction
|
|
220
208
|
unsigned int mask = __ballot_sync(__activemask(), Layout::valid(Layout::linear_from_register(0)));
|
|
209
|
+
bool warp_is_active = mask != 0;
|
|
221
210
|
|
|
222
211
|
// warp reduction
|
|
223
212
|
T warp_sum = warp_reduce(thread_sum, f, mask);
|
|
@@ -233,7 +222,7 @@ auto tile_reduce_impl(Op f, Tile& t)
|
|
|
233
222
|
// ensure active_warps is initialized
|
|
234
223
|
WP_TILE_SYNC();
|
|
235
224
|
|
|
236
|
-
if (lane_index == 0)
|
|
225
|
+
if (lane_index == 0 && warp_is_active)
|
|
237
226
|
{
|
|
238
227
|
partials[warp_index] = warp_sum;
|
|
239
228
|
atomicAdd(&active_warps, 1);
|
|
@@ -291,6 +280,7 @@ auto tile_arg_reduce_impl(Op f, OpTrack track, Tile& t)
|
|
|
291
280
|
|
|
292
281
|
// ensure that only threads with at least one valid item participate in the reduction
|
|
293
282
|
unsigned int mask = __ballot_sync(__activemask(), Layout::valid(Layout::linear_from_register(0)));
|
|
283
|
+
bool warp_is_active = mask != 0;
|
|
294
284
|
|
|
295
285
|
// warp reduction
|
|
296
286
|
ValueAndIndex<T> warp_sum = warp_reduce_tracked(thread_sum, champion_index, f, track, mask);
|
|
@@ -307,7 +297,7 @@ auto tile_arg_reduce_impl(Op f, OpTrack track, Tile& t)
|
|
|
307
297
|
// ensure active_warps is initialized
|
|
308
298
|
WP_TILE_SYNC();
|
|
309
299
|
|
|
310
|
-
if (lane_index == 0)
|
|
300
|
+
if (lane_index == 0 && warp_is_active)
|
|
311
301
|
{
|
|
312
302
|
partials[warp_index] = warp_sum.value;
|
|
313
303
|
partials_idx[warp_index] = warp_sum.index;
|
|
@@ -422,25 +412,26 @@ void adj_tile_sum(Tile& t, Tile& adj_t, AdjTile& adj_ret)
|
|
|
422
412
|
{
|
|
423
413
|
using T = typename Tile::Type;
|
|
424
414
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
for (int i=0; i < Tile::Layout::Size; ++i)
|
|
428
|
-
{
|
|
429
|
-
adj_t(i) += adj_ret.data[0];
|
|
415
|
+
auto adj_reg = adj_ret.grad_to_register();
|
|
430
416
|
|
|
431
|
-
|
|
417
|
+
#if !defined(__CUDA_ARCH__)
|
|
418
|
+
T scratch = adj_reg.data[0];
|
|
432
419
|
#else
|
|
433
420
|
// broadcast incoming adjoint to block
|
|
434
421
|
WP_TILE_SHARED T scratch;
|
|
435
422
|
if (WP_TILE_THREAD_IDX == 0)
|
|
436
|
-
scratch =
|
|
423
|
+
scratch = adj_reg.data[0];
|
|
437
424
|
|
|
438
425
|
WP_TILE_SYNC();
|
|
426
|
+
#endif
|
|
439
427
|
|
|
440
|
-
|
|
441
|
-
|
|
428
|
+
auto adj_ret_reg = tile_register_like<Tile>();
|
|
429
|
+
using Layout = typename decltype(adj_ret_reg)::Layout;
|
|
430
|
+
for (int i=0; i < Layout::NumRegs; ++i)
|
|
431
|
+
{
|
|
432
|
+
adj_ret_reg.data[i] += scratch;
|
|
433
|
+
}
|
|
442
434
|
adj_t.grad_add(adj_ret_reg);
|
|
443
|
-
#endif
|
|
444
435
|
}
|
|
445
436
|
|
|
446
437
|
template <typename Tile>
|
warp/native/tuple.h
CHANGED
|
@@ -182,8 +182,8 @@ adj_add(
|
|
|
182
182
|
const tuple_t<Head, Tail...>& adj_ret
|
|
183
183
|
)
|
|
184
184
|
{
|
|
185
|
-
adj_add(a.head, b.head, adj_ret.head);
|
|
186
|
-
adj_add(a.tail, b.tail, adj_ret.tail);
|
|
185
|
+
adj_add(a.head, b.head, adj_a.head, adj_b.head, adj_ret.head);
|
|
186
|
+
adj_add(a.tail, b.tail, adj_a.tail, adj_b.tail, adj_ret.tail);
|
|
187
187
|
}
|
|
188
188
|
|
|
189
189
|
} // namespace wp
|
warp/native/vec.h
CHANGED
|
@@ -969,11 +969,11 @@ template<unsigned Length, typename Type>
|
|
|
969
969
|
inline CUDA_CALLABLE void adj_div(Type s, vec_t<Length, Type> a, Type& adj_s, vec_t<Length, Type>& adj_a, const vec_t<Length, Type>& adj_ret)
|
|
970
970
|
{
|
|
971
971
|
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
for( unsigned i=0; i < Length; ++i )
|
|
972
|
+
for (unsigned i=0; i < Length; ++i)
|
|
975
973
|
{
|
|
976
|
-
|
|
974
|
+
Type inv = Type(1) / a[i];
|
|
975
|
+
adj_a[i] -= s * adj_ret[i] * inv * inv;
|
|
976
|
+
adj_s += adj_ret[i] * inv;
|
|
977
977
|
}
|
|
978
978
|
|
|
979
979
|
#if FP_CHECK
|
warp/native/warp.cpp
CHANGED
|
@@ -1072,7 +1072,7 @@ WP_API float cuda_event_elapsed_time(void* start_event, void* end_event) { retur
|
|
|
1072
1072
|
|
|
1073
1073
|
WP_API bool cuda_graph_begin_capture(void* context, void* stream, int external) { return false; }
|
|
1074
1074
|
WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret) { return false; }
|
|
1075
|
-
WP_API bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret) { return false; }
|
|
1075
|
+
WP_API bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret) { return false; }
|
|
1076
1076
|
WP_API bool cuda_graph_launch(void* graph, void* stream) { return false; }
|
|
1077
1077
|
WP_API bool cuda_graph_destroy(void* context, void* graph) { return false; }
|
|
1078
1078
|
WP_API bool cuda_graph_exec_destroy(void* context, void* graph_exec) { return false; }
|
warp/native/warp.cu
CHANGED
|
@@ -309,7 +309,13 @@ int cuda_init()
|
|
|
309
309
|
check_cu(cuDeviceGetAttribute_f(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
|
|
310
310
|
check_cu(cuDeviceGetAttribute_f(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
|
|
311
311
|
g_devices[i].arch = 10 * major + minor;
|
|
312
|
-
|
|
312
|
+
#ifdef CUDA_VERSION
|
|
313
|
+
#if CUDA_VERSION < 13000
|
|
314
|
+
if (g_devices[i].arch == 110) {
|
|
315
|
+
g_devices[i].arch = 101; // Thor SM change
|
|
316
|
+
}
|
|
317
|
+
#endif
|
|
318
|
+
#endif
|
|
313
319
|
g_device_map[device] = &g_devices[i];
|
|
314
320
|
}
|
|
315
321
|
else
|
|
@@ -2781,7 +2787,7 @@ bool capture_debug_dot_print(void* graph, const char *path, uint32_t flags)
|
|
|
2781
2787
|
return true;
|
|
2782
2788
|
}
|
|
2783
2789
|
|
|
2784
|
-
bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
|
|
2790
|
+
bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret)
|
|
2785
2791
|
{
|
|
2786
2792
|
ContextGuard guard(context);
|
|
2787
2793
|
|
|
@@ -2789,6 +2795,13 @@ bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret)
|
|
|
2789
2795
|
if (!check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, (cudaGraph_t)graph, cudaGraphInstantiateFlagAutoFreeOnLaunch)))
|
|
2790
2796
|
return false;
|
|
2791
2797
|
|
|
2798
|
+
// Usually uploading the graph explicitly is optional, but when updating graph nodes (e.g., indirect dispatch)
|
|
2799
|
+
// then the upload is required because otherwise the graph nodes that get updated might not yet be uploaded, which
|
|
2800
|
+
// results in undefined behavior.
|
|
2801
|
+
CUstream cuda_stream = static_cast<CUstream>(stream);
|
|
2802
|
+
if (!check_cuda(cudaGraphUpload(graph_exec, cuda_stream)))
|
|
2803
|
+
return false;
|
|
2804
|
+
|
|
2792
2805
|
if (graph_exec_ret)
|
|
2793
2806
|
*graph_exec_ret = graph_exec;
|
|
2794
2807
|
|
warp/native/warp.h
CHANGED
|
@@ -308,7 +308,7 @@ extern "C"
|
|
|
308
308
|
|
|
309
309
|
WP_API bool cuda_graph_begin_capture(void* context, void* stream, int external);
|
|
310
310
|
WP_API bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret);
|
|
311
|
-
WP_API bool cuda_graph_create_exec(void* context, void* graph, void** graph_exec_ret);
|
|
311
|
+
WP_API bool cuda_graph_create_exec(void* context, void* stream, void* graph, void** graph_exec_ret);
|
|
312
312
|
WP_API bool cuda_graph_launch(void* graph, void* stream);
|
|
313
313
|
WP_API bool cuda_graph_destroy(void* context, void* graph);
|
|
314
314
|
WP_API bool cuda_graph_exec_destroy(void* context, void* graph_exec);
|
warp/render/render_opengl.py
CHANGED
|
@@ -320,15 +320,14 @@ def update_vbo_transforms(
|
|
|
320
320
|
@wp.kernel
|
|
321
321
|
def update_vbo_vertices(
|
|
322
322
|
points: wp.array(dtype=wp.vec3),
|
|
323
|
-
scale: wp.vec3,
|
|
324
323
|
# outputs
|
|
325
324
|
vbo_vertices: wp.array(dtype=float, ndim=2),
|
|
326
325
|
):
|
|
327
326
|
tid = wp.tid()
|
|
328
327
|
p = points[tid]
|
|
329
|
-
vbo_vertices[tid, 0] = p[0]
|
|
330
|
-
vbo_vertices[tid, 1] = p[1]
|
|
331
|
-
vbo_vertices[tid, 2] = p[2]
|
|
328
|
+
vbo_vertices[tid, 0] = p[0]
|
|
329
|
+
vbo_vertices[tid, 1] = p[1]
|
|
330
|
+
vbo_vertices[tid, 2] = p[2]
|
|
332
331
|
|
|
333
332
|
|
|
334
333
|
@wp.kernel
|
|
@@ -422,7 +421,6 @@ def compute_gfx_vertices(
|
|
|
422
421
|
def compute_average_normals(
|
|
423
422
|
indices: wp.array(dtype=int, ndim=2),
|
|
424
423
|
vertices: wp.array(dtype=wp.vec3),
|
|
425
|
-
scale: wp.vec3,
|
|
426
424
|
# outputs
|
|
427
425
|
normals: wp.array(dtype=wp.vec3),
|
|
428
426
|
faces_per_vertex: wp.array(dtype=int),
|
|
@@ -431,9 +429,9 @@ def compute_average_normals(
|
|
|
431
429
|
i = indices[tid, 0]
|
|
432
430
|
j = indices[tid, 1]
|
|
433
431
|
k = indices[tid, 2]
|
|
434
|
-
v0 = vertices[i]
|
|
435
|
-
v1 = vertices[j]
|
|
436
|
-
v2 = vertices[k]
|
|
432
|
+
v0 = vertices[i]
|
|
433
|
+
v1 = vertices[j]
|
|
434
|
+
v2 = vertices[k]
|
|
437
435
|
n = wp.normalize(wp.cross(v1 - v0, v2 - v0))
|
|
438
436
|
wp.atomic_add(normals, i, n)
|
|
439
437
|
wp.atomic_add(faces_per_vertex, i, 1)
|
|
@@ -448,16 +446,15 @@ def assemble_gfx_vertices(
|
|
|
448
446
|
vertices: wp.array(dtype=wp.vec3, ndim=1),
|
|
449
447
|
normals: wp.array(dtype=wp.vec3),
|
|
450
448
|
faces_per_vertex: wp.array(dtype=int),
|
|
451
|
-
scale: wp.vec3,
|
|
452
449
|
# outputs
|
|
453
450
|
gfx_vertices: wp.array(dtype=float, ndim=2),
|
|
454
451
|
):
|
|
455
452
|
tid = wp.tid()
|
|
456
453
|
v = vertices[tid]
|
|
457
454
|
n = normals[tid] / float(faces_per_vertex[tid])
|
|
458
|
-
gfx_vertices[tid, 0] = v[0]
|
|
459
|
-
gfx_vertices[tid, 1] = v[1]
|
|
460
|
-
gfx_vertices[tid, 2] = v[2]
|
|
455
|
+
gfx_vertices[tid, 0] = v[0]
|
|
456
|
+
gfx_vertices[tid, 1] = v[1]
|
|
457
|
+
gfx_vertices[tid, 2] = v[2]
|
|
461
458
|
gfx_vertices[tid, 3] = n[0]
|
|
462
459
|
gfx_vertices[tid, 4] = n[1]
|
|
463
460
|
gfx_vertices[tid, 5] = n[2]
|
|
@@ -2445,7 +2442,7 @@ Instances: {len(self._instances)}"""
|
|
|
2445
2442
|
|
|
2446
2443
|
gl.glBindVertexArray(0)
|
|
2447
2444
|
|
|
2448
|
-
def update_shape_instance(self, name, pos=None, rot=None, color1=None, color2=None, visible=None):
|
|
2445
|
+
def update_shape_instance(self, name, pos=None, rot=None, color1=None, color2=None, scale=None, visible=None):
|
|
2449
2446
|
"""Update the instance properties of the shape
|
|
2450
2447
|
|
|
2451
2448
|
Args:
|
|
@@ -2461,7 +2458,7 @@ Instances: {len(self._instances)}"""
|
|
|
2461
2458
|
self._switch_context()
|
|
2462
2459
|
|
|
2463
2460
|
if name in self._instances:
|
|
2464
|
-
i, body, shape, tf,
|
|
2461
|
+
i, body, shape, tf, old_scale, old_color1, old_color2, v = self._instances[name]
|
|
2465
2462
|
if visible is None:
|
|
2466
2463
|
visible = v
|
|
2467
2464
|
new_tf = np.copy(tf)
|
|
@@ -2474,7 +2471,7 @@ Instances: {len(self._instances)}"""
|
|
|
2474
2471
|
body,
|
|
2475
2472
|
shape,
|
|
2476
2473
|
new_tf,
|
|
2477
|
-
scale,
|
|
2474
|
+
old_scale if scale is None else scale,
|
|
2478
2475
|
old_color1 if color1 is None else color1,
|
|
2479
2476
|
old_color2 if color2 is None else color2,
|
|
2480
2477
|
visible,
|
|
@@ -2968,7 +2965,7 @@ Instances: {len(self._instances)}"""
|
|
|
2968
2965
|
geo_hash = hash(("box", tuple(extents)))
|
|
2969
2966
|
if geo_hash in self._shape_geo_hash:
|
|
2970
2967
|
shape = self._shape_geo_hash[geo_hash]
|
|
2971
|
-
if self.update_shape_instance(name, pos, rot):
|
|
2968
|
+
if self.update_shape_instance(name, pos, rot, color1=color, color2=color):
|
|
2972
2969
|
return shape
|
|
2973
2970
|
else:
|
|
2974
2971
|
vertices, indices = self._create_box_mesh(extents)
|
|
@@ -3031,50 +3028,54 @@ Instances: {len(self._instances)}"""
|
|
|
3031
3028
|
if not update_topology:
|
|
3032
3029
|
if name in self._instances:
|
|
3033
3030
|
# Update the instance's transform.
|
|
3034
|
-
self.update_shape_instance(name, pos, rot, color1=colors)
|
|
3031
|
+
self.update_shape_instance(name, pos, rot, color1=colors, color2=colors, scale=scale, visible=visible)
|
|
3035
3032
|
|
|
3036
3033
|
if shape is not None:
|
|
3037
3034
|
# Update the shape's point positions.
|
|
3038
|
-
self.update_shape_vertices(shape, points
|
|
3035
|
+
self.update_shape_vertices(shape, points)
|
|
3039
3036
|
|
|
3040
3037
|
if not is_template and name not in self._instances:
|
|
3041
3038
|
# Create a new instance.
|
|
3042
3039
|
body = self._resolve_body_id(parent_body)
|
|
3043
|
-
self.add_shape_instance(name, shape, body, pos, rot, color1=colors)
|
|
3040
|
+
self.add_shape_instance(name, shape, body, pos, rot, color1=colors, scale=scale)
|
|
3044
3041
|
|
|
3045
3042
|
return shape
|
|
3046
3043
|
|
|
3047
3044
|
# No existing shape for the given mesh was found, or its topology may have changed,
|
|
3048
3045
|
# so we need to define a new one either way.
|
|
3049
|
-
|
|
3050
|
-
|
|
3051
|
-
|
|
3052
|
-
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
|
|
3056
|
-
|
|
3057
|
-
|
|
3058
|
-
|
|
3059
|
-
|
|
3060
|
-
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
3066
|
-
|
|
3067
|
-
|
|
3068
|
-
|
|
3069
|
-
|
|
3070
|
-
|
|
3071
|
-
|
|
3072
|
-
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3046
|
+
with wp.ScopedDevice(self._device):
|
|
3047
|
+
if smooth_shading:
|
|
3048
|
+
normals = wp.zeros(point_count, dtype=wp.vec3)
|
|
3049
|
+
vertices = wp.array(points, dtype=wp.vec3)
|
|
3050
|
+
faces_per_vertex = wp.zeros(point_count, dtype=int)
|
|
3051
|
+
wp.launch(
|
|
3052
|
+
compute_average_normals,
|
|
3053
|
+
dim=idx_count,
|
|
3054
|
+
inputs=[wp.array(indices, dtype=int), vertices],
|
|
3055
|
+
outputs=[normals, faces_per_vertex],
|
|
3056
|
+
record_tape=False,
|
|
3057
|
+
)
|
|
3058
|
+
gfx_vertices = wp.zeros((point_count, 8), dtype=float)
|
|
3059
|
+
wp.launch(
|
|
3060
|
+
assemble_gfx_vertices,
|
|
3061
|
+
dim=point_count,
|
|
3062
|
+
inputs=[vertices, normals, faces_per_vertex],
|
|
3063
|
+
outputs=[gfx_vertices],
|
|
3064
|
+
record_tape=False,
|
|
3065
|
+
)
|
|
3066
|
+
gfx_vertices = gfx_vertices.numpy()
|
|
3067
|
+
gfx_indices = indices.flatten()
|
|
3068
|
+
else:
|
|
3069
|
+
gfx_vertices = wp.zeros((idx_count * 3, 8), dtype=float)
|
|
3070
|
+
wp.launch(
|
|
3071
|
+
compute_gfx_vertices,
|
|
3072
|
+
dim=idx_count,
|
|
3073
|
+
inputs=[wp.array(indices, dtype=int), wp.array(points, dtype=wp.vec3)],
|
|
3074
|
+
outputs=[gfx_vertices],
|
|
3075
|
+
record_tape=False,
|
|
3076
|
+
)
|
|
3077
|
+
gfx_vertices = gfx_vertices.numpy()
|
|
3078
|
+
gfx_indices = np.arange(idx_count * 3)
|
|
3078
3079
|
|
|
3079
3080
|
# If there was a shape for the given mesh, clean it up.
|
|
3080
3081
|
if shape is not None:
|
|
@@ -3090,7 +3091,7 @@ Instances: {len(self._instances)}"""
|
|
|
3090
3091
|
if not is_template:
|
|
3091
3092
|
# Create a new instance if necessary.
|
|
3092
3093
|
body = self._resolve_body_id(parent_body)
|
|
3093
|
-
self.add_shape_instance(name, shape, body, pos, rot, color1=colors)
|
|
3094
|
+
self.add_shape_instance(name, shape, body, pos, rot, color1=colors, scale=scale)
|
|
3094
3095
|
|
|
3095
3096
|
return shape
|
|
3096
3097
|
|
|
@@ -3278,7 +3279,7 @@ Instances: {len(self._instances)}"""
|
|
|
3278
3279
|
lines = np.array(lines)
|
|
3279
3280
|
self._render_lines(name, lines, color, radius)
|
|
3280
3281
|
|
|
3281
|
-
def update_shape_vertices(self, shape, points
|
|
3282
|
+
def update_shape_vertices(self, shape, points):
|
|
3282
3283
|
if isinstance(points, wp.array):
|
|
3283
3284
|
wp_points = points.to(self._device)
|
|
3284
3285
|
else:
|
|
@@ -3291,7 +3292,7 @@ Instances: {len(self._instances)}"""
|
|
|
3291
3292
|
wp.launch(
|
|
3292
3293
|
update_vbo_vertices,
|
|
3293
3294
|
dim=vertices_shape[0],
|
|
3294
|
-
inputs=[wp_points
|
|
3295
|
+
inputs=[wp_points],
|
|
3295
3296
|
outputs=[vbo_vertices],
|
|
3296
3297
|
device=self._device,
|
|
3297
3298
|
)
|
warp/render/render_usd.py
CHANGED
warp/sim/collide.py
CHANGED
|
@@ -1236,8 +1236,7 @@ def handle_contact_pairs(
|
|
|
1236
1236
|
p_b_body = closest_point_box(geo_scale_b, query_b)
|
|
1237
1237
|
p_b_world = wp.transform_point(X_ws_b, p_b_body)
|
|
1238
1238
|
diff = p_a_world - p_b_world
|
|
1239
|
-
|
|
1240
|
-
query_b = wp.transform_point(X_sw_b, wp.transform_get_translation(X_ws_a))
|
|
1239
|
+
|
|
1241
1240
|
normal = wp.transform_vector(X_ws_b, box_sdf_grad(geo_scale_b, query_b))
|
|
1242
1241
|
distance = wp.dot(diff, normal)
|
|
1243
1242
|
|
warp/sim/integrator_vbd.py
CHANGED
|
@@ -1379,6 +1379,8 @@ def VBD_solve_trimesh_no_self_contact(
|
|
|
1379
1379
|
edge_rest_length: wp.array(dtype=float),
|
|
1380
1380
|
edge_bending_properties: wp.array(dtype=float, ndim=2),
|
|
1381
1381
|
adjacency: ForceElementAdjacencyInfo,
|
|
1382
|
+
particle_forces: wp.array(dtype=wp.vec3),
|
|
1383
|
+
particle_hessians: wp.array(dtype=wp.mat33),
|
|
1382
1384
|
# contact info
|
|
1383
1385
|
soft_contact_ke: float,
|
|
1384
1386
|
soft_contact_kd: float,
|
|
@@ -1493,9 +1495,11 @@ def VBD_solve_trimesh_no_self_contact(
|
|
|
1493
1495
|
dt,
|
|
1494
1496
|
)
|
|
1495
1497
|
|
|
1496
|
-
f
|
|
1497
|
-
h
|
|
1498
|
+
f += ground_contact_force
|
|
1499
|
+
h += ground_contact_hessian
|
|
1498
1500
|
|
|
1501
|
+
f += particle_forces[particle_index]
|
|
1502
|
+
h += particle_hessians[particle_index]
|
|
1499
1503
|
if abs(wp.determinant(h)) > 1e-5:
|
|
1500
1504
|
hInv = wp.inverse(h)
|
|
1501
1505
|
pos_new[particle_index] = particle_pos + hInv * f
|
|
@@ -2138,6 +2142,8 @@ class VBDIntegrator(Integrator):
|
|
|
2138
2142
|
)
|
|
2139
2143
|
|
|
2140
2144
|
for _iter in range(self.iterations):
|
|
2145
|
+
self.particle_forces.zero_()
|
|
2146
|
+
self.particle_hessians.zero_()
|
|
2141
2147
|
for color in range(len(self.model.particle_color_groups)):
|
|
2142
2148
|
wp.launch(
|
|
2143
2149
|
kernel=VBD_accumulate_contact_force_and_hessian_no_self_contact,
|
|
@@ -2191,6 +2197,8 @@ class VBDIntegrator(Integrator):
|
|
|
2191
2197
|
self.model.edge_rest_length,
|
|
2192
2198
|
self.model.edge_bending_properties,
|
|
2193
2199
|
self.adjacency,
|
|
2200
|
+
self.particle_forces,
|
|
2201
|
+
self.particle_hessians,
|
|
2194
2202
|
self.model.soft_contact_ke,
|
|
2195
2203
|
self.model.soft_contact_kd,
|
|
2196
2204
|
self.model.soft_contact_mu,
|