PyPI - warp-lang - Versions diffs - 1.5.1__py3-none-manylinux2014_x86_64.whl → 1.6.1__py3-none-manylinux2014_x86_64.whl - Mend

warp-lang 1.5.1__py3-none-manylinux2014_x86_64.whl → 1.6.1__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (131)

warp/__init__.py +5 -0
warp/autograd.py +414 -191
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +40 -12
warp/build_dll.py +13 -6
warp/builtins.py +1077 -481
warp/codegen.py +250 -122
warp/config.py +65 -21
warp/context.py +500 -149
warp/examples/assets/square_cloth.usd +0 -0
warp/examples/benchmarks/benchmark_gemm.py +27 -18
warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
warp/examples/core/example_marching_cubes.py +1 -1
warp/examples/core/example_mesh.py +1 -1
warp/examples/core/example_torch.py +18 -34
warp/examples/core/example_wave.py +1 -1
warp/examples/fem/example_apic_fluid.py +1 -0
warp/examples/fem/example_mixed_elasticity.py +1 -1
warp/examples/optim/example_bounce.py +1 -1
warp/examples/optim/example_cloth_throw.py +1 -1
warp/examples/optim/example_diffray.py +4 -15
warp/examples/optim/example_drone.py +1 -1
warp/examples/optim/example_softbody_properties.py +392 -0
warp/examples/optim/example_trajectory.py +1 -3
warp/examples/optim/example_walker.py +5 -0
warp/examples/sim/example_cartpole.py +0 -2
warp/examples/sim/example_cloth_self_contact.py +314 -0
warp/examples/sim/example_granular_collision_sdf.py +4 -5
warp/examples/sim/example_jacobian_ik.py +0 -2
warp/examples/sim/example_quadruped.py +5 -2
warp/examples/tile/example_tile_cholesky.py +79 -0
warp/examples/tile/example_tile_convolution.py +2 -2
warp/examples/tile/example_tile_fft.py +2 -2
warp/examples/tile/example_tile_filtering.py +3 -3
warp/examples/tile/example_tile_matmul.py +4 -4
warp/examples/tile/example_tile_mlp.py +12 -12
warp/examples/tile/example_tile_nbody.py +191 -0
warp/examples/tile/example_tile_walker.py +319 -0
warp/math.py +147 -0
warp/native/array.h +12 -0
warp/native/builtin.h +0 -1
warp/native/bvh.cpp +149 -70
warp/native/bvh.cu +287 -68
warp/native/bvh.h +195 -85
warp/native/clang/clang.cpp +6 -2
warp/native/crt.h +1 -0
warp/native/cuda_util.cpp +35 -0
warp/native/cuda_util.h +5 -0
warp/native/exports.h +40 -40
warp/native/intersect.h +17 -0
warp/native/mat.h +57 -3
warp/native/mathdx.cpp +19 -0
warp/native/mesh.cpp +25 -8
warp/native/mesh.cu +153 -101
warp/native/mesh.h +482 -403
warp/native/quat.h +40 -0
warp/native/solid_angle.h +7 -0
warp/native/sort.cpp +85 -0
warp/native/sort.cu +34 -0
warp/native/sort.h +3 -1
warp/native/spatial.h +11 -0
warp/native/tile.h +1189 -664
warp/native/tile_reduce.h +8 -6
warp/native/vec.h +41 -0
warp/native/warp.cpp +8 -1
warp/native/warp.cu +263 -40
warp/native/warp.h +19 -5
warp/optim/linear.py +22 -4
warp/render/render_opengl.py +132 -59
warp/render/render_usd.py +10 -2
warp/sim/__init__.py +6 -1
warp/sim/collide.py +289 -32
warp/sim/import_urdf.py +20 -5
warp/sim/integrator_euler.py +25 -7
warp/sim/integrator_featherstone.py +147 -35
warp/sim/integrator_vbd.py +842 -40
warp/sim/model.py +173 -112
warp/sim/render.py +2 -2
warp/stubs.py +249 -116
warp/tape.py +28 -30
warp/tests/aux_test_module_unload.py +15 -0
warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
warp/tests/test_array.py +100 -0
warp/tests/test_assert.py +242 -0
warp/tests/test_codegen.py +14 -61
warp/tests/test_collision.py +8 -8
warp/tests/test_examples.py +16 -1
warp/tests/test_grad_debug.py +87 -2
warp/tests/test_hash_grid.py +1 -1
warp/tests/test_ipc.py +116 -0
warp/tests/test_launch.py +77 -26
warp/tests/test_mat.py +213 -168
warp/tests/test_math.py +47 -1
warp/tests/test_matmul.py +11 -7
warp/tests/test_matmul_lite.py +4 -4
warp/tests/test_mesh.py +84 -60
warp/tests/test_mesh_query_aabb.py +165 -0
warp/tests/test_mesh_query_point.py +328 -286
warp/tests/test_mesh_query_ray.py +134 -121
warp/tests/test_mlp.py +2 -2
warp/tests/test_operators.py +43 -0
warp/tests/test_overwrite.py +6 -5
warp/tests/test_quat.py +77 -0
warp/tests/test_reload.py +29 -0
warp/tests/test_sim_grad_bounce_linear.py +204 -0
warp/tests/test_static.py +16 -0
warp/tests/test_tape.py +25 -0
warp/tests/test_tile.py +134 -191
warp/tests/test_tile_load.py +399 -0
warp/tests/test_tile_mathdx.py +61 -8
warp/tests/test_tile_mlp.py +17 -17
warp/tests/test_tile_reduce.py +24 -18
warp/tests/test_tile_shared_memory.py +66 -17
warp/tests/test_tile_view.py +165 -0
warp/tests/test_torch.py +35 -0
warp/tests/test_utils.py +36 -24
warp/tests/test_vec.py +110 -0
warp/tests/unittest_suites.py +29 -4
warp/tests/unittest_utils.py +30 -11
warp/thirdparty/unittest_parallel.py +5 -2
warp/types.py +419 -111
warp/utils.py +9 -5
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/METADATA +86 -45
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/RECORD +129 -118
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/WHEEL +1 -1
warp/examples/benchmarks/benchmark_tile.py +0 -179
warp/native/tile_gemm.h +0 -341
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/LICENSE.md +0 -0
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/top_level.txt +0 -0

warp/native/mat.h CHANGED Viewed

@@ -394,6 +394,36 @@ inline CUDA_CALLABLE void adj_index(const mat_t<Rows,Cols,Type>& m, int row, int
 }
+template<unsigned Rows, unsigned Cols, typename Type>
+inline CUDA_CALLABLE void augassign_add(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
+{
+    m.data[row][col] += value;
+}
+template<unsigned Rows, unsigned Cols, typename Type>
+inline CUDA_CALLABLE void adj_augassign_add(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
+                                        mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value)
+{
+    adj_value += adj_m.data[row][col];
+}
+template<unsigned Rows, unsigned Cols, typename Type>
+inline CUDA_CALLABLE void augassign_sub(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
+{
+    m.data[row][col] -= value;
+}
+template<unsigned Rows, unsigned Cols, typename Type>
+inline CUDA_CALLABLE void adj_augassign_sub(mat_t<Rows,Cols,Type>& m, int row, int col, Type value,
+                                        mat_t<Rows,Cols,Type>& adj_m, int adj_row, int adj_col, Type& adj_value)
+{
+    adj_value -= adj_m.data[row][col];
+}
 template<unsigned Rows, unsigned Cols, typename Type>
 inline CUDA_CALLABLE mat_t<Rows,Cols,Type> assign(mat_t<Rows,Cols,Type>& m, int row, int col, Type value)
 {
@@ -651,6 +681,20 @@ inline CUDA_CALLABLE vec_t<Cols,Type> mul(const vec_t<Rows,Type>& b, const mat_t
     return r;
 }
+template<typename T>
+inline CUDA_CALLABLE T muladd(T a, T b, T c) {
+    return c + a*b;
+}
+template<>
+inline CUDA_CALLABLE float muladd(float a, float b, float c) {
+    return fmaf(a, b, c);
+}
+template<>
+inline CUDA_CALLABLE double muladd(double a, double b, double c) {
+    return fma(a, b, c);
+}
 template<unsigned Rows, unsigned Cols, unsigned ColsOut, typename Type>
 inline CUDA_CALLABLE mat_t<Rows,ColsOut,Type> mul(const mat_t<Rows,Cols,Type>& a, const mat_t<Cols,ColsOut,Type>& b)
 {
@@ -663,8 +707,7 @@ inline CUDA_CALLABLE mat_t<Rows,ColsOut,Type> mul(const mat_t<Rows,Cols,Type>& a
             for (unsigned k=0; k < Cols; ++k)
             {
-                //t.data[i][j] += a.data[i][k]*b.data[k][j];
-                sum = fmaf(a.data[i][k], b.data[k][j], sum);
+                sum = muladd<Type>(a.data[i][k], b.data[k][j], sum);
             }
             t.data[i][j] = sum;
@@ -683,7 +726,7 @@ inline CUDA_CALLABLE Type ddot(const mat_t<Rows,Cols,Type>& a, const mat_t<Rows,
     {
         for (unsigned j=0; j < Cols; ++j)
         {
-            r += a.data[i][j] * b.data[i][j];
+            r = muladd<Type>(a.data[i][j], b.data[i][j], r);
         }
     }
     return r;
@@ -1650,4 +1693,15 @@ inline CUDA_CALLABLE void adj_mat44(float m00, float m01, float m02, float m03,
     a33 += adj_ret.data[3][3];
 }
+template<unsigned Rows, unsigned Cols, typename Type>
+CUDA_CALLABLE inline int len(const mat_t<Rows,Cols,Type>& x)
+{
+    return Rows;
+}
+template<unsigned Rows, unsigned Cols, typename Type>
+CUDA_CALLABLE inline void adj_len(const mat_t<Rows,Cols,Type>& x, mat_t<Rows,Cols,Type>& adj_x, const int& adj_ret)
+{
+}
 } // namespace wp

warp/native/mathdx.cpp CHANGED Viewed

@@ -32,6 +32,7 @@ bool cuda_compile_fft(
 }
 WP_API bool cuda_compile_dot(
+                             const char* fatbin_output_path,
                              const char* ltoir_output_path,
                              const char* symbol_name,
                              int num_include_dirs,
@@ -54,6 +55,24 @@ WP_API bool cuda_compile_dot(
     return false;
 }
+WP_API bool cuda_compile_solver(
+                                const char* ltoir_output_path,
+                                const char* symbol_name,
+                                int num_include_dirs,
+                                const char** include_dirs,
+                                const char* mathdx_include_dir,
+                                int arch,
+                                int M,
+                                int N,
+                                int function,
+                                int precision,
+                                int fill_mode,
+                                int num_threads)
+{
+    printf("CUDA is disabled and/or Warp was not compiled with MathDx support.\n");
+    return false;
+}
 } // extern "C"
 #endif // !WP_ENABLE_CUDA || !WP_ENABLE_MATHDX

warp/native/mesh.cpp CHANGED Viewed

@@ -67,11 +67,28 @@ void bvh_refit_with_solid_angle_recursive_host(BVH& bvh, int index, Mesh& mesh)
     if (lower.b)
     {
         // Leaf, compute properties
-        const int leaf_index = lower.i;
-        precompute_triangle_solid_angle_props(mesh.points[mesh.indices[leaf_index*3+0]], mesh.points[mesh.indices[leaf_index*3+1]], mesh.points[mesh.indices[leaf_index*3+2]], mesh.solid_angle_props[index]);
-        (vec3&)lower = mesh.solid_angle_props[index].box.lower;
-        (vec3&)upper = mesh.solid_angle_props[index].box.upper;
+         const int start = lower.i;
+         const int end = upper.i;
+         // loops through primitives in the leaf
+         for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
+         {
+             int primitive_index = mesh.bvh.primitive_indices[primitive_counter];
+             if (primitive_counter == start)
+             {
+                 precompute_triangle_solid_angle_props(mesh.points[mesh.indices[primitive_index * 3 + 0]], mesh.points[mesh.indices[primitive_index * 3 + 1]],
+                     mesh.points[mesh.indices[primitive_index * 3 + 2]], mesh.solid_angle_props[index]);
+             }
+             else
+             {
+                 SolidAngleProps triangle_solid_angle_props;
+                 precompute_triangle_solid_angle_props(mesh.points[mesh.indices[primitive_index * 3 + 0]], mesh.points[mesh.indices[primitive_index * 3 + 1]],
+                     mesh.points[mesh.indices[primitive_index * 3 + 2]], triangle_solid_angle_props);
+                 mesh.solid_angle_props[index] = combine_precomputed_solid_angle_props(&mesh.solid_angle_props[index], &triangle_solid_angle_props);
+             }
+         }
+         (vec3&)lower = mesh.solid_angle_props[index].box.lower;
+         (vec3&)upper = mesh.solid_angle_props[index].box.upper;
     }
     else
     {
@@ -109,7 +126,7 @@ void bvh_refit_with_solid_angle_host(BVH& bvh, Mesh& mesh)
     bvh_refit_with_solid_angle_recursive_host(bvh, 0, mesh);
 }
-uint64_t mesh_create_host(array_t<wp::vec3> points, array_t<wp::vec3> velocities, array_t<int> indices, int num_points, int num_tris, int support_winding_number)
+uint64_t mesh_create_host(array_t<wp::vec3> points, array_t<wp::vec3> velocities, array_t<int> indices, int num_points, int num_tris, int support_winding_number, int constructor_type)
 {
     Mesh* m = new Mesh(points, velocities, indices, num_points, num_tris);
@@ -137,7 +154,7 @@ uint64_t mesh_create_host(array_t<wp::vec3> points, array_t<wp::vec3> velocities
     }
     m->average_edge_length = sum / (num_tris*3);
-    wp::bvh_create_host(m->lowers, m->uppers, num_tris, m->bvh);
+    wp::bvh_create_host(m->lowers, m->uppers, num_tris, constructor_type, m->bvh);
     if (support_winding_number)
     {
@@ -230,7 +247,7 @@ void mesh_set_velocities_host(uint64_t id, wp::array_t<wp::vec3> velocities)
 #if !WP_ENABLE_CUDA
-WP_API uint64_t mesh_create_device(void* context, wp::array_t<wp::vec3> points, wp::array_t<wp::vec3> velocities, wp::array_t<int> tris, int num_points, int num_tris, int support_winding_number) { return 0; }
+WP_API uint64_t mesh_create_device(void* context, wp::array_t<wp::vec3> points, wp::array_t<wp::vec3> velocities, wp::array_t<int> tris, int num_points, int num_tris, int support_winding_number, int constructor_type) { return 0; }
 WP_API void mesh_destroy_device(uint64_t id) {}
 WP_API void mesh_refit_device(uint64_t id) {}
 WP_API void mesh_set_points_device(uint64_t id, wp::array_t<wp::vec3> points) {};

warp/native/mesh.cu CHANGED Viewed

@@ -63,23 +63,61 @@ __global__ void compute_average_mesh_edge_length(int n, float* sum_edge_lengths,
     m->average_edge_length = sum_edge_lengths[n - 1] / (3*n);
 }
-__global__ void bvh_refit_with_solid_angle_kernel(int n, const int* __restrict__ parents, int* __restrict__ child_count, BVHPackedNodeHalf* __restrict__ lowers, BVHPackedNodeHalf* __restrict__ uppers, const vec3* points, const int* indices, SolidAngleProps* solid_angle_props)
+__global__ void bvh_refit_with_solid_angle_kernel(int n, const int* __restrict__ parents,
+    int* __restrict__ child_count, BVHPackedNodeHalf* __restrict__ node_lowers, BVHPackedNodeHalf* __restrict__ node_uppers,
+    const vec3* points, const int* indices, const int* primitive_indices, SolidAngleProps* solid_angle_props)
 {
     int index = blockDim.x*blockIdx.x + threadIdx.x;
     if (index < n)
     {
-        bool leaf = lowers[index].b;
+        bool leaf = node_lowers[index].b;
+        int parent = parents[index];
         if (leaf)
         {
+            BVHPackedNodeHalf& lower = node_lowers[index];
+            BVHPackedNodeHalf& upper = node_uppers[index];
             // update the leaf node
-            const int leaf_index = lowers[index].i;
-            precompute_triangle_solid_angle_props(points[indices[leaf_index*3+0]], points[indices[leaf_index*3+1]], points[indices[leaf_index*3+2]], solid_angle_props[index]);
+            bool true_leaf = true;
+            if (parent != -1)
+            {
+                true_leaf = !node_lowers[parent].b;
+            }
-            make_node(lowers+index, solid_angle_props[index].box.lower, leaf_index, true);
-            make_node(uppers+index, solid_angle_props[index].box.upper, 0, false);
+            if (true_leaf)
+            {
+                SolidAngleProps node_solid_angle_props;
+                const int start = lower.i;
+                const int end = upper.i;
+                // loops through primitives in the leaf
+                for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
+                {
+                    int primitive_index = primitive_indices[primitive_counter];
+                    if (primitive_counter == start)
+                    {
+                        precompute_triangle_solid_angle_props(points[indices[primitive_index * 3 + 0]], points[indices[primitive_index * 3 + 1]],
+                            points[indices[primitive_index * 3 + 2]], node_solid_angle_props);
+                    }
+                    else
+                    {
+                        SolidAngleProps triangle_solid_angle_props;
+                        precompute_triangle_solid_angle_props(points[indices[primitive_index * 3 + 0]], points[indices[primitive_index * 3 + 1]],
+                            points[indices[primitive_index * 3 + 2]], triangle_solid_angle_props);
+                        node_solid_angle_props = combine_precomputed_solid_angle_props(&node_solid_angle_props, &triangle_solid_angle_props);
+                    }
+                }
+                (vec3&)lower = node_solid_angle_props.box.lower;
+                (vec3&)upper = node_solid_angle_props.box.upper;
+                solid_angle_props[index] = node_solid_angle_props;
+            }
         }
         else
         {
             // only keep leaf threads
@@ -89,7 +127,7 @@ __global__ void bvh_refit_with_solid_angle_kernel(int n, const int* __restrict__
         // update hierarchy
         for (;;)
         {
-            int parent = parents[index];
+            parent = parents[index];
             // reached root
             if (parent == -1)
@@ -104,41 +142,74 @@ __global__ void bvh_refit_with_solid_angle_kernel(int n, const int* __restrict__
             // then update its bounds and move onto the next parent in the hierarchy
             if (finished == 1)
             {
-                //printf("Compute non-leaf at %d\n", index);
-                const int left_child = lowers[parent].i;
-                const int right_child = uppers[parent].i;
-                vec3 left_lower = vec3(lowers[left_child].x,
-                                       lowers[left_child].y,
-                                       lowers[left_child].z);
-                vec3 left_upper = vec3(uppers[left_child].x,
-                                       uppers[left_child].y,
-                                       uppers[left_child].z);
-                vec3 right_lower = vec3(lowers[right_child].x,
-                                       lowers[right_child].y,
-                                       lowers[right_child].z);
-                vec3 right_upper = vec3(uppers[right_child].x,
-                                       uppers[right_child].y,
-                                       uppers[right_child].z);
-                // union of child bounds
-                vec3 lower = min(left_lower, right_lower);
-                vec3 upper = max(left_upper, right_upper);
-                // write new BVH nodes
-                make_node(lowers+parent, lower, left_child, false);
-                make_node(uppers+parent, upper, right_child, false);
-                // combine
-                SolidAngleProps* left_child_data = &solid_angle_props[left_child];
-                SolidAngleProps* right_child_data = (left_child != right_child) ? &solid_angle_props[right_child] : NULL;
-                combine_precomputed_solid_angle_props(solid_angle_props[parent], left_child_data, right_child_data);
+                BVHPackedNodeHalf& parent_lower = node_lowers[parent];
+                BVHPackedNodeHalf& parent_upper = node_uppers[parent];
+                if (parent_lower.b)
+                    // a packed leaf node can still be a parent in LBVH, we need to recompute its bounds
+                    // since we've lost its left and right child node index in the muting process
+                {
+                    int parent_parent = parents[parent];;
+                    // only need to compute bound when this is a valid leaf node
+                    bool true_leaf = true;
+                    if (parent_parent != -1)
+                    {
+                        true_leaf = !node_lowers[parent_parent].b;
+                    }
+                    if (true_leaf)
+                    {
+                        SolidAngleProps node_solid_angle_props;
+                        const int start = parent_lower.i;
+                        const int end = parent_upper.i;
+                        // loops through primitives in the leaf
+                        for (int primitive_counter = start; primitive_counter < end; primitive_counter++)
+                        {
+                            int primitive_index = primitive_indices[primitive_counter];
+                            if (primitive_counter == start)
+                            {
+                                precompute_triangle_solid_angle_props(points[indices[primitive_index * 3 + 0]], points[indices[primitive_index * 3 + 1]],
+                                    points[indices[primitive_index * 3 + 2]], node_solid_angle_props);
+                            }
+                            else
+                            {
+                                SolidAngleProps triangle_solid_angle_props;
+                                precompute_triangle_solid_angle_props(points[indices[primitive_index * 3 + 0]], points[indices[primitive_index * 3 + 1]],
+                                    points[indices[primitive_index * 3 + 2]], triangle_solid_angle_props);
+                                node_solid_angle_props = combine_precomputed_solid_angle_props(&node_solid_angle_props, &triangle_solid_angle_props);
+                            }
+                        }
+                        (vec3&)parent_lower = node_solid_angle_props.box.lower;
+                        (vec3&)parent_upper = node_solid_angle_props.box.upper;
+                        solid_angle_props[parent] = node_solid_angle_props;
+                    }
+                }
+                else
+                {
+                    //printf("Compute non-leaf at %d\n", index);
+                    const int left_child = node_lowers[parent].i;
+                    const int right_child = node_uppers[parent].i;
+                    vec3 left_lower = (vec3&)(node_lowers[left_child]);
+                    vec3 left_upper = (vec3&)(node_uppers[left_child]);
+                    vec3 right_lower = (vec3&)(node_lowers[right_child]);
+                    vec3 right_upper = (vec3&)(node_uppers[right_child]);
+                    // union of child bounds
+                    vec3 lower = min(left_lower, right_lower);
+                    vec3 upper = max(left_upper, right_upper);
+                    // write new BVH nodes
+                    (vec3&)parent_lower = lower;
+                    (vec3&)parent_upper = upper;
+                    // combine
+                    SolidAngleProps* left_child_data = &solid_angle_props[left_child];
+                    SolidAngleProps* right_child_data = (left_child != right_child) ? &solid_angle_props[right_child] : NULL;
+                    combine_precomputed_solid_angle_props(solid_angle_props[parent], left_child_data, right_child_data);
+                }
                 // move onto processing the parent
                 index = parent;
             }
@@ -157,15 +228,15 @@ void bvh_refit_with_solid_angle_device(BVH& bvh, Mesh& mesh)
     ContextGuard guard(bvh.context);
     // clear child counters
-    memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int)*bvh.max_nodes);
-    wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_with_solid_angle_kernel, bvh.num_items, (bvh.num_items, bvh.node_parents, bvh.node_counts, bvh.node_lowers, bvh.node_uppers, mesh.points, mesh.indices, mesh.solid_angle_props));
+    memset_device(WP_CURRENT_CONTEXT, bvh.node_counts, 0, sizeof(int) * bvh.max_nodes);
+    wp_launch_device(WP_CURRENT_CONTEXT, bvh_refit_with_solid_angle_kernel, bvh.num_leaf_nodes,
+        (bvh.num_leaf_nodes, bvh.node_parents, bvh.node_counts, bvh.node_lowers, bvh.node_uppers, mesh.points, mesh.indices, bvh.primitive_indices, mesh.solid_angle_props));
 }
 } // namespace wp
-uint64_t mesh_create_device(void* context, wp::array_t<wp::vec3> points, wp::array_t<wp::vec3> velocities, wp::array_t<int> indices, int num_points, int num_tris, int support_winding_number)
+uint64_t mesh_create_device(void* context, wp::array_t<wp::vec3> points, wp::array_t<wp::vec3> velocities, wp::array_t<int> indices, int num_points, int num_tris, int support_winding_number, int constructor_type)
 {
     ContextGuard guard(context);
@@ -173,55 +244,38 @@ uint64_t mesh_create_device(void* context, wp::array_t<wp::vec3> points, wp::arr
     mesh.context = context ? context : cuda_context_get_current();
+    // create lower upper arrays expected by GPU BVH builder
+    mesh.lowers = (wp::vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::vec3)*num_tris);
+    mesh.uppers = (wp::vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::vec3)*num_tris);
+    if (support_winding_number)
     {
-        // // todo: BVH creation only on CPU at the moment so temporarily bring all the data back to host
-        // vec3* points_host = (vec3*)alloc_host(sizeof(vec3)*num_points);
-        // int* indices_host = (int*)alloc_host(sizeof(int)*num_tris*3);
-        // bounds3* bounds_host = (bounds3*)alloc_host(sizeof(bounds3)*num_tris);
-        // memcpy_d2h(WP_CURRENT_CONTEXT, points_host, points, sizeof(vec3)*num_points);
-        // memcpy_d2h(WP_CURRENT_CONTEXT, indices_host, indices, sizeof(int)*num_tris*3);
-        // cuda_context_synchronize(WP_CURRENT_CONTEXT);
-        // float sum = 0.0;
-        // for (int i=0; i < num_tris; ++i)
-        // {
-        //     bounds_host[i] = bounds3();
-        //     wp::vec3 p0 = points_host[indices_host[i*3+0]];
-        //     wp::vec3 p1 = points_host[indices_host[i*3+1]];
-        //     wp::vec3 p2 = points_host[indices_host[i*3+2]];
-        //     bounds_host[i].add_point(p0);
-        //     bounds_host[i].add_point(p1);
-        //     bounds_host[i].add_point(p2);
-        //     sum += length(p0-p1) + length(p0-p2) + length(p2-p1);
-        // }
-        // mesh.average_edge_length = sum / (num_tris*3);
-        // BVH bvh_host = bvh_create(bounds_host, num_tris);
-        // BVH bvh_device = bvh_clone(WP_CURRENT_CONTEXT, bvh_host);
-        // bvh_destroy_host(bvh_host);
-        // create lower upper arrays expected by GPU BVH builder
-        mesh.lowers = (wp::vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::vec3)*num_tris);
-        mesh.uppers = (wp::vec3*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::vec3)*num_tris);
-        wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_triangle_bounds, num_tris, (num_tris, points.data, indices.data, mesh.lowers, mesh.uppers));
-        wp::bvh_create_device(mesh.context, mesh.lowers, mesh.uppers, num_tris, mesh.bvh);
-        if (support_winding_number)
-        {
-            int num_bvh_nodes = 2*num_tris;
-            mesh.solid_angle_props = (wp::SolidAngleProps*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::SolidAngleProps)*num_bvh_nodes);
-        }
+        int num_bvh_nodes = 2 * num_tris;
+        mesh.solid_angle_props = (wp::SolidAngleProps*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::SolidAngleProps) * num_bvh_nodes);
     }
     wp::Mesh* mesh_device = (wp::Mesh*)alloc_device(WP_CURRENT_CONTEXT, sizeof(wp::Mesh));
     memcpy_h2d(WP_CURRENT_CONTEXT, mesh_device, &mesh, sizeof(wp::Mesh));
     // save descriptor
     uint64_t mesh_id = (uint64_t)mesh_device;
+    // we compute mesh the average edge length
+    // for use in mesh_query_point_sign_normal()
+    // since it relies on an epsilon for welding
+    // reuse bounds memory temporarily for computing edge lengths
+    float* length_tmp_ptr = (float*)mesh.lowers;
+    wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_mesh_edge_lengths, mesh.num_tris, (mesh.num_tris, mesh.points, mesh.indices, length_tmp_ptr));
+    scan_device(length_tmp_ptr, length_tmp_ptr, mesh.num_tris, true);
+    wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_average_mesh_edge_length, 1, (mesh.num_tris, length_tmp_ptr, mesh_device));
+    // compute triangle bound and construct BVH
+    wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_triangle_bounds, mesh.num_tris, (mesh.num_tris, mesh.points, mesh.indices, mesh.lowers, mesh.uppers));
+    wp::bvh_create_device(mesh.context, mesh.lowers, mesh.uppers, num_tris, constructor_type, mesh.bvh);
+    // we need to overwrite mesh.bvh because it is not initialized when we construct it on device
+    memcpy_h2d(WP_CURRENT_CONTEXT, &(mesh_device->bvh), &mesh.bvh, sizeof(wp::BVH));
     mesh_add_descriptor(mesh_id, mesh);
     if (support_winding_number)
@@ -263,23 +317,21 @@ void mesh_refit_device(uint64_t id)
     {
         ContextGuard guard(m.context);
+        // we compute mesh the average edge length
+        // for use in mesh_query_point_sign_normal()
+        // since it relies on an epsilon for welding
+        // reuse bounds memory temporarily for computing edge lengths
+        float* length_tmp_ptr = (float*)m.lowers;
+        wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_mesh_edge_lengths, m.num_tris, (m.num_tris, m.points, m.indices, length_tmp_ptr));
+        scan_device(length_tmp_ptr, length_tmp_ptr, m.num_tris, true);
+        wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_average_mesh_edge_length, 1, (m.num_tris, length_tmp_ptr, (wp::Mesh*)id));
         wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_triangle_bounds, m.num_tris, (m.num_tris, m.points, m.indices, m.lowers, m.uppers));
         if (m.solid_angle_props)
         {
-            // we compute mesh the average edge length
-            // for use in mesh_query_point_sign_normal()
-            // since it relies on an epsilon for welding
-            // reuse bounds memory temporarily for computing edge lengths
-            float* length_tmp_ptr = (float*)m.lowers;
-            wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_mesh_edge_lengths, m.num_tris, (m.num_tris, m.points, m.indices, length_tmp_ptr));
-            scan_device(length_tmp_ptr, length_tmp_ptr, m.num_tris, true);
-            wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_average_mesh_edge_length, 1, (m.num_tris, length_tmp_ptr, (wp::Mesh*)id));
-            wp_launch_device(WP_CURRENT_CONTEXT, wp::compute_triangle_bounds, m.num_tris, (m.num_tris, m.points, m.indices, m.lowers, m.uppers));
             // update solid angle data
             bvh_refit_with_solid_angle_device(m.bvh, m);
         }