warp-lang 1.2.2-py3-none-win_amd64.whl → 1.3.0-py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of warp-lang might be problematic.
- warp/__init__.py +8 -6
- warp/autograd.py +823 -0
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +6 -2
- warp/builtins.py +1410 -886
- warp/codegen.py +503 -166
- warp/config.py +48 -18
- warp/context.py +400 -198
- warp/dlpack.py +8 -0
- warp/examples/assets/bunny.usd +0 -0
- warp/examples/benchmarks/benchmark_cloth_warp.py +1 -1
- warp/examples/benchmarks/benchmark_interop_torch.py +158 -0
- warp/examples/benchmarks/benchmark_launches.py +1 -1
- warp/examples/core/example_cupy.py +78 -0
- warp/examples/fem/example_apic_fluid.py +17 -36
- warp/examples/fem/example_burgers.py +9 -18
- warp/examples/fem/example_convection_diffusion.py +7 -17
- warp/examples/fem/example_convection_diffusion_dg.py +27 -47
- warp/examples/fem/example_deformed_geometry.py +11 -22
- warp/examples/fem/example_diffusion.py +7 -18
- warp/examples/fem/example_diffusion_3d.py +24 -28
- warp/examples/fem/example_diffusion_mgpu.py +7 -14
- warp/examples/fem/example_magnetostatics.py +190 -0
- warp/examples/fem/example_mixed_elasticity.py +111 -80
- warp/examples/fem/example_navier_stokes.py +30 -34
- warp/examples/fem/example_nonconforming_contact.py +290 -0
- warp/examples/fem/example_stokes.py +17 -32
- warp/examples/fem/example_stokes_transfer.py +12 -21
- warp/examples/fem/example_streamlines.py +350 -0
- warp/examples/fem/utils.py +936 -0
- warp/fabric.py +5 -2
- warp/fem/__init__.py +13 -3
- warp/fem/cache.py +161 -11
- warp/fem/dirichlet.py +37 -28
- warp/fem/domain.py +105 -14
- warp/fem/field/__init__.py +14 -3
- warp/fem/field/field.py +454 -11
- warp/fem/field/nodal_field.py +33 -18
- warp/fem/geometry/deformed_geometry.py +50 -15
- warp/fem/geometry/hexmesh.py +12 -24
- warp/fem/geometry/nanogrid.py +106 -31
- warp/fem/geometry/quadmesh_2d.py +6 -11
- warp/fem/geometry/tetmesh.py +103 -61
- warp/fem/geometry/trimesh_2d.py +98 -47
- warp/fem/integrate.py +231 -186
- warp/fem/operator.py +14 -9
- warp/fem/quadrature/pic_quadrature.py +35 -9
- warp/fem/quadrature/quadrature.py +119 -32
- warp/fem/space/basis_space.py +98 -22
- warp/fem/space/collocated_function_space.py +3 -1
- warp/fem/space/function_space.py +7 -2
- warp/fem/space/grid_2d_function_space.py +3 -3
- warp/fem/space/grid_3d_function_space.py +4 -4
- warp/fem/space/hexmesh_function_space.py +3 -2
- warp/fem/space/nanogrid_function_space.py +12 -14
- warp/fem/space/partition.py +45 -47
- warp/fem/space/restriction.py +19 -16
- warp/fem/space/shape/cube_shape_function.py +91 -3
- warp/fem/space/shape/shape_function.py +7 -0
- warp/fem/space/shape/square_shape_function.py +32 -0
- warp/fem/space/shape/tet_shape_function.py +11 -7
- warp/fem/space/shape/triangle_shape_function.py +10 -1
- warp/fem/space/topology.py +116 -42
- warp/fem/types.py +8 -1
- warp/fem/utils.py +301 -83
- warp/native/array.h +16 -0
- warp/native/builtin.h +0 -15
- warp/native/cuda_util.cpp +14 -6
- warp/native/exports.h +1348 -1308
- warp/native/quat.h +79 -0
- warp/native/rand.h +27 -4
- warp/native/sparse.cpp +83 -81
- warp/native/sparse.cu +381 -453
- warp/native/vec.h +64 -0
- warp/native/volume.cpp +40 -49
- warp/native/volume_builder.cu +2 -3
- warp/native/volume_builder.h +12 -17
- warp/native/warp.cu +3 -3
- warp/native/warp.h +69 -59
- warp/render/render_opengl.py +17 -9
- warp/sim/articulation.py +117 -17
- warp/sim/collide.py +35 -29
- warp/sim/model.py +123 -18
- warp/sim/render.py +3 -1
- warp/sparse.py +867 -203
- warp/stubs.py +312 -541
- warp/tape.py +29 -1
- warp/tests/disabled_kinematics.py +1 -1
- warp/tests/test_adam.py +1 -1
- warp/tests/test_arithmetic.py +1 -1
- warp/tests/test_array.py +58 -1
- warp/tests/test_array_reduce.py +1 -1
- warp/tests/test_async.py +1 -1
- warp/tests/test_atomic.py +1 -1
- warp/tests/test_bool.py +1 -1
- warp/tests/test_builtins_resolution.py +1 -1
- warp/tests/test_bvh.py +6 -1
- warp/tests/test_closest_point_edge_edge.py +1 -1
- warp/tests/test_codegen.py +66 -1
- warp/tests/test_compile_consts.py +1 -1
- warp/tests/test_conditional.py +1 -1
- warp/tests/test_copy.py +1 -1
- warp/tests/test_ctypes.py +1 -1
- warp/tests/test_dense.py +1 -1
- warp/tests/test_devices.py +1 -1
- warp/tests/test_dlpack.py +1 -1
- warp/tests/test_examples.py +33 -4
- warp/tests/test_fabricarray.py +5 -2
- warp/tests/test_fast_math.py +1 -1
- warp/tests/test_fem.py +213 -6
- warp/tests/test_fp16.py +1 -1
- warp/tests/test_func.py +1 -1
- warp/tests/test_future_annotations.py +90 -0
- warp/tests/test_generics.py +1 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_grad_customs.py +1 -1
- warp/tests/test_grad_debug.py +247 -0
- warp/tests/test_hash_grid.py +6 -1
- warp/tests/test_implicit_init.py +354 -0
- warp/tests/test_import.py +1 -1
- warp/tests/test_indexedarray.py +1 -1
- warp/tests/test_intersect.py +1 -1
- warp/tests/test_jax.py +1 -1
- warp/tests/test_large.py +1 -1
- warp/tests/test_launch.py +1 -1
- warp/tests/test_lerp.py +1 -1
- warp/tests/test_linear_solvers.py +1 -1
- warp/tests/test_lvalue.py +1 -1
- warp/tests/test_marching_cubes.py +5 -2
- warp/tests/test_mat.py +34 -35
- warp/tests/test_mat_lite.py +2 -1
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_math.py +1 -1
- warp/tests/test_matmul.py +20 -16
- warp/tests/test_matmul_lite.py +1 -1
- warp/tests/test_mempool.py +1 -1
- warp/tests/test_mesh.py +5 -2
- warp/tests/test_mesh_query_aabb.py +1 -1
- warp/tests/test_mesh_query_point.py +1 -1
- warp/tests/test_mesh_query_ray.py +1 -1
- warp/tests/test_mlp.py +1 -1
- warp/tests/test_model.py +1 -1
- warp/tests/test_module_hashing.py +77 -1
- warp/tests/test_modules_lite.py +1 -1
- warp/tests/test_multigpu.py +1 -1
- warp/tests/test_noise.py +1 -1
- warp/tests/test_operators.py +1 -1
- warp/tests/test_options.py +1 -1
- warp/tests/test_overwrite.py +542 -0
- warp/tests/test_peer.py +1 -1
- warp/tests/test_pinned.py +1 -1
- warp/tests/test_print.py +1 -1
- warp/tests/test_quat.py +15 -1
- warp/tests/test_rand.py +1 -1
- warp/tests/test_reload.py +1 -1
- warp/tests/test_rounding.py +1 -1
- warp/tests/test_runlength_encode.py +1 -1
- warp/tests/test_scalar_ops.py +95 -0
- warp/tests/test_sim_grad.py +1 -1
- warp/tests/test_sim_kinematics.py +1 -1
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +82 -15
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_special_values.py +2 -11
- warp/tests/test_streams.py +11 -1
- warp/tests/test_struct.py +1 -1
- warp/tests/test_tape.py +1 -1
- warp/tests/test_torch.py +194 -1
- warp/tests/test_transient_module.py +1 -1
- warp/tests/test_types.py +1 -1
- warp/tests/test_utils.py +1 -1
- warp/tests/test_vec.py +15 -63
- warp/tests/test_vec_lite.py +2 -1
- warp/tests/test_vec_scalar_ops.py +65 -1
- warp/tests/test_verify_fp.py +1 -1
- warp/tests/test_volume.py +28 -2
- warp/tests/test_volume_write.py +1 -1
- warp/tests/unittest_serial.py +1 -1
- warp/tests/unittest_suites.py +9 -1
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +2 -5
- warp/torch.py +103 -41
- warp/types.py +341 -224
- warp/utils.py +11 -2
- {warp_lang-1.2.2.dist-info → warp_lang-1.3.0.dist-info}/METADATA +99 -46
- warp_lang-1.3.0.dist-info/RECORD +368 -0
- warp/examples/fem/bsr_utils.py +0 -378
- warp/examples/fem/mesh_utils.py +0 -133
- warp/examples/fem/plot_utils.py +0 -292
- warp_lang-1.2.2.dist-info/RECORD +0 -359
- {warp_lang-1.2.2.dist-info → warp_lang-1.3.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.2.2.dist-info → warp_lang-1.3.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.2.2.dist-info → warp_lang-1.3.0.dist-info}/top_level.txt +0 -0
warp/native/quat.h
CHANGED
@@ -36,6 +36,42 @@ struct quat_t
 
     // real part
     Type w;
+
+    inline CUDA_CALLABLE Type operator[](int index) const
+    {
+        switch (index)
+        {
+        case 0:
+            return x;
+        case 1:
+            return y;
+        case 2:
+            return z;
+        case 3:
+            return w;
+        default:
+            assert(0);
+            return x;
+        }
+    }
+
+    inline CUDA_CALLABLE Type& operator[](int index)
+    {
+        switch (index)
+        {
+        case 0:
+            return x;
+        case 1:
+            return y;
+        case 2:
+            return z;
+        case 3:
+            return w;
+        default:
+            assert(0);
+            return x;
+        }
+    }
 };
 
 using quat = quat_t<float>;
@@ -400,6 +436,49 @@ inline CUDA_CALLABLE Type extract(const quat_t<Type>& a, int idx)
     else {return a.w;}
 }
 
+template<typename Type>
+inline CUDA_CALLABLE Type* index(quat_t<Type>& q, int idx)
+{
+#ifndef NDEBUG
+    if (idx < 0 || idx > 3)
+    {
+        printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
+    return &q[idx];
+}
+
+template<typename Type>
+inline CUDA_CALLABLE Type* indexref(quat_t<Type>* q, int idx)
+{
+#ifndef NDEBUG
+    if (idx < 0 || idx > 3)
+    {
+        printf("quat store %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
+    return &((*q)[idx]);
+}
+
+template<typename Type>
+inline CUDA_CALLABLE void adj_index(quat_t<Type>& q, int idx,
+                                    quat_t<Type>& adj_q, int adj_idx, const Type& adj_value)
+{
+    // nop
+}
+
+
+template<typename Type>
+inline CUDA_CALLABLE void adj_indexref(quat_t<Type>* q, int idx,
+                                       quat_t<Type>& adj_q, int adj_idx, const Type& adj_value)
+{
+    // nop
+}
+
 template<typename Type>
 CUDA_CALLABLE inline quat_t<Type> lerp(const quat_t<Type>& a, const quat_t<Type>& b, Type t)
 {
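The two `operator[]` overloads above give `quat_t` components positional access (x, y, z at indices 0-2, the real part w at index 3), which the new `index`/`indexref` helpers and their no-op adjoints build on for element reads and writes in generated kernels. A minimal host-only sketch of that indexing contract follows; it uses a hypothetical stand-in struct rather than the actual Warp header, so `CUDA_CALLABLE` is dropped and the name `quat_like` is made up here.

#include <cassert>
#include <cstdio>

// Stand-in mirroring the indexing added to quat_t in this diff;
// x, y, z are the imaginary parts, w the real part.
template <typename Type>
struct quat_like
{
    Type x, y, z, w;

    Type& operator[](int index)
    {
        switch (index)
        {
        case 0: return x;
        case 1: return y;
        case 2: return z;
        case 3: return w;
        default: assert(0); return x;
        }
    }
};

int main()
{
    quat_like<float> q{0.f, 0.f, 0.f, 1.f}; // identity-like values
    q[2] = 0.5f;                            // write through the reference overload
    std::printf("z = %f, w = %f\n", q[2], q[3]);
    return 0;
}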
warp/native/rand.h
CHANGED
@@ -13,13 +13,24 @@
 #define M_PI_F 3.14159265358979323846f
 #endif
 
-
-
+/*
+ * Please first read the randf comment. randf returns values uniformly distributed in the range [0.f, 1.f - 2.^-24] in equal intervals of size 2.^-24.
+ * randn computes sqrt(-2.f * log(x)). For this to return a real value, log(x) < 0.f (we exclude 0.f as a precaution) and therefore x < 1.f.
+ * For it to be finite, x > 0.f. So x must be in (0.f, 1.f). We define RANDN_EPSILON to be 2^-24 truncated to 5.96e-8f and add it to the range of randf,
+ * giving the domain [RANDN_EPSILON, 1.f - 2.^-24 + RAND_EPSILON] which satisfies the requirement that x is in (0.f, 1.f).
+ */
+
+#ifndef RANDN_EPSILON
+#define RANDN_EPSILON 5.96e-8f
 #endif
 
 namespace wp
 {
 
+/*
+ * Mark Jarzynski and Marc Olano, Hash Functions for GPU Rendering, Journal of Computer
+ * Graphics Techniques (JCGT), vol. 9, no. 3, 20–38, 2020
+ */
 inline CUDA_CALLABLE uint32 rand_pcg(uint32 state)
 {
     uint32 b = state * 747796405u + 2891336453u;
@@ -33,11 +44,20 @@ inline CUDA_CALLABLE uint32 rand_init(int seed, int offset) { return rand_pcg(ui
 inline CUDA_CALLABLE int randi(uint32& state) { state = rand_pcg(state); return int(state); }
 inline CUDA_CALLABLE int randi(uint32& state, int min, int max) { state = rand_pcg(state); return state % (max - min) + min; }
 
+/*
+ * We want to ensure randf adheres to a uniform distribution over [0,1). The set of all possible float32 (IEEE 754 standard) values is not uniformly distributed however.
+ * On the other hand, for a given sign and exponent, the mantissa of the float32 representation is uniformly distributed.
+ * Fixing an exponent of -1, we can craft a uniform distribution using the sign bit and 23-bit mantissa that spans the domain [0, 1) in 2^24 equal intervals.
+ * We can map 2^24 unique unsigned integers to these 2^24 intervals, so if our random number generator returns values in the range [0, 2^24) without bias,
+ * we can ensure that our float distribution in the range [0, 1) is also without bias.
+ * Our random number generator returns values in the range [0, 2^32), so we bit shift a random unsigned int 8 places, and then make the assumption that the remaining bit strings
+ * are uniformly distributed. After dividing by 2.^24, randf returns values uniformly distributed in the range [0.f, 1.f - 2.^-24].
+ */
 inline CUDA_CALLABLE float randf(uint32& state) { state = rand_pcg(state); return (state >> 8) * (1.0f / 16777216.0f); }
 inline CUDA_CALLABLE float randf(uint32& state, float min, float max) { return (max - min) * randf(state) + min; }
 
 // Box-Muller method
-inline CUDA_CALLABLE float randn(uint32& state) { return sqrt(-2.f * log(randf(state) +
+inline CUDA_CALLABLE float randn(uint32& state) { return sqrt(-2.f * log(randf(state) + RANDN_EPSILON)) * cos(2.f * M_PI_F * randf(state)); }
 
 inline CUDA_CALLABLE void adj_rand_init(int seed, int& adj_seed, float adj_ret) {}
 inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int& adj_offset, float adj_ret) {}
@@ -56,6 +76,9 @@ inline CUDA_CALLABLE int sample_cdf(uint32& state, const array_t<float>& cdf)
     return lower_bound<float>(cdf, u);
 }
 
+/*
+ * uniform sampling methods for various geometries
+ */
 inline CUDA_CALLABLE vec2 sample_triangle(uint32& state)
 {
     float r = sqrt(randf(state));
@@ -301,4 +324,4 @@ inline CUDA_CALLABLE void random_poisson_mult(uint32& state, float lam, uint32&
 inline CUDA_CALLABLE void adj_random_poisson(uint32& state, float lam, uint32& adj_state, float& adj_lam, const uint32& adj_ret) {}
 inline CUDA_CALLABLE void adj_poisson(uint32& state, float lam, uint32& adj_state, float& adj_lam, const uint32& adj_ret) {}
 
-} // namespace wp
+} // namespace wp
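The new comments pin down the distribution contract: randf keeps the top 24 bits of the PCG output so each draw lands on one of 2^24 equally spaced values in [0, 1), and randn shifts its input by RANDN_EPSILON so log() never sees 0. The following self-contained check exercises both claims. Only the first line of rand_pcg appears in the diff; the remainder of the hash is assumed here from the cited Jarzynski-Olano paper.

#include <cmath>
#include <cstdint>
#include <cstdio>

// PCG hash; the first line matches the diff, the rest is assumed from
// Jarzynski & Olano, JCGT 9(3), 2020.
static uint32_t rand_pcg(uint32_t state)
{
    uint32_t b = state * 747796405u + 2891336453u;
    uint32_t c = ((b >> ((b >> 28u) + 4u)) ^ b) * 277803737u;
    return (c >> 22u) ^ c;
}

// randf as in the diff: a 24-bit value scaled by 2^-24.
static float randf(uint32_t& state)
{
    state = rand_pcg(state);
    return (state >> 8) * (1.0f / 16777216.0f);
}

// randn as in the diff, with RANDN_EPSILON = 2^-24 truncated to 5.96e-8f.
static float randn(uint32_t& state)
{
    return sqrtf(-2.f * logf(randf(state) + 5.96e-8f)) * cosf(2.f * 3.14159265f * randf(state));
}

int main()
{
    uint32_t state = rand_pcg(42u); // arbitrary seed
    float lo = 1.f, hi = 0.f;
    bool all_finite = true;
    for (int i = 0; i < 1000000; ++i)
    {
        float u = randf(state);
        lo = u < lo ? u : lo;
        hi = u > hi ? u : hi;
        all_finite &= std::isfinite(randn(state));
    }
    // hi can never reach 1.0f: the largest draw is (2^24 - 1) / 2^24.
    std::printf("min %.9f max %.9f all randn finite: %d\n", lo, hi, int(all_finite));
    return 0;
}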
warp/native/sparse.cpp
CHANGED
@@ -10,17 +10,17 @@ namespace
 // Specialized is_zero and accumulation function for common block sizes
 // Rely on compiler to unroll loops when block size is known
 
-template <int N, typename T> bool bsr_fixed_block_is_zero(const T *val, int value_size)
+template <int N, typename T> bool bsr_fixed_block_is_zero(const T* val, int value_size)
 {
     return std::all_of(val, val + N, [](float v) { return v == T(0); });
 }
 
-template <typename T> bool bsr_dyn_block_is_zero(const T *val, int value_size)
+template <typename T> bool bsr_dyn_block_is_zero(const T* val, int value_size)
 {
     return std::all_of(val, val + value_size, [](float v) { return v == T(0); });
 }
 
-template <int N, typename T> void bsr_fixed_block_accumulate(const T *val, T *sum, int value_size)
+template <int N, typename T> void bsr_fixed_block_accumulate(const T* val, T* sum, int value_size)
 {
     for (int i = 0; i < N; ++i, ++val, ++sum)
     {
@@ -28,7 +28,7 @@ template <int N, typename T> void bsr_fixed_block_accumulate(const T *val, T *sum, int value_size)
     }
 }
 
-template <typename T> void bsr_dyn_block_accumulate(const T *val, T *sum, int value_size)
+template <typename T> void bsr_dyn_block_accumulate(const T* val, T* sum, int value_size)
 {
     for (int i = 0; i < value_size; ++i, ++val, ++sum)
     {
@@ -37,7 +37,7 @@ template <typename T> void bsr_dyn_block_accumulate(const T *val, T *sum, int value_size)
 }
 
 template <int Rows, int Cols, typename T>
-void bsr_fixed_block_transpose(const T *src, T *dest, int row_count, int col_count)
+void bsr_fixed_block_transpose(const T* src, T* dest, int row_count, int col_count)
 {
     for (int r = 0; r < Rows; ++r)
     {
@@ -48,7 +48,7 @@ void bsr_fixed_block_transpose(const T *src, T *dest, int row_count, int col_count)
     }
 }
 
-template <typename T> void bsr_dyn_block_transpose(const T *src, T *dest, int row_count, int col_count)
+template <typename T> void bsr_dyn_block_transpose(const T* src, T* dest, int row_count, int col_count)
 {
     for (int r = 0; r < row_count; ++r)
     {
@@ -63,15 +63,15 @@ template <typename T> void bsr_dyn_block_transpose(const T *src, T *dest, int row_count, int col_count)
 
 template <typename T>
 int bsr_matrix_from_triplets_host(const int rows_per_block, const int cols_per_block, const int row_count,
-                                  const int nnz, const int *tpl_rows, const int *tpl_columns, const T *tpl_values,
-                                  int *bsr_offsets, int *bsr_columns, T *bsr_values)
+                                  const int nnz, const int* tpl_rows, const int* tpl_columns, const T* tpl_values,
+                                  const bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns, T* bsr_values)
 {
 
     // get specialized accumulator for common block sizes (1,1), (1,2), (1,3),
     // (2,2), (2,3), (3,3)
     const int block_size = rows_per_block * cols_per_block;
-    void (*block_accumulate_func)(const T *, T *, int);
-    bool (*block_is_zero_func)(const T *, int);
+    void (*block_accumulate_func)(const T*, T*, int);
+    bool (*block_is_zero_func)(const T*, int);
     switch (block_size)
     {
     case 1:
@@ -106,20 +106,19 @@ int bsr_matrix_from_triplets_host(const int rows_per_block, const int cols_per_block, const int row_count,
     std::vector<int> block_indices(nnz);
     std::iota(block_indices.begin(), block_indices.end(), 0);
 
-    // remove zero
-
-
-
-                                       [
-
-
-
-
+    // remove zero blocks and invalid row indices
+    block_indices.erase(std::remove_if(block_indices.begin(), block_indices.end(),
+                                       [&](int i)
+                                       {
+                                           return tpl_rows[i] < 0 || tpl_rows[i] >= row_count ||
+                                                  (prune_numerical_zeros && tpl_values &&
+                                                   block_is_zero_func(tpl_values + i * block_size, block_size));
+                                       }),
+                        block_indices.end());
 
     // sort block indices according to lexico order
-    std::sort(block_indices.begin(), block_indices.end(), [tpl_rows, tpl_columns](int i, int j) -> bool
-
-              });
+    std::sort(block_indices.begin(), block_indices.end(), [tpl_rows, tpl_columns](int i, int j) -> bool
+              { return tpl_rows[i] < tpl_rows[j] || (tpl_rows[i] == tpl_rows[j] && tpl_columns[i] < tpl_columns[j]); });
 
     // accumulate blocks at same locations, count blocks per row
     std::fill_n(bsr_offsets, row_count + 1, 0);
@@ -138,7 +137,7 @@ int bsr_matrix_from_triplets_host(const int rows_per_block, const int cols_per_block, const int row_count,
         int idx = block_indices[i];
         int row = tpl_rows[idx];
         int col = tpl_columns[idx];
-        const T *val = tpl_values + idx * block_size;
+        const T* val = tpl_values + idx * block_size;
 
         if (row == current_row && col == current_col)
         {
@@ -171,14 +170,14 @@ int bsr_matrix_from_triplets_host(const int rows_per_block, const int cols_per_block, const int row_count,
 }
 
 template <typename T>
-void bsr_transpose_host(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz,
-                        const int *bsr_offsets, const int *bsr_columns, const T *bsr_values,
-                        int *transposed_bsr_offsets, int *transposed_bsr_columns, T *transposed_bsr_values)
+void bsr_transpose_host(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz_up,
+                        const int* bsr_offsets, const int* bsr_columns, const T* bsr_values,
+                        int* transposed_bsr_offsets, int* transposed_bsr_columns, T* transposed_bsr_values)
 {
-
+    const int nnz = bsr_offsets[row_count];
     const int block_size = rows_per_block * cols_per_block;
 
-    void (*block_transpose_func)(const T *, T *, int, int) = bsr_dyn_block_transpose<T>;
+    void (*block_transpose_func)(const T*, T*, int, int) = bsr_dyn_block_transpose<T>;
     switch (rows_per_block)
     {
     case 1:
@@ -235,9 +234,9 @@ void bsr_transpose_host(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz,
     }
 
     // sort block indices according to (transposed) lexico order
-    std::sort(
-
-
+    std::sort(
+        block_indices.begin(), block_indices.end(), [&bsr_rows, bsr_columns](int i, int j) -> bool
+        { return bsr_columns[i] < bsr_columns[j] || (bsr_columns[i] == bsr_columns[j] && bsr_rows[i] < bsr_rows[j]); });
 
     // Count blocks per column and transpose blocks
     std::fill_n(transposed_bsr_offsets, col_count + 1, 0);
@@ -251,88 +250,91 @@ void bsr_transpose_host(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz,
         ++transposed_bsr_offsets[col + 1];
         transposed_bsr_columns[i] = row;
 
-        const T *src_block = bsr_values + idx * block_size;
-        T *dst_block = transposed_bsr_values + i * block_size;
-        block_transpose_func(src_block, dst_block, rows_per_block, cols_per_block);
+        if (transposed_bsr_values != nullptr)
+        {
+            const T* src_block = bsr_values + idx * block_size;
+            T* dst_block = transposed_bsr_values + i * block_size;
+            block_transpose_func(src_block, dst_block, rows_per_block, cols_per_block);
+        }
     }
 
     // build postfix sum of column counts
    std::partial_sum(transposed_bsr_offsets, transposed_bsr_offsets + col_count + 1, transposed_bsr_offsets);
 }
 
-WP_API
-
-
+WP_API void bsr_matrix_from_triplets_float_host(int rows_per_block, int cols_per_block, int row_count, int nnz,
+                                                int* tpl_rows, int* tpl_columns, void* tpl_values,
+                                                bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
+                                                void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
-
-
-
-
-
+    bsr_matrix_from_triplets_host<float>(rows_per_block, cols_per_block, row_count, nnz, tpl_rows, tpl_columns,
+                                         static_cast<const float*>(tpl_values), prune_numerical_zeros, bsr_offsets,
+                                         bsr_columns, static_cast<float*>(bsr_values));
+    if (bsr_nnz)
+    {
+        *bsr_nnz = bsr_offsets[row_count];
+    }
 }
 
-WP_API
-
-
+WP_API void bsr_matrix_from_triplets_double_host(int rows_per_block, int cols_per_block, int row_count, int nnz,
+                                                 int* tpl_rows, int* tpl_columns, void* tpl_values,
+                                                 bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
+                                                 void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
-
-
-
-
-
+    bsr_matrix_from_triplets_host<double>(rows_per_block, cols_per_block, row_count, nnz, tpl_rows, tpl_columns,
+                                          static_cast<const double*>(tpl_values), prune_numerical_zeros, bsr_offsets,
+                                          bsr_columns, static_cast<double*>(bsr_values));
+    if (bsr_nnz)
+    {
+        *bsr_nnz = bsr_offsets[row_count];
+    }
 }
 
 WP_API void bsr_transpose_float_host(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz,
-
-
-                                     uint64_t transposed_bsr_values)
+                                     int* bsr_offsets, int* bsr_columns, void* bsr_values, int* transposed_bsr_offsets,
+                                     int* transposed_bsr_columns, void* transposed_bsr_values)
 {
-    bsr_transpose_host(rows_per_block, cols_per_block, row_count, col_count, nnz,
-
-
-                       reinterpret_cast<int *>(transposed_bsr_columns),
-                       reinterpret_cast<float *>(transposed_bsr_values));
+    bsr_transpose_host(rows_per_block, cols_per_block, row_count, col_count, nnz, bsr_offsets, bsr_columns,
+                       static_cast<const float*>(bsr_values), transposed_bsr_offsets, transposed_bsr_columns,
+                       static_cast<float*>(transposed_bsr_values));
 }
 
 WP_API void bsr_transpose_double_host(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz,
-
-
-                                      uint64_t transposed_bsr_values)
+                                      int* bsr_offsets, int* bsr_columns, void* bsr_values, int* transposed_bsr_offsets,
+                                      int* transposed_bsr_columns, void* transposed_bsr_values)
 {
-    bsr_transpose_host(rows_per_block, cols_per_block, row_count, col_count, nnz,
-
-
-                       reinterpret_cast<int *>(transposed_bsr_columns),
-                       reinterpret_cast<double *>(transposed_bsr_values));
+    bsr_transpose_host(rows_per_block, cols_per_block, row_count, col_count, nnz, bsr_offsets, bsr_columns,
+                       static_cast<const double*>(bsr_values), transposed_bsr_offsets, transposed_bsr_columns,
+                       static_cast<double*>(transposed_bsr_values));
 }
 
 #if !WP_ENABLE_CUDA
-WP_API
-
-
+WP_API void bsr_matrix_from_triplets_float_device(int rows_per_block, int cols_per_block, int row_count, int nnz,
+                                                  int* tpl_rows, int* tpl_columns, void* tpl_values,
+                                                  bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
+                                                  void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
-    return 0;
 }
 
-WP_API
-
-
+WP_API void bsr_matrix_from_triplets_double_device(int rows_per_block, int cols_per_block, int row_count, int nnz,
+                                                   int* tpl_rows, int* tpl_columns, void* tpl_values,
+                                                   bool prune_numerical_zeros, int* bsr_offsets, int* bsr_columns,
+                                                   void* bsr_values, int* bsr_nnz, void* bsr_nnz_event)
 {
-    return 0;
 }
 
 WP_API void bsr_transpose_float_device(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz,
-
-
-
+                                       int* bsr_offsets, int* bsr_columns, void* bsr_values,
+                                       int* transposed_bsr_offsets, int* transposed_bsr_columns,
+                                       void* transposed_bsr_values)
 {
 }
 
 WP_API void bsr_transpose_double_device(int rows_per_block, int cols_per_block, int row_count, int col_count, int nnz,
-
-
-
+                                        int* bsr_offsets, int* bsr_columns, void* bsr_values,
+                                        int* transposed_bsr_offsets, int* transposed_bsr_columns,
+                                        void* transposed_bsr_values)
 {
 }
 
-#endif
+#endif
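Taken together, the host changes replace the old uint64_t-typed pointer arguments with typed pointers, add a prune_numerical_zeros flag, and report the final block count through the new int* bsr_nnz out-parameter instead of a return value. For 1x1 blocks the triplet-to-BSR pipeline reduces to the sketch below: prune invalid or zero triplets, sort by (row, column), accumulate duplicates, and prefix-sum the per-row counts into offsets. This is an illustrative re-implementation under those assumptions, not the Warp API; the name csr_from_triplets is made up here.

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Illustrative CSR-from-triplets for 1x1 blocks (hypothetical helper, not the
// Warp export): prune, sort by (row, column), accumulate duplicates, prefix-sum.
static int csr_from_triplets(int row_count, const std::vector<int>& rows,
                             const std::vector<int>& cols, const std::vector<float>& vals,
                             bool prune_numerical_zeros, std::vector<int>& offsets,
                             std::vector<int>& columns, std::vector<float>& values)
{
    std::vector<int> idx(rows.size());
    std::iota(idx.begin(), idx.end(), 0);

    // drop invalid rows and, optionally, exact numerical zeros
    idx.erase(std::remove_if(idx.begin(), idx.end(),
                             [&](int i)
                             {
                                 return rows[i] < 0 || rows[i] >= row_count ||
                                        (prune_numerical_zeros && vals[i] == 0.0f);
                             }),
              idx.end());

    // lexicographic (row, column) order
    std::sort(idx.begin(), idx.end(), [&](int i, int j)
              { return rows[i] < rows[j] || (rows[i] == rows[j] && cols[i] < cols[j]); });

    offsets.assign(row_count + 1, 0);
    columns.clear();
    values.clear();
    int cur_row = -1, cur_col = -1;
    for (int i : idx)
    {
        if (rows[i] == cur_row && cols[i] == cur_col)
        {
            values.back() += vals[i]; // accumulate duplicate entries
        }
        else
        {
            cur_row = rows[i];
            cur_col = cols[i];
            columns.push_back(cur_col);
            values.push_back(vals[i]);
            ++offsets[cur_row + 1]; // count entries per row
        }
    }
    std::partial_sum(offsets.begin(), offsets.end(), offsets.begin());
    return offsets[row_count]; // what the new int* bsr_nnz out-parameter reports
}

int main()
{
    std::vector<int> rows{0, 0, 2, 0}, cols{1, 1, 0, 3};
    std::vector<float> vals{1.f, 2.f, 0.f, 4.f};
    std::vector<int> offsets, columns;
    std::vector<float> values;
    int nnz = csr_from_triplets(3, rows, cols, vals, true, offsets, columns, values);
    std::printf("nnz = %d\n", nnz); // 2: (0,1) accumulates to 3, the zero at (2,0) is pruned
    return 0;
}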