warp-lang 1.8.1-py3-none-macosx_10_13_universal2.whl → 1.9.1-py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +282 -103
- warp/__init__.pyi +1904 -114
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +93 -30
- warp/build_dll.py +331 -101
- warp/builtins.py +1244 -160
- warp/codegen.py +317 -206
- warp/config.py +1 -1
- warp/context.py +1465 -789
- warp/examples/core/example_marching_cubes.py +1 -0
- warp/examples/core/example_render_opengl.py +100 -3
- warp/examples/fem/example_apic_fluid.py +98 -52
- warp/examples/fem/example_convection_diffusion_dg.py +25 -4
- warp/examples/fem/example_diffusion_mgpu.py +8 -3
- warp/examples/fem/utils.py +68 -22
- warp/examples/interop/example_jax_kernel.py +2 -1
- warp/fabric.py +1 -1
- warp/fem/cache.py +27 -19
- warp/fem/domain.py +2 -2
- warp/fem/field/nodal_field.py +2 -2
- warp/fem/field/virtual.py +264 -166
- warp/fem/geometry/geometry.py +5 -5
- warp/fem/integrate.py +129 -51
- warp/fem/space/restriction.py +4 -0
- warp/fem/space/shape/tet_shape_function.py +3 -10
- warp/jax_experimental/custom_call.py +25 -2
- warp/jax_experimental/ffi.py +22 -1
- warp/jax_experimental/xla_ffi.py +16 -7
- warp/marching_cubes.py +708 -0
- warp/native/array.h +99 -4
- warp/native/builtin.h +86 -9
- warp/native/bvh.cpp +64 -28
- warp/native/bvh.cu +58 -58
- warp/native/bvh.h +2 -2
- warp/native/clang/clang.cpp +7 -7
- warp/native/coloring.cpp +8 -2
- warp/native/crt.cpp +2 -2
- warp/native/crt.h +3 -5
- warp/native/cuda_util.cpp +41 -10
- warp/native/cuda_util.h +10 -4
- warp/native/exports.h +1842 -1908
- warp/native/fabric.h +2 -1
- warp/native/hashgrid.cpp +37 -37
- warp/native/hashgrid.cu +2 -2
- warp/native/initializer_array.h +1 -1
- warp/native/intersect.h +2 -2
- warp/native/mat.h +1910 -116
- warp/native/mathdx.cpp +43 -43
- warp/native/mesh.cpp +24 -24
- warp/native/mesh.cu +26 -26
- warp/native/mesh.h +4 -2
- warp/native/nanovdb/GridHandle.h +179 -12
- warp/native/nanovdb/HostBuffer.h +8 -7
- warp/native/nanovdb/NanoVDB.h +517 -895
- warp/native/nanovdb/NodeManager.h +323 -0
- warp/native/nanovdb/PNanoVDB.h +2 -2
- warp/native/quat.h +331 -14
- warp/native/range.h +7 -1
- warp/native/reduce.cpp +10 -10
- warp/native/reduce.cu +13 -14
- warp/native/runlength_encode.cpp +2 -2
- warp/native/runlength_encode.cu +5 -5
- warp/native/scan.cpp +3 -3
- warp/native/scan.cu +4 -4
- warp/native/sort.cpp +10 -10
- warp/native/sort.cu +40 -31
- warp/native/sort.h +2 -0
- warp/native/sparse.cpp +8 -8
- warp/native/sparse.cu +13 -13
- warp/native/spatial.h +366 -17
- warp/native/temp_buffer.h +2 -2
- warp/native/tile.h +471 -82
- warp/native/vec.h +328 -14
- warp/native/volume.cpp +54 -54
- warp/native/volume.cu +1 -1
- warp/native/volume.h +2 -1
- warp/native/volume_builder.cu +30 -37
- warp/native/warp.cpp +150 -149
- warp/native/warp.cu +377 -216
- warp/native/warp.h +227 -226
- warp/optim/linear.py +736 -271
- warp/render/imgui_manager.py +289 -0
- warp/render/render_opengl.py +99 -18
- warp/render/render_usd.py +1 -0
- warp/sim/graph_coloring.py +2 -2
- warp/sparse.py +558 -175
- warp/tests/aux_test_module_aot.py +7 -0
- warp/tests/cuda/test_async.py +3 -3
- warp/tests/cuda/test_conditional_captures.py +101 -0
- warp/tests/geometry/test_hash_grid.py +38 -0
- warp/tests/geometry/test_marching_cubes.py +233 -12
- warp/tests/interop/test_jax.py +608 -28
- warp/tests/sim/test_coloring.py +6 -6
- warp/tests/test_array.py +58 -5
- warp/tests/test_codegen.py +4 -3
- warp/tests/test_context.py +8 -15
- warp/tests/test_enum.py +136 -0
- warp/tests/test_examples.py +2 -2
- warp/tests/test_fem.py +49 -6
- warp/tests/test_fixedarray.py +229 -0
- warp/tests/test_func.py +18 -15
- warp/tests/test_future_annotations.py +7 -5
- warp/tests/test_linear_solvers.py +30 -0
- warp/tests/test_map.py +15 -1
- warp/tests/test_mat.py +1518 -378
- warp/tests/test_mat_assign_copy.py +178 -0
- warp/tests/test_mat_constructors.py +574 -0
- warp/tests/test_module_aot.py +287 -0
- warp/tests/test_print.py +69 -0
- warp/tests/test_quat.py +140 -34
- warp/tests/test_quat_assign_copy.py +145 -0
- warp/tests/test_reload.py +2 -1
- warp/tests/test_sparse.py +71 -0
- warp/tests/test_spatial.py +140 -34
- warp/tests/test_spatial_assign_copy.py +160 -0
- warp/tests/test_struct.py +43 -3
- warp/tests/test_tuple.py +96 -0
- warp/tests/test_types.py +61 -20
- warp/tests/test_vec.py +179 -34
- warp/tests/test_vec_assign_copy.py +143 -0
- warp/tests/tile/test_tile.py +245 -18
- warp/tests/tile/test_tile_cholesky.py +605 -0
- warp/tests/tile/test_tile_load.py +169 -0
- warp/tests/tile/test_tile_mathdx.py +2 -558
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +1 -1
- warp/tests/tile/test_tile_shared_memory.py +5 -5
- warp/tests/unittest_suites.py +6 -0
- warp/tests/walkthrough_debug.py +1 -1
- warp/thirdparty/unittest_parallel.py +108 -9
- warp/types.py +571 -267
- warp/utils.py +68 -86
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
- warp/native/marching.cpp +0 -19
- warp/native/marching.cu +0 -514
- warp/native/marching.h +0 -19
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
warp/native/quat.h
CHANGED

```diff
@@ -459,14 +459,19 @@ inline CUDA_CALLABLE quat_t<Type> quat_from_matrix(const mat_t<Rows,Cols,Type>&
 template<typename Type>
 inline CUDA_CALLABLE Type extract(const quat_t<Type>& a, int idx)
 {
-#
-    if (idx <
+#ifndef NDEBUG
+    if (idx < -4 || idx >= 4)
     {
         printf("quat_t index %d out of bounds at %s %d", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     /*
     * Because quat data is not stored in an array, we index the quaternion by checking all possible idx values.
     * (&a.x)[idx] would be the preferred access strategy, but this results in undefined behavior in the clang compiler
@@ -478,17 +483,48 @@ inline CUDA_CALLABLE Type extract(const quat_t<Type>& a, int idx)
     else {return a.w;}
 }
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE vec_t<SliceLength, Type> extract(const quat_t<Type> & a, slice_t slice)
+{
+    vec_t<SliceLength, Type> ret;
+
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int idx = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        ret[idx] = a[i];
+        ++idx;
+    }
+
+    return ret;
+}
+
 template<typename Type>
 inline CUDA_CALLABLE Type* index(quat_t<Type>& q, int idx)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     return &q[idx];
 }
 
@@ -496,13 +532,18 @@ template<typename Type>
 inline CUDA_CALLABLE Type* indexref(quat_t<Type>* q, int idx)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat store %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     return &((*q)[idx]);
 }
 
@@ -526,120 +567,328 @@ template<typename Type>
 inline CUDA_CALLABLE void add_inplace(quat_t<Type>& q, int idx, Type value)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     q[idx] += value;
 }
 
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE void add_inplace(quat_t<Type>& q, slice_t slice, const vec_t<SliceLength, Type> &a)
+{
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        q[i] += a[ii];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<typename Type>
 inline CUDA_CALLABLE void adj_add_inplace(quat_t<Type>& q, int idx, Type value,
                                           quat_t<Type>& adj_q, int adj_idx, Type& adj_value)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     adj_value += adj_q[idx];
 }
 
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE void adj_add_inplace(
+    const quat_t<Type>& q, slice_t slice, const vec_t<SliceLength, Type> &a,
+    quat_t<Type>& adj_q, slice_t& adj_slice, vec_t<SliceLength, Type>& adj_a
+)
+{
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        adj_a[ii] += adj_q[i];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<typename Type>
 inline CUDA_CALLABLE void sub_inplace(quat_t<Type>& q, int idx, Type value)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     q[idx] -= value;
 }
 
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE void sub_inplace(quat_t<Type>& q, slice_t slice, const vec_t<SliceLength, Type> &a)
+{
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        q[i] -= a[ii];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<typename Type>
 inline CUDA_CALLABLE void adj_sub_inplace(quat_t<Type>& q, int idx, Type value,
                                           quat_t<Type>& adj_q, int adj_idx, Type& adj_value)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     adj_value -= adj_q[idx];
 }
 
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE void adj_sub_inplace(
+    const quat_t<Type>& q, slice_t slice, const vec_t<SliceLength, Type> &a,
+    quat_t<Type>& adj_q, slice_t& adj_slice, vec_t<SliceLength, Type>& adj_a
+)
+{
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        adj_a[ii] -= adj_q[i];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<typename Type>
 inline CUDA_CALLABLE void assign_inplace(quat_t<Type>& q, int idx, Type value)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     q[idx] = value;
 }
 
+
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE void assign_inplace(quat_t<Type>& q, slice_t slice, const vec_t<SliceLength, Type> &a)
+{
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        q[i] = a[ii];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<typename Type>
 inline CUDA_CALLABLE void adj_assign_inplace(quat_t<Type>& q, int idx, Type value, quat_t<Type>& adj_q, int& adj_idx, Type& adj_value)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     adj_value += adj_q[idx];
 }
 
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE void adj_assign_inplace(
+    const quat_t<Type>& q, slice_t slice, const vec_t<SliceLength, Type> &a,
+    quat_t<Type>& adj_q, slice_t& adj_slice, vec_t<SliceLength, Type>& adj_a
+)
+{
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        adj_a[ii] += adj_q[i];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
+
 template<typename Type>
 inline CUDA_CALLABLE quat_t<Type> assign_copy(quat_t<Type>& q, int idx, Type value)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     quat_t<Type> ret(q);
     ret[idx] = value;
     return ret;
 }
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE quat_t<Type> assign_copy(quat_t<Type>& q, slice_t slice, const vec_t<SliceLength, Type> &a)
+{
+    quat_t<Type> ret(q);
+    assign_inplace<SliceLength>(ret, slice, a);
+    return ret;
+}
+
 template<typename Type>
 inline CUDA_CALLABLE void adj_assign_copy(quat_t<Type>& q, int idx, Type value, quat_t<Type>& adj_q, int& adj_idx, Type& adj_value, const quat_t<Type>& adj_ret)
 {
 #ifndef NDEBUG
-    if (idx <
+    if (idx < -4 || idx >= 4)
     {
         printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     adj_value += adj_ret[idx];
     for(unsigned i=0; i < 4; ++i)
     {
@@ -648,6 +897,41 @@ inline CUDA_CALLABLE void adj_assign_copy(quat_t<Type>& q, int idx, Type value,
     }
 }
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE void adj_assign_copy(
+    quat_t<Type>& q, slice_t slice, const vec_t<SliceLength, Type> &a,
+    quat_t<Type>& adj_q, slice_t& adj_slice, vec_t<SliceLength, Type>& adj_a,
+    const quat_t<Type>& adj_ret
+)
+{
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (int i = 0; i < 4; ++i)
+    {
+        bool in_slice = is_reversed
+            ? (i <= slice.start && i > slice.stop && (slice.start - i) % (-slice.step) == 0)
+            : (i >= slice.start && i < slice.stop && (i - slice.start) % slice.step == 0);
+
+        if (!in_slice)
+        {
+            adj_q[i] += adj_ret[i];
+        }
+        else
+        {
+            adj_a[ii] += adj_ret[i];
+            ++ii;
+        }
+    }
+
+    assert(ii == SliceLength);
+}
+
 
 template<typename Type>
 CUDA_CALLABLE inline quat_t<Type> lerp(const quat_t<Type>& a, const quat_t<Type>& b, Type t)
@@ -666,14 +950,19 @@ CUDA_CALLABLE inline void adj_lerp(const quat_t<Type>& a, const quat_t<Type>& b,
 template<typename Type>
 inline CUDA_CALLABLE void adj_extract(const quat_t<Type>& a, int idx, quat_t<Type>& adj_a, int & adj_idx, Type & adj_ret)
 {
-#
-    if (idx <
+#ifndef NDEBUG
+    if (idx < -4 || idx >= 4)
     {
         printf("quat_t index %d out of bounds at %s %d", idx, __FILE__, __LINE__);
         assert(0);
     }
 #endif
 
+    if (idx < 0)
+    {
+        idx += 4;
+    }
+
     // See wp::extract(const quat_t<Type>& a, int idx) note
     if (idx == 0) {adj_a.x += adj_ret;}
     else if (idx == 1) {adj_a.y += adj_ret;}
@@ -681,6 +970,34 @@ inline CUDA_CALLABLE void adj_extract(const quat_t<Type>& a, int idx, quat_t<Typ
     else {adj_a.w += adj_ret;}
 }
 
+template<unsigned SliceLength, typename Type>
+inline CUDA_CALLABLE void adj_extract(
+    const quat_t<Type>& a, slice_t slice,
+    quat_t<Type>& adj_a, slice_t& adj_slice,
+    const vec_t<SliceLength, Type>& adj_ret
+)
+{
+    assert(slice.start >= 0 && slice.start <= 4);
+    assert(slice.stop >= -1 && slice.stop <= 4);
+    assert(slice.step != 0 && slice.step < 0 ? slice.start >= slice.stop : slice.start <= slice.stop);
+    assert(slice_get_length(slice) == SliceLength);
+
+    bool is_reversed = slice.step < 0;
+
+    int ii = 0;
+    for (
+        int i = slice.start;
+        is_reversed ? (i > slice.stop) : (i < slice.stop);
+        i += slice.step
+    )
+    {
+        adj_a[i] += adj_ret[ii];
+        ++ii;
+    }
+
+    assert(ii == SliceLength);
+}
+
 
 // backward methods
 template<typename Type>
```
warp/native/range.h
CHANGED

```diff
@@ -115,7 +115,13 @@ CUDA_CALLABLE inline range_t iter_reverse(const range_t& r)
     // generates a reverse range, equivalent to reversed(range())
     range_t rev;
 
-    if (r.step
+    if (r.step == 0)
+    {
+        // degenerate case where step == 0, return empty range
+        rev.start = r.start;
+        rev.end = r.start;
+    }
+    else if (r.step > 0)
     {
         rev.start = r.start + int((r.end - r.start - 1) / r.step) * r.step;
     }
```
warp/native/reduce.cpp
CHANGED

```diff
@@ -119,7 +119,7 @@ template <typename T> void array_sum_host(const T *ptr_a, T *ptr_out, int count,
         accumulate_func(ptr_a + i * stride, ptr_out, type_length);
     }
 }
 
-void array_inner_float_host(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
+void wp_array_inner_float_host(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
                             int type_length)
 {
     const float *ptr_a = (const float *)(a);
@@ -129,7 +129,7 @@ void array_inner_float_host(uint64_t a, uint64_t b, uint64_t out, int count, int
     array_inner_host(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_length);
 }
 
-void array_inner_double_host(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
+void wp_array_inner_double_host(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
                              int type_length)
 {
     const double *ptr_a = (const double *)(a);
@@ -139,14 +139,14 @@ void array_inner_double_host(uint64_t a, uint64_t b, uint64_t out, int count, in
     array_inner_host(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_length);
 }
 
-void array_sum_float_host(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
+void wp_array_sum_float_host(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
 {
     const float *ptr_a = (const float *)(a);
     float *ptr_out = (float *)(out);
     array_sum_host(ptr_a, ptr_out, count, byte_stride_a, type_length);
 }
 
-void array_sum_double_host(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
+void wp_array_sum_double_host(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
 {
     const double *ptr_a = (const double *)(a);
     double *ptr_out = (double *)(out);
@@ -154,21 +154,21 @@ void array_sum_double_host(uint64_t a, uint64_t out, int count, int byte_stride_
 }
 
 #if !WP_ENABLE_CUDA
-void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
-                              int type_length)
+void wp_array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
+                                 int type_length)
 {
 }
 
-void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
-                               int type_length)
+void wp_array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
+                                  int type_length)
 {
 }
 
-void array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
+void wp_array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
 {
 }
 
-void array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
+void wp_array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride_a, int type_length)
 {
 }
 #endif
```
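The reduce.cpp change (and reduce.cu below) is a rename of the exported reduction entry points to a wp_ prefix, matching the prefixed native helpers (wp_alloc_device, wp_cuda_stream_get_current, and so on) used throughout this release; the signatures and bodies are otherwise unchanged. Python callers go through the warp.utils helpers, which dispatch to these exports, so code like the following sketch (illustrative data) is unaffected by the rename:

```python
import numpy as np
import warp as wp

a = wp.array(np.arange(6, dtype=np.float32))  # [0, 1, 2, 3, 4, 5]
b = wp.array(np.ones(6, dtype=np.float32))

total = wp.utils.array_sum(a)       # reaches wp_array_sum_float_host / _device
inner = wp.utils.array_inner(a, b)  # reaches wp_array_inner_float_host / _device
print(total, inner)                 # 15.0 15.0
```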
warp/native/reduce.cu
CHANGED

```diff
@@ -22,7 +22,6 @@
 
 #define THRUST_IGNORE_CUB_VERSION_CHECK
 #include <cub/device/device_reduce.cuh>
-#include <cub/iterator/counting_input_iterator.cuh>
 
 namespace
 {
@@ -119,14 +118,14 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
     assert((byte_stride % sizeof(T)) == 0);
     const int stride = byte_stride / sizeof(T);
 
-    ContextGuard guard(cuda_context_get_current());
-    cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
+    ContextGuard guard(wp_cuda_context_get_current());
+    cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
 
     cub_strided_iterator<const T> ptr_strided{ptr_a, stride};
 
     size_t buff_size = 0;
     check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, ptr_strided, ptr_out, count, stream));
-    void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
+    void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
 
     for (int k = 0; k < type_length; ++k)
     {
@@ -134,7 +133,7 @@ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int coun
         check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, ptr_strided, ptr_out + k, count, stream));
     }
 
-    free_device(WP_CURRENT_CONTEXT, temp_buffer);
+    wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
 }
 
 template <typename T>
@@ -280,18 +279,18 @@ void array_inner_device(const ElemT *ptr_a, const ElemT *ptr_b, ScalarT *ptr_out
     const int stride_a = byte_stride_a / sizeof(ElemT);
     const int stride_b = byte_stride_b / sizeof(ElemT);
 
-    ContextGuard guard(cuda_context_get_current());
-    cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
+    ContextGuard guard(wp_cuda_context_get_current());
+    cudaStream_t stream = static_cast<cudaStream_t>(wp_cuda_stream_get_current());
 
     cub_inner_product_iterator<ElemT, ScalarT> inner_iterator{ptr_a, ptr_b, stride_a, stride_b, type_length};
 
     size_t buff_size = 0;
     check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, inner_iterator, ptr_out, count, stream));
-    void* temp_buffer = alloc_device(WP_CURRENT_CONTEXT, buff_size);
+    void* temp_buffer = wp_alloc_device(WP_CURRENT_CONTEXT, buff_size);
 
     check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, inner_iterator, ptr_out, count, stream));
 
-    free_device(WP_CURRENT_CONTEXT, temp_buffer);
+    wp_free_device(WP_CURRENT_CONTEXT, temp_buffer);
 }
 
 template <typename T>
@@ -327,10 +326,10 @@ void array_inner_device_dispatch(const T *ptr_a, const T *ptr_b, T *ptr_out, int
 
 } // anonymous namespace
 
-void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
+void wp_array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
                               int type_len)
 {
-    void *context = cuda_context_get_current();
+    void *context = wp_cuda_context_get_current();
 
     const float *ptr_a = (const float *)(a);
     const float *ptr_b = (const float *)(b);
@@ -339,7 +338,7 @@ void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, i
     array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
 }
 
-void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
+void wp_array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
                                int type_len)
 {
     const double *ptr_a = (const double *)(a);
@@ -349,14 +348,14 @@ void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count,
     array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
 }
 
-void array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
+void wp_array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
 {
     const float *ptr_a = (const float *)(a);
     float *ptr_out = (float *)(out);
     array_sum_device_dispatch(ptr_a, ptr_out, count, byte_stride, type_length);
 }
 
-void array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
+void wp_array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
 {
     const double *ptr_a = (const double *)(a);
     double *ptr_out = (double *)(out);
```
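On the CUDA side, array_sum_device keeps its two-phase CUB pattern (a sizing call to cub::DeviceReduce::Sum with a null buffer, then the real reduction), with the scratch buffer now obtained through wp_alloc_device and released with wp_free_device; the loop over type_length runs one reduction per component of the element type. That per-component loop is what makes a sum over a vector-typed array come back component-wise, as in this hedged sketch on a CUDA device (return formatting assumed from the type_length dispatch):

```python
import warp as wp

if wp.get_cuda_device_count() > 0:
    v = wp.array([wp.vec3f(1.0, 2.0, 3.0)] * 4, dtype=wp.vec3f, device="cuda:0")
    s = wp.utils.array_sum(v)  # one cub::DeviceReduce::Sum per component
    print(s)                   # component-wise total: (4, 8, 12)
```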