warp-lang 0.9.0-py3-none-win_amd64.whl → 0.11.0-py3-none-win_amd64.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (315)
  1. warp/__init__.py +15 -7
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +22 -443
  6. warp/build_dll.py +384 -0
  7. warp/builtins.py +998 -488
  8. warp/codegen.py +1307 -739
  9. warp/config.py +5 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +1291 -548
  12. warp/dlpack.py +31 -31
  13. warp/fabric.py +326 -0
  14. warp/fem/__init__.py +27 -0
  15. warp/fem/cache.py +389 -0
  16. warp/fem/dirichlet.py +181 -0
  17. warp/fem/domain.py +263 -0
  18. warp/fem/field/__init__.py +101 -0
  19. warp/fem/field/field.py +149 -0
  20. warp/fem/field/nodal_field.py +299 -0
  21. warp/fem/field/restriction.py +21 -0
  22. warp/fem/field/test.py +181 -0
  23. warp/fem/field/trial.py +183 -0
  24. warp/fem/geometry/__init__.py +19 -0
  25. warp/fem/geometry/closest_point.py +70 -0
  26. warp/fem/geometry/deformed_geometry.py +271 -0
  27. warp/fem/geometry/element.py +744 -0
  28. warp/fem/geometry/geometry.py +186 -0
  29. warp/fem/geometry/grid_2d.py +373 -0
  30. warp/fem/geometry/grid_3d.py +435 -0
  31. warp/fem/geometry/hexmesh.py +953 -0
  32. warp/fem/geometry/partition.py +376 -0
  33. warp/fem/geometry/quadmesh_2d.py +532 -0
  34. warp/fem/geometry/tetmesh.py +840 -0
  35. warp/fem/geometry/trimesh_2d.py +577 -0
  36. warp/fem/integrate.py +1616 -0
  37. warp/fem/operator.py +191 -0
  38. warp/fem/polynomial.py +213 -0
  39. warp/fem/quadrature/__init__.py +2 -0
  40. warp/fem/quadrature/pic_quadrature.py +245 -0
  41. warp/fem/quadrature/quadrature.py +294 -0
  42. warp/fem/space/__init__.py +292 -0
  43. warp/fem/space/basis_space.py +489 -0
  44. warp/fem/space/collocated_function_space.py +105 -0
  45. warp/fem/space/dof_mapper.py +236 -0
  46. warp/fem/space/function_space.py +145 -0
  47. warp/fem/space/grid_2d_function_space.py +267 -0
  48. warp/fem/space/grid_3d_function_space.py +306 -0
  49. warp/fem/space/hexmesh_function_space.py +352 -0
  50. warp/fem/space/partition.py +350 -0
  51. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  52. warp/fem/space/restriction.py +160 -0
  53. warp/fem/space/shape/__init__.py +15 -0
  54. warp/fem/space/shape/cube_shape_function.py +738 -0
  55. warp/fem/space/shape/shape_function.py +103 -0
  56. warp/fem/space/shape/square_shape_function.py +611 -0
  57. warp/fem/space/shape/tet_shape_function.py +567 -0
  58. warp/fem/space/shape/triangle_shape_function.py +429 -0
  59. warp/fem/space/tetmesh_function_space.py +292 -0
  60. warp/fem/space/topology.py +295 -0
  61. warp/fem/space/trimesh_2d_function_space.py +221 -0
  62. warp/fem/types.py +77 -0
  63. warp/fem/utils.py +495 -0
  64. warp/native/array.h +164 -55
  65. warp/native/builtin.h +150 -174
  66. warp/native/bvh.cpp +75 -328
  67. warp/native/bvh.cu +406 -23
  68. warp/native/bvh.h +37 -45
  69. warp/native/clang/clang.cpp +136 -24
  70. warp/native/crt.cpp +1 -76
  71. warp/native/crt.h +111 -104
  72. warp/native/cuda_crt.h +1049 -0
  73. warp/native/cuda_util.cpp +15 -3
  74. warp/native/cuda_util.h +3 -1
  75. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  76. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  77. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  78. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  79. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  80. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  133. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  134. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  135. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  136. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  137. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  138. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  139. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  140. warp/native/cutlass_gemm.cu +5 -3
  141. warp/native/exports.h +1240 -949
  142. warp/native/fabric.h +228 -0
  143. warp/native/hashgrid.cpp +4 -4
  144. warp/native/hashgrid.h +22 -2
  145. warp/native/initializer_array.h +2 -2
  146. warp/native/intersect.h +22 -7
  147. warp/native/intersect_adj.h +8 -8
  148. warp/native/intersect_tri.h +13 -16
  149. warp/native/marching.cu +157 -161
  150. warp/native/mat.h +119 -19
  151. warp/native/matnn.h +2 -2
  152. warp/native/mesh.cpp +108 -83
  153. warp/native/mesh.cu +243 -6
  154. warp/native/mesh.h +1547 -458
  155. warp/native/nanovdb/NanoVDB.h +1 -1
  156. warp/native/noise.h +272 -329
  157. warp/native/quat.h +51 -8
  158. warp/native/rand.h +45 -35
  159. warp/native/range.h +6 -2
  160. warp/native/reduce.cpp +157 -0
  161. warp/native/reduce.cu +348 -0
  162. warp/native/runlength_encode.cpp +62 -0
  163. warp/native/runlength_encode.cu +46 -0
  164. warp/native/scan.cu +11 -13
  165. warp/native/scan.h +1 -0
  166. warp/native/solid_angle.h +442 -0
  167. warp/native/sort.cpp +13 -0
  168. warp/native/sort.cu +9 -1
  169. warp/native/sparse.cpp +338 -0
  170. warp/native/sparse.cu +545 -0
  171. warp/native/spatial.h +2 -2
  172. warp/native/temp_buffer.h +30 -0
  173. warp/native/vec.h +126 -24
  174. warp/native/volume.h +120 -0
  175. warp/native/warp.cpp +658 -53
  176. warp/native/warp.cu +660 -68
  177. warp/native/warp.h +112 -12
  178. warp/optim/__init__.py +1 -0
  179. warp/optim/linear.py +922 -0
  180. warp/optim/sgd.py +92 -0
  181. warp/render/render_opengl.py +392 -152
  182. warp/render/render_usd.py +11 -11
  183. warp/sim/__init__.py +2 -2
  184. warp/sim/articulation.py +385 -185
  185. warp/sim/collide.py +21 -8
  186. warp/sim/import_mjcf.py +297 -106
  187. warp/sim/import_urdf.py +389 -210
  188. warp/sim/import_usd.py +198 -97
  189. warp/sim/inertia.py +17 -18
  190. warp/sim/integrator_euler.py +14 -8
  191. warp/sim/integrator_xpbd.py +161 -19
  192. warp/sim/model.py +795 -291
  193. warp/sim/optimizer.py +2 -6
  194. warp/sim/render.py +65 -3
  195. warp/sim/utils.py +3 -0
  196. warp/sparse.py +1227 -0
  197. warp/stubs.py +665 -223
  198. warp/tape.py +66 -15
  199. warp/tests/__main__.py +3 -6
  200. warp/tests/assets/curlnoise_golden.npy +0 -0
  201. warp/tests/assets/pnoise_golden.npy +0 -0
  202. warp/tests/assets/torus.usda +105 -105
  203. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  204. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  205. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  206. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  207. warp/tests/aux_test_unresolved_func.py +14 -0
  208. warp/tests/aux_test_unresolved_symbol.py +14 -0
  209. warp/tests/disabled_kinematics.py +239 -0
  210. warp/tests/run_coverage_serial.py +31 -0
  211. warp/tests/test_adam.py +103 -106
  212. warp/tests/test_arithmetic.py +128 -74
  213. warp/tests/test_array.py +1497 -211
  214. warp/tests/test_array_reduce.py +150 -0
  215. warp/tests/test_atomic.py +64 -28
  216. warp/tests/test_bool.py +99 -0
  217. warp/tests/test_builtins_resolution.py +1292 -0
  218. warp/tests/test_bvh.py +75 -43
  219. warp/tests/test_closest_point_edge_edge.py +54 -57
  220. warp/tests/test_codegen.py +233 -128
  221. warp/tests/test_compile_consts.py +28 -20
  222. warp/tests/test_conditional.py +108 -24
  223. warp/tests/test_copy.py +10 -12
  224. warp/tests/test_ctypes.py +112 -88
  225. warp/tests/test_dense.py +21 -14
  226. warp/tests/test_devices.py +98 -0
  227. warp/tests/test_dlpack.py +136 -108
  228. warp/tests/test_examples.py +277 -0
  229. warp/tests/test_fabricarray.py +955 -0
  230. warp/tests/test_fast_math.py +15 -11
  231. warp/tests/test_fem.py +1271 -0
  232. warp/tests/test_fp16.py +53 -19
  233. warp/tests/test_func.py +187 -74
  234. warp/tests/test_generics.py +194 -49
  235. warp/tests/test_grad.py +180 -116
  236. warp/tests/test_grad_customs.py +176 -0
  237. warp/tests/test_hash_grid.py +52 -37
  238. warp/tests/test_import.py +10 -23
  239. warp/tests/test_indexedarray.py +577 -24
  240. warp/tests/test_intersect.py +18 -9
  241. warp/tests/test_large.py +141 -0
  242. warp/tests/test_launch.py +251 -15
  243. warp/tests/test_lerp.py +64 -65
  244. warp/tests/test_linear_solvers.py +154 -0
  245. warp/tests/test_lvalue.py +493 -0
  246. warp/tests/test_marching_cubes.py +12 -13
  247. warp/tests/test_mat.py +508 -2778
  248. warp/tests/test_mat_lite.py +115 -0
  249. warp/tests/test_mat_scalar_ops.py +2889 -0
  250. warp/tests/test_math.py +103 -9
  251. warp/tests/test_matmul.py +305 -69
  252. warp/tests/test_matmul_lite.py +410 -0
  253. warp/tests/test_mesh.py +71 -14
  254. warp/tests/test_mesh_query_aabb.py +41 -25
  255. warp/tests/test_mesh_query_point.py +325 -34
  256. warp/tests/test_mesh_query_ray.py +39 -22
  257. warp/tests/test_mlp.py +30 -22
  258. warp/tests/test_model.py +92 -89
  259. warp/tests/test_modules_lite.py +39 -0
  260. warp/tests/test_multigpu.py +88 -114
  261. warp/tests/test_noise.py +12 -11
  262. warp/tests/test_operators.py +16 -20
  263. warp/tests/test_options.py +11 -11
  264. warp/tests/test_pinned.py +17 -18
  265. warp/tests/test_print.py +32 -11
  266. warp/tests/test_quat.py +275 -129
  267. warp/tests/test_rand.py +18 -16
  268. warp/tests/test_reload.py +38 -34
  269. warp/tests/test_rounding.py +50 -43
  270. warp/tests/test_runlength_encode.py +190 -0
  271. warp/tests/test_smoothstep.py +9 -11
  272. warp/tests/test_snippet.py +143 -0
  273. warp/tests/test_sparse.py +460 -0
  274. warp/tests/test_spatial.py +276 -243
  275. warp/tests/test_streams.py +110 -85
  276. warp/tests/test_struct.py +331 -85
  277. warp/tests/test_tape.py +39 -21
  278. warp/tests/test_torch.py +118 -89
  279. warp/tests/test_transient_module.py +12 -13
  280. warp/tests/test_types.py +614 -0
  281. warp/tests/test_utils.py +494 -0
  282. warp/tests/test_vec.py +354 -1987
  283. warp/tests/test_vec_lite.py +73 -0
  284. warp/tests/test_vec_scalar_ops.py +2099 -0
  285. warp/tests/test_volume.py +457 -293
  286. warp/tests/test_volume_write.py +124 -134
  287. warp/tests/unittest_serial.py +35 -0
  288. warp/tests/unittest_suites.py +341 -0
  289. warp/tests/unittest_utils.py +568 -0
  290. warp/tests/unused_test_misc.py +71 -0
  291. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  292. warp/thirdparty/appdirs.py +36 -45
  293. warp/thirdparty/unittest_parallel.py +549 -0
  294. warp/torch.py +72 -30
  295. warp/types.py +1744 -713
  296. warp/utils.py +360 -350
  297. warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
  298. warp_lang-0.11.0.dist-info/METADATA +238 -0
  299. warp_lang-0.11.0.dist-info/RECORD +332 -0
  300. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  301. warp/bin/warp-clang.exp +0 -0
  302. warp/bin/warp-clang.lib +0 -0
  303. warp/bin/warp.exp +0 -0
  304. warp/bin/warp.lib +0 -0
  305. warp/tests/test_all.py +0 -215
  306. warp/tests/test_array_scan.py +0 -60
  307. warp/tests/test_base.py +0 -208
  308. warp/tests/test_unresolved_func.py +0 -7
  309. warp/tests/test_unresolved_symbol.py +0 -7
  310. warp_lang-0.9.0.dist-info/METADATA +0 -20
  311. warp_lang-0.9.0.dist-info/RECORD +0 -177
  312. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  313. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  314. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  315. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
warp/native/reduce.cu ADDED
@@ -0,0 +1,348 @@
+
+ #include "cuda_util.h"
+ #include "warp.h"
+
+ #include "temp_buffer.h"
+
+ #define THRUST_IGNORE_CUB_VERSION_CHECK
+ #include <cub/device/device_reduce.cuh>
+ #include <cub/iterator/counting_input_iterator.cuh>
+
+ namespace
+ {
+
+ template <typename T>
+ __global__ void cwise_mult_kernel(int len, int stride_a, int stride_b, const T *a, const T *b, T *out)
+ {
+     int i = blockIdx.x * blockDim.x + threadIdx.x;
+     if (i >= len)
+         return;
+     out[i] = a[i * stride_a] * b[i * stride_b];
+ }
+
+ /// Custom iterator for allowing strided access with CUB
+ template <typename T> struct cub_strided_iterator
+ {
+     typedef cub_strided_iterator<T> self_type;
+     typedef std::ptrdiff_t difference_type;
+     typedef T value_type;
+     typedef T *pointer;
+     typedef T &reference;
+
+     typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
+
+     T *ptr = nullptr;
+     int stride = 1;
+
+     CUDA_CALLABLE self_type operator++(int)
+     {
+         return ++(self_type(*this));
+     }
+
+     CUDA_CALLABLE self_type &operator++()
+     {
+         ptr += stride;
+         return *this;
+     }
+
+     __host__ __device__ __forceinline__ reference operator*() const
+     {
+         return *ptr;
+     }
+
+     CUDA_CALLABLE self_type operator+(difference_type n) const
+     {
+         return self_type(*this) += n;
+     }
+
+     CUDA_CALLABLE self_type &operator+=(difference_type n)
+     {
+         ptr += n * stride;
+         return *this;
+     }
+
+     CUDA_CALLABLE self_type operator-(difference_type n) const
+     {
+         return self_type(*this) -= n;
+     }
+
+     CUDA_CALLABLE self_type &operator-=(difference_type n)
+     {
+         ptr -= n * stride;
+         return *this;
+     }
+
+     CUDA_CALLABLE difference_type operator-(const self_type &other) const
+     {
+         return (ptr - other.ptr) / stride;
+     }
+
+     CUDA_CALLABLE reference operator[](difference_type n) const
+     {
+         return *(ptr + n * stride);
+     }
+
+     CUDA_CALLABLE pointer operator->() const
+     {
+         return ptr;
+     }
+
+     CUDA_CALLABLE bool operator==(const self_type &rhs) const
+     {
+         return (ptr == rhs.ptr);
+     }
+
+     CUDA_CALLABLE bool operator!=(const self_type &rhs) const
+     {
+         return (ptr != rhs.ptr);
+     }
+ };
+
+ template <typename T> void array_sum_device(const T *ptr_a, T *ptr_out, int count, int byte_stride, int type_length)
+ {
+     assert((byte_stride % sizeof(T)) == 0);
+     const int stride = byte_stride / sizeof(T);
+
+     ContextGuard guard(cuda_context_get_current());
+     cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
+
+     cub_strided_iterator<const T> ptr_strided{ptr_a, stride};
+
+     size_t buff_size = 0;
+     check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, ptr_strided, ptr_out, count, stream));
+     void* temp_buffer = alloc_temp_device(WP_CURRENT_CONTEXT, buff_size);
+
+     for (int k = 0; k < type_length; ++k)
+     {
+         cub_strided_iterator<const T> ptr_strided{ptr_a + k, stride};
+         check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, ptr_strided, ptr_out + k, count, stream));
+     }
+
+     free_temp_device(WP_CURRENT_CONTEXT, temp_buffer);
+ }
+
+ template <typename T>
+ void array_sum_device_dispatch(const T *ptr_a, T *ptr_out, int count, int byte_stride, int type_length)
+ {
+     using vec2 = wp::vec_t<2, T>;
+     using vec3 = wp::vec_t<3, T>;
+     using vec4 = wp::vec_t<4, T>;
+
+     // specialized calls for common vector types
+
+     if ((type_length % 4) == 0 && (byte_stride % sizeof(vec4)) == 0)
+     {
+         return array_sum_device(reinterpret_cast<const vec4 *>(ptr_a), reinterpret_cast<vec4 *>(ptr_out), count,
+                                 byte_stride, type_length / 4);
+     }
+
+     if ((type_length % 3) == 0 && (byte_stride % sizeof(vec3)) == 0)
+     {
+         return array_sum_device(reinterpret_cast<const vec3 *>(ptr_a), reinterpret_cast<vec3 *>(ptr_out), count,
+                                 byte_stride, type_length / 3);
+     }
+
+     if ((type_length % 2) == 0 && (byte_stride % sizeof(vec2)) == 0)
+     {
+         return array_sum_device(reinterpret_cast<const vec2 *>(ptr_a), reinterpret_cast<vec2 *>(ptr_out), count,
+                                 byte_stride, type_length / 2);
+     }
+
+     return array_sum_device(ptr_a, ptr_out, count, byte_stride, type_length);
+ }
+
+ template <typename T> CUDA_CALLABLE T element_inner_product(const T &a, const T &b)
+ {
+     return a * b;
+ }
+
+ template <unsigned Length, typename T>
+ CUDA_CALLABLE T element_inner_product(const wp::vec_t<Length, T> &a, const wp::vec_t<Length, T> &b)
+ {
+     return wp::dot(a, b);
+ }
+
+ /// Custom iterator for allowing strided access with CUB
+ template <typename ElemT, typename ScalarT> struct cub_inner_product_iterator
+ {
+     typedef cub_inner_product_iterator<ElemT, ScalarT> self_type;
+     typedef std::ptrdiff_t difference_type;
+     typedef ScalarT value_type;
+     typedef ScalarT *pointer;
+     typedef ScalarT reference;
+
+     typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
+
+     const ElemT *ptr_a = nullptr;
+     const ElemT *ptr_b = nullptr;
+
+     int stride_a = 1;
+     int stride_b = 1;
+     int type_length = 1;
+
+     CUDA_CALLABLE self_type operator++(int)
+     {
+         return ++(self_type(*this));
+     }
+
+     CUDA_CALLABLE self_type &operator++()
+     {
+         ptr_a += stride_a;
+         ptr_b += stride_b;
+         return *this;
+     }
+
+     __host__ __device__ __forceinline__ reference operator*() const
+     {
+         return compute_value(0);
+     }
+
+     CUDA_CALLABLE self_type operator+(difference_type n) const
+     {
+         return self_type(*this) += n;
+     }
+
+     CUDA_CALLABLE self_type &operator+=(difference_type n)
+     {
+         ptr_a += n * stride_a;
+         ptr_b += n * stride_b;
+         return *this;
+     }
+
+     CUDA_CALLABLE self_type operator-(difference_type n) const
+     {
+         return self_type(*this) -= n;
+     }
+
+     CUDA_CALLABLE self_type &operator-=(difference_type n)
+     {
+         ptr_a -= n * stride_a;
+         ptr_b -= n * stride_b;
+         return *this;
+     }
+
+     CUDA_CALLABLE difference_type operator-(const self_type &other) const
+     {
+         return (ptr_a - other.ptr_a) / stride_a;
+     }
+
+     CUDA_CALLABLE reference operator[](difference_type n) const
+     {
+         return compute_value(n);
+     }
+
+     CUDA_CALLABLE bool operator==(const self_type &rhs) const
+     {
+         return (ptr_a == rhs.ptr_a);
+     }
+
+     CUDA_CALLABLE bool operator!=(const self_type &rhs) const
+     {
+         return (ptr_a != rhs.ptr_a);
+     }
+
+ private:
+     CUDA_CALLABLE ScalarT compute_value(difference_type n) const
+     {
+         ScalarT val(0);
+         const ElemT *a = ptr_a + n * stride_a;
+         const ElemT *b = ptr_b + n * stride_b;
+         for (int k = 0; k < type_length; ++k)
+         {
+             val += element_inner_product(a[k], b[k]);
+         }
+         return val;
+     }
+ };
+
+ template <typename ElemT, typename ScalarT>
+ void array_inner_device(const ElemT *ptr_a, const ElemT *ptr_b, ScalarT *ptr_out, int count, int byte_stride_a,
+                         int byte_stride_b, int type_length)
+ {
+     assert((byte_stride_a % sizeof(ElemT)) == 0);
+     assert((byte_stride_b % sizeof(ElemT)) == 0);
+     const int stride_a = byte_stride_a / sizeof(ElemT);
+     const int stride_b = byte_stride_b / sizeof(ElemT);
+
+     ContextGuard guard(cuda_context_get_current());
+     cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
+
+     cub_inner_product_iterator<ElemT, ScalarT> inner_iterator{ptr_a, ptr_b, stride_a, stride_b, type_length};
+
+     size_t buff_size = 0;
+     check_cuda(cub::DeviceReduce::Sum(nullptr, buff_size, inner_iterator, ptr_out, count, stream));
+     void* temp_buffer = alloc_temp_device(WP_CURRENT_CONTEXT, buff_size);
+
+     check_cuda(cub::DeviceReduce::Sum(temp_buffer, buff_size, inner_iterator, ptr_out, count, stream));
+
+     free_temp_device(WP_CURRENT_CONTEXT, temp_buffer);
+ }
+
+ template <typename T>
+ void array_inner_device_dispatch(const T *ptr_a, const T *ptr_b, T *ptr_out, int count, int byte_stride_a,
+                                  int byte_stride_b, int type_length)
+ {
+     using vec2 = wp::vec_t<2, T>;
+     using vec3 = wp::vec_t<3, T>;
+     using vec4 = wp::vec_t<4, T>;
+
+     // specialized calls for common vector types
+
+     if ((type_length % 4) == 0 && (byte_stride_a % sizeof(vec4)) == 0 && (byte_stride_b % sizeof(vec4)) == 0)
+     {
+         return array_inner_device(reinterpret_cast<const vec4 *>(ptr_a), reinterpret_cast<const vec4 *>(ptr_b), ptr_out,
+                                   count, byte_stride_a, byte_stride_b, type_length / 4);
+     }
+
+     if ((type_length % 3) == 0 && (byte_stride_a % sizeof(vec3)) == 0 && (byte_stride_b % sizeof(vec3)) == 0)
+     {
+         return array_inner_device(reinterpret_cast<const vec3 *>(ptr_a), reinterpret_cast<const vec3 *>(ptr_b), ptr_out,
+                                   count, byte_stride_a, byte_stride_b, type_length / 3);
+     }
+
+     if ((type_length % 2) == 0 && (byte_stride_a % sizeof(vec2)) == 0 && (byte_stride_b % sizeof(vec2)) == 0)
+     {
+         return array_inner_device(reinterpret_cast<const vec2 *>(ptr_a), reinterpret_cast<const vec2 *>(ptr_b), ptr_out,
+                                   count, byte_stride_a, byte_stride_b, type_length / 2);
+     }
+
+     return array_inner_device(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_length);
+ }
+
+ } // anonymous namespace
+
+ void array_inner_float_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
+                               int type_len)
+ {
+     void *context = cuda_context_get_current();
+
+     const float *ptr_a = (const float *)(a);
+     const float *ptr_b = (const float *)(b);
+     float *ptr_out = (float *)(out);
+
+     array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
+ }
+
+ void array_inner_double_device(uint64_t a, uint64_t b, uint64_t out, int count, int byte_stride_a, int byte_stride_b,
+                                int type_len)
+ {
+     const double *ptr_a = (const double *)(a);
+     const double *ptr_b = (const double *)(b);
+     double *ptr_out = (double *)(out);
+
+     array_inner_device_dispatch(ptr_a, ptr_b, ptr_out, count, byte_stride_a, byte_stride_b, type_len);
+ }
+
+ void array_sum_float_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
+ {
+     const float *ptr_a = (const float *)(a);
+     float *ptr_out = (float *)(out);
+     array_sum_device_dispatch(ptr_a, ptr_out, count, byte_stride, type_length);
+ }
+
+ void array_sum_double_device(uint64_t a, uint64_t out, int count, int byte_stride, int type_length)
+ {
+     const double *ptr_a = (const double *)(a);
+     double *ptr_out = (double *)(out);
+     array_sum_device_dispatch(ptr_a, ptr_out, count, byte_stride, type_length);
+ }
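
These CUDA entry points back the array reductions exposed in warp/utils.py, also new in this release (see test_array_reduce.py in the file list). A minimal usage sketch from Python — the wrapper names array_sum and array_inner and their signatures are inferred from the file list, not shown in this diff, so treat them as assumptions:

    # Hedged sketch: array_sum/array_inner are assumed from warp/utils.py
    # and test_array_reduce.py; exact signatures are not shown in this diff.
    import numpy as np
    import warp as wp
    from warp.utils import array_sum, array_inner

    wp.init()

    a = wp.array(np.arange(16, dtype=np.float32), device="cuda:0")
    b = wp.array(np.ones(16, dtype=np.float32), device="cuda:0")

    total = array_sum(a)       # dispatches to array_sum_float_device above
    inner = array_inner(a, b)  # dispatches to array_inner_float_device above

Note the dispatch helpers: when the element count and byte stride are divisible by a vec2/vec3/vec4 size, the data is reinterpreted as Warp vector types (whose addition is elementwise), cutting the number of per-component cub::DeviceReduce::Sum passes.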
warp/native/runlength_encode.cpp ADDED
@@ -0,0 +1,62 @@
+ #include "warp.h"
+
+ #include <cstdint>
+
+ template <typename T>
+ void runlength_encode_host(int n,
+                            const T *values,
+                            T *run_values,
+                            int *run_lengths,
+                            int *run_count)
+ {
+     if (n == 0)
+     {
+         *run_count = 0;
+         return;
+     }
+
+     const T *end = values + n;
+
+     *run_count = 1;
+     *run_lengths = 1;
+     *run_values = *values;
+
+     while (++values != end)
+     {
+         if (*values == *run_values)
+         {
+             ++*run_lengths;
+         }
+         else
+         {
+             ++*run_count;
+             *(++run_lengths) = 1;
+             *(++run_values) = *values;
+         }
+     }
+ }
+
+ void runlength_encode_int_host(
+     uint64_t values,
+     uint64_t run_values,
+     uint64_t run_lengths,
+     uint64_t run_count,
+     int n)
+ {
+     runlength_encode_host<int>(n,
+                                reinterpret_cast<const int *>(values),
+                                reinterpret_cast<int *>(run_values),
+                                reinterpret_cast<int *>(run_lengths),
+                                reinterpret_cast<int *>(run_count));
+ }
+
+ #if !WP_ENABLE_CUDA
+ void runlength_encode_int_device(
+     uint64_t values,
+     uint64_t run_values,
+     uint64_t run_lengths,
+     uint64_t run_count,
+     int n)
+ {
+ }
+ #endif
warp/native/runlength_encode.cu ADDED
@@ -0,0 +1,46 @@
+
+
+ #include "warp.h"
+ #include "cuda_util.h"
+
+ #define THRUST_IGNORE_CUB_VERSION_CHECK
+ #include <cub/device/device_run_length_encode.cuh>
+
+ template <typename T>
+ void runlength_encode_device(int n,
+                              const T *values,
+                              T *run_values,
+                              int *run_lengths,
+                              int *run_count)
+ {
+     ContextGuard guard(cuda_context_get_current());
+     cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
+
+     size_t buff_size = 0;
+     check_cuda(cub::DeviceRunLengthEncode::Encode(
+         nullptr, buff_size, values, run_values, run_lengths, run_count,
+         n, stream));
+
+     void* temp_buffer = alloc_temp_device(WP_CURRENT_CONTEXT, buff_size);
+
+     check_cuda(cub::DeviceRunLengthEncode::Encode(
+         temp_buffer, buff_size, values, run_values, run_lengths, run_count,
+         n, stream));
+
+     free_temp_device(WP_CURRENT_CONTEXT, temp_buffer);
+ }
+
+ void runlength_encode_int_device(
+     uint64_t values,
+     uint64_t run_values,
+     uint64_t run_lengths,
+     uint64_t run_count,
+     int n)
+ {
+     return runlength_encode_device<int>(
+         n,
+         reinterpret_cast<const int *>(values),
+         reinterpret_cast<int *>(run_values),
+         reinterpret_cast<int *>(run_lengths),
+         reinterpret_cast<int *>(run_count));
+ }
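
The host and device variants share one Python entry point, exercised by the new test_runlength_encode.py. A minimal sketch, assuming a warp.utils.runlength_encode(values, run_values, run_lengths) helper that returns the number of runs — the Python-side signature is an assumption, as this diff only shows the native code:

    # Hedged sketch: runlength_encode is assumed from test_runlength_encode.py.
    # Output arrays must hold one entry per run (worst case: n runs).
    import numpy as np
    import warp as wp
    from warp.utils import runlength_encode

    wp.init()

    values = wp.array(np.array([1, 1, 2, 2, 2, 3], dtype=np.int32))
    run_values = wp.empty(values.shape[0], dtype=wp.int32)
    run_lengths = wp.empty(values.shape[0], dtype=wp.int32)

    run_count = runlength_encode(values, run_values, run_lengths)
    # expected: run_count == 3,
    #           run_values[:3]  == [1, 2, 3],
    #           run_lengths[:3] == [2, 3, 1]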
warp/native/scan.cu CHANGED
@@ -3,35 +3,33 @@
  
  #define THRUST_IGNORE_CUB_VERSION_CHECK
  
- #include <cub/cub.cuh>
+ #include <cub/device/device_scan.cuh>
  
  template<typename T>
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive)
  {
-     static void* scan_temp_memory = NULL;
-     static size_t scan_temp_max_size = 0;
+     ContextGuard guard(cuda_context_get_current());
+
+     cudaStream_t stream = static_cast<cudaStream_t>(cuda_stream_get_current());
  
      // compute temporary memory required
      size_t scan_temp_size;
      if (inclusive) {
-         cub::DeviceScan::InclusiveSum(NULL, scan_temp_size, values_in, values_out, n);
+         check_cuda(cub::DeviceScan::InclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
      } else {
-         cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n);
+         check_cuda(cub::DeviceScan::ExclusiveSum(NULL, scan_temp_size, values_in, values_out, n));
      }
  
-     if (scan_temp_size > scan_temp_max_size)
-     {
-         free_device(WP_CURRENT_CONTEXT, scan_temp_memory);
-         scan_temp_memory = alloc_device(WP_CURRENT_CONTEXT, scan_temp_size);
-         scan_temp_max_size = scan_temp_size;
-     }
+     void* temp_buffer = alloc_temp_device(WP_CURRENT_CONTEXT, scan_temp_size);
  
      // scan
      if (inclusive) {
-         cub::DeviceScan::InclusiveSum(scan_temp_memory, scan_temp_size, values_in, values_out, n, (cudaStream_t)cuda_stream_get_current());
+         check_cuda(cub::DeviceScan::InclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
      } else {
-         cub::DeviceScan::ExclusiveSum(scan_temp_memory, scan_temp_size, values_in, values_out, n, (cudaStream_t)cuda_stream_get_current());
+         check_cuda(cub::DeviceScan::ExclusiveSum(temp_buffer, scan_temp_size, values_in, values_out, n, stream));
      }
+
+     free_temp_device(WP_CURRENT_CONTEXT, temp_buffer);
  }
  
  template void scan_device(const int*, int*, int, bool);
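
This change drops the function-local static buffer — which was never freed, only ever grew, and was shared unsafely across CUDA contexts and streams — in favor of a per-call temporary from the new temp_buffer.h allocator, and it routes both CUB calls through the current stream under a ContextGuard with check_cuda error checking. From Python the scan is reachable through warp.utils.array_scan; a minimal sketch, with the (in_array, out_array, inclusive) signature assumed from its use elsewhere in this release:

    # Hedged sketch: array_scan signature assumed, not shown in this diff.
    import numpy as np
    import warp as wp
    from warp.utils import array_scan

    wp.init()

    values = wp.array(np.ones(8, dtype=np.int32), device="cuda:0")
    out = wp.empty_like(values)

    array_scan(values, out, inclusive=True)
    # out.numpy() -> [1, 2, 3, 4, 5, 6, 7, 8]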
warp/native/scan.h CHANGED
@@ -4,3 +4,4 @@ template<typename T>
  void scan_host(const T* values_in, T* values_out, int n, bool inclusive = true);
  template<typename T>
  void scan_device(const T* values_in, T* values_out, int n, bool inclusive = true);
+