warp-lang 1.6.2-py3-none-win_amd64.whl → 1.7.0-py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of warp-lang as potentially problematic.
- warp/__init__.py +7 -1
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +410 -0
- warp/build_dll.py +6 -14
- warp/builtins.py +452 -362
- warp/codegen.py +179 -119
- warp/config.py +42 -6
- warp/context.py +490 -271
- warp/dlpack.py +8 -6
- warp/examples/assets/nonuniform.usd +0 -0
- warp/examples/assets/nvidia_logo.png +0 -0
- warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
- warp/examples/core/example_sample_mesh.py +300 -0
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +2 -2
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_magnetostatics.py +6 -6
- warp/examples/fem/utils.py +9 -3
- warp/examples/interop/example_jax_callable.py +116 -0
- warp/examples/interop/example_jax_ffi_callback.py +132 -0
- warp/examples/interop/example_jax_kernel.py +205 -0
- warp/examples/optim/example_fluid_checkpoint.py +497 -0
- warp/examples/tile/example_tile_matmul.py +2 -4
- warp/fem/__init__.py +11 -1
- warp/fem/adaptivity.py +4 -4
- warp/fem/field/nodal_field.py +22 -68
- warp/fem/field/virtual.py +62 -23
- warp/fem/geometry/adaptive_nanogrid.py +9 -10
- warp/fem/geometry/closest_point.py +1 -1
- warp/fem/geometry/deformed_geometry.py +5 -2
- warp/fem/geometry/geometry.py +5 -0
- warp/fem/geometry/grid_2d.py +12 -12
- warp/fem/geometry/grid_3d.py +12 -15
- warp/fem/geometry/hexmesh.py +5 -7
- warp/fem/geometry/nanogrid.py +9 -11
- warp/fem/geometry/quadmesh.py +13 -13
- warp/fem/geometry/tetmesh.py +3 -4
- warp/fem/geometry/trimesh.py +3 -8
- warp/fem/integrate.py +262 -93
- warp/fem/linalg.py +5 -5
- warp/fem/quadrature/pic_quadrature.py +37 -22
- warp/fem/quadrature/quadrature.py +194 -25
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +4 -2
- warp/fem/space/basis_space.py +25 -18
- warp/fem/space/hexmesh_function_space.py +2 -2
- warp/fem/space/partition.py +6 -2
- warp/fem/space/quadmesh_function_space.py +8 -8
- warp/fem/space/shape/cube_shape_function.py +23 -23
- warp/fem/space/shape/square_shape_function.py +12 -12
- warp/fem/space/shape/triangle_shape_function.py +1 -1
- warp/fem/space/tetmesh_function_space.py +3 -3
- warp/fem/space/trimesh_function_space.py +2 -2
- warp/fem/utils.py +12 -6
- warp/jax.py +14 -1
- warp/jax_experimental/__init__.py +16 -0
- warp/{jax_experimental.py → jax_experimental/custom_call.py} +14 -27
- warp/jax_experimental/ffi.py +698 -0
- warp/jax_experimental/xla_ffi.py +602 -0
- warp/math.py +89 -0
- warp/native/array.h +13 -0
- warp/native/builtin.h +29 -3
- warp/native/bvh.cpp +3 -1
- warp/native/bvh.cu +42 -14
- warp/native/bvh.h +2 -1
- warp/native/clang/clang.cpp +30 -3
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -0
- warp/native/exports.h +68 -63
- warp/native/intersect.h +26 -26
- warp/native/intersect_adj.h +33 -33
- warp/native/marching.cu +1 -1
- warp/native/mat.h +513 -9
- warp/native/mesh.h +10 -10
- warp/native/quat.h +99 -11
- warp/native/rand.h +6 -0
- warp/native/sort.cpp +122 -59
- warp/native/sort.cu +152 -15
- warp/native/sort.h +8 -1
- warp/native/sparse.cpp +43 -22
- warp/native/sparse.cu +52 -17
- warp/native/svd.h +116 -0
- warp/native/tile.h +301 -105
- warp/native/tile_reduce.h +46 -3
- warp/native/vec.h +68 -7
- warp/native/volume.cpp +85 -113
- warp/native/volume_builder.cu +25 -10
- warp/native/volume_builder.h +6 -0
- warp/native/warp.cpp +5 -6
- warp/native/warp.cu +99 -10
- warp/native/warp.h +19 -10
- warp/optim/linear.py +10 -10
- warp/sim/articulation.py +4 -4
- warp/sim/collide.py +21 -10
- warp/sim/import_mjcf.py +449 -155
- warp/sim/import_urdf.py +32 -12
- warp/sim/integrator_euler.py +5 -5
- warp/sim/integrator_featherstone.py +3 -10
- warp/sim/integrator_vbd.py +207 -2
- warp/sim/integrator_xpbd.py +5 -5
- warp/sim/model.py +42 -13
- warp/sim/utils.py +2 -2
- warp/sparse.py +642 -555
- warp/stubs.py +216 -19
- warp/tests/__main__.py +0 -15
- warp/tests/cuda/__init__.py +0 -0
- warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
- warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
- warp/tests/geometry/__init__.py +0 -0
- warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
- warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
- warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
- warp/tests/interop/__init__.py +0 -0
- warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
- warp/tests/sim/__init__.py +0 -0
- warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
- warp/tests/{test_collision.py → sim/test_collision.py} +2 -2
- warp/tests/{test_model.py → sim/test_model.py} +40 -0
- warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
- warp/tests/sim/test_vbd.py +597 -0
- warp/tests/test_bool.py +1 -1
- warp/tests/test_examples.py +28 -36
- warp/tests/test_fem.py +23 -4
- warp/tests/test_linear_solvers.py +0 -11
- warp/tests/test_mat.py +233 -79
- warp/tests/test_mat_scalar_ops.py +4 -4
- warp/tests/test_overwrite.py +0 -60
- warp/tests/test_quat.py +67 -46
- warp/tests/test_rand.py +44 -37
- warp/tests/test_sparse.py +47 -6
- warp/tests/test_spatial.py +75 -0
- warp/tests/test_static.py +1 -1
- warp/tests/test_utils.py +84 -4
- warp/tests/test_vec.py +46 -34
- warp/tests/tile/__init__.py +0 -0
- warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
- warp/tests/{test_tile_load.py → tile/test_tile_load.py} +1 -1
- warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
- warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
- warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
- warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
- warp/tests/unittest_serial.py +1 -0
- warp/tests/unittest_suites.py +45 -59
- warp/tests/unittest_utils.py +2 -1
- warp/thirdparty/unittest_parallel.py +3 -1
- warp/types.py +110 -658
- warp/utils.py +137 -72
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/METADATA +29 -7
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/RECORD +172 -162
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
- warp/examples/optim/example_walker.py +0 -317
- warp/native/cutlass_gemm.cpp +0 -43
- warp/native/cutlass_gemm.cu +0 -382
- warp/tests/test_matmul.py +0 -511
- warp/tests/test_matmul_lite.py +0 -411
- warp/tests/test_vbd.py +0 -386
- warp/tests/unused_test_misc.py +0 -77
- /warp/tests/{test_async.py → cuda/test_async.py} +0 -0
- /warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
- /warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
- /warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
- /warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
- /warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
- /warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
- /warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
- /warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
- /warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
- /warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
- /warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
- /warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
- /warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
- /warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +0 -0
- /warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
- /warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
- /warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info/licenses}/LICENSE.md +0 -0
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0
warp/native/quat.h
CHANGED

(In the reconstructed hunks below, removed lines shown truncated or blank reflect text the diff viewer did not capture.)

@@ -375,12 +375,14 @@ inline CUDA_CALLABLE mat_t<3,3,Type> quat_to_matrix(const quat_t<Type>& q)
     vec_t<3,Type> c2 = quat_rotate(q, vec_t<3,Type>(0.0, 1.0, 0.0));
     vec_t<3,Type> c3 = quat_rotate(q, vec_t<3,Type>(0.0, 0.0, 1.0));
 
-    return
+    return matrix_from_cols<Type>(c1, c2, c3);
 }
 
-template<typename Type>
-inline CUDA_CALLABLE quat_t<Type> quat_from_matrix(const mat_t<
+template<unsigned Rows, unsigned Cols, typename Type>
+inline CUDA_CALLABLE quat_t<Type> quat_from_matrix(const mat_t<Rows,Cols,Type>& m)
 {
+    static_assert((Rows == 3 && Cols == 3) || (Rows == 4 && Cols == 4), "Non-square matrix");
+
     const Type tr = m.data[0][0] + m.data[1][1] + m.data[2][2];
     Type x, y, z, w, h = Type(0);
 
@@ -498,37 +500,98 @@ inline CUDA_CALLABLE void adj_indexref(quat_t<Type>* q, int idx,
 
 
 template<typename Type>
-inline CUDA_CALLABLE void
+inline CUDA_CALLABLE void add_inplace(quat_t<Type>& q, int idx, Type value)
 {
+#ifndef NDEBUG
+    if (idx < 0 || idx > 3)
+    {
+        printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
     q[idx] += value;
 }
 
 
 template<typename Type>
-inline CUDA_CALLABLE void
+inline CUDA_CALLABLE void adj_add_inplace(quat_t<Type>& q, int idx, Type value,
     quat_t<Type>& adj_q, int adj_idx, Type& adj_value)
 {
+#ifndef NDEBUG
+    if (idx < 0 || idx > 3)
+    {
+        printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
     adj_value += adj_q[idx];
 }
 
 
 template<typename Type>
-inline CUDA_CALLABLE void
+inline CUDA_CALLABLE void sub_inplace(quat_t<Type>& q, int idx, Type value)
 {
+#ifndef NDEBUG
+    if (idx < 0 || idx > 3)
+    {
+        printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
     q[idx] -= value;
 }
 
 
 template<typename Type>
-inline CUDA_CALLABLE void
+inline CUDA_CALLABLE void adj_sub_inplace(quat_t<Type>& q, int idx, Type value,
     quat_t<Type>& adj_q, int adj_idx, Type& adj_value)
 {
+#ifndef NDEBUG
+    if (idx < 0 || idx > 3)
+    {
+        printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
     adj_value -= adj_q[idx];
 }
 
 
 template<typename Type>
-inline CUDA_CALLABLE
+inline CUDA_CALLABLE void assign_inplace(quat_t<Type>& q, int idx, Type value)
+{
+#ifndef NDEBUG
+    if (idx < 0 || idx > 3)
+    {
+        printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
+    q[idx] = value;
+}
+
+template<typename Type>
+inline CUDA_CALLABLE void adj_assign_inplace(quat_t<Type>& q, int idx, Type value, quat_t<Type>& adj_q, int& adj_idx, Type& adj_value)
+{
+#ifndef NDEBUG
+    if (idx < 0 || idx > 3)
+    {
+        printf("quat index %d out of bounds at %s %d\n", idx, __FILE__, __LINE__);
+        assert(0);
+    }
+#endif
+
+    adj_value += adj_q[idx];
+}
+
+
+template<typename Type>
+inline CUDA_CALLABLE quat_t<Type> assign_copy(quat_t<Type>& q, int idx, Type value)
 {
 #ifndef NDEBUG
     if (idx < 0 || idx > 3)
@@ -544,7 +607,7 @@ inline CUDA_CALLABLE quat_t<Type> assign(quat_t<Type>& q, int idx, Type value)
 }
 
 template<typename Type>
-inline CUDA_CALLABLE void
+inline CUDA_CALLABLE void adj_assign_copy(quat_t<Type>& q, int idx, Type value, quat_t<Type>& adj_q, int& adj_idx, Type& adj_value, const quat_t<Type>& adj_ret)
 {
 #ifndef NDEBUG
     if (idx < 0 || idx > 3)
@@ -562,6 +625,7 @@ inline CUDA_CALLABLE void adj_assign(quat_t<Type>& q, int idx, Type value, quat_
     }
 }
 
+
 template<typename Type>
 CUDA_CALLABLE inline quat_t<Type> lerp(const quat_t<Type>& a, const quat_t<Type>& b, Type t)
 {
@@ -1048,9 +1112,11 @@ inline CUDA_CALLABLE void adj_quat_to_matrix(const quat_t<Type>& q, quat_t<Type>
     adj_quat_rotate(q, vec_t<3,Type>(0.0, 0.0, 1.0), adj_q, t, adj_ret.get_col(2));
 }
 
-template<typename Type>
-inline CUDA_CALLABLE void adj_quat_from_matrix(const mat_t<
+template<unsigned Rows, unsigned Cols, typename Type>
+inline CUDA_CALLABLE void adj_quat_from_matrix(const mat_t<Rows,Cols,Type>& m, mat_t<Rows,Cols,Type>& adj_m, const quat_t<Type>& adj_ret)
 {
+    static_assert((Rows == 3 && Cols == 3) || (Rows == 4 && Cols == 4), "Non-square matrix");
+
     const Type tr = m.data[0][0] + m.data[1][1] + m.data[2][2];
     Type x, y, z, w, h = Type(0);
 
@@ -1280,4 +1346,26 @@ CUDA_CALLABLE inline void adj_len(const quat_t<Type>& x, quat_t<Type>& adj_x, co
 {
 }
 
+template<typename Type>
+inline CUDA_CALLABLE void expect_near(const quat_t<Type>& actual, const quat_t<Type>& expected, const Type& tolerance)
+{
+    Type diff(0);
+    for(size_t i = 0; i < 4; ++i)
+    {
+        diff = max(diff, abs(actual[i] - expected[i]));
+    }
+    if (diff > tolerance)
+    {
+        printf("Error, expect_near() failed with tolerance "); print(tolerance);
+        printf("\t Expected: "); print(expected);
+        printf("\t Actual: "); print(actual);
+    }
+}
+
+template<typename Type>
+inline CUDA_CALLABLE void adj_expect_near(const quat_t<Type>& actual, const quat_t<Type>& expected, Type tolerance, quat_t<Type>& adj_actual, quat_t<Type>& adj_expected, Type adj_tolerance)
+{
+    // nop
+}
+
 } // namespace wp
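Note: quat_from_matrix() and adj_quat_from_matrix() are now templated on the matrix dimensions and accept either a 3x3 rotation matrix or a 4x4 transform, in both cases reading only the upper-left 3x3 block (the static_assert rejects other shapes). For reference, here is a standalone sketch of the trace-positive branch of the extraction these functions perform; it is plain C++ written for this note, using raw arrays instead of Warp's mat_t/quat_t, not code copied from the header:

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    // Trace-positive branch of quaternion-from-rotation-matrix extraction.
    // m is row-major and maps column vectors; only m[0..2][0..2] is read,
    // so a 4x4 homogeneous transform can be passed through unchanged.
    void quat_from_rotation(const float m[4][4], float q[4]) // q = {x, y, z, w}
    {
        const float tr = m[0][0] + m[1][1] + m[2][2];
        assert(tr > 0.0f); // the real header also handles the tr <= 0 branches

        const float s = std::sqrt(tr + 1.0f) * 2.0f; // s = 4*w
        q[3] = 0.25f * s;
        q[0] = (m[2][1] - m[1][2]) / s;
        q[1] = (m[0][2] - m[2][0]) / s;
        q[2] = (m[1][0] - m[0][1]) / s;
    }

    int main()
    {
        // 90-degree rotation about Z embedded in a 4x4 transform with translation
        const float xf[4][4] = {
            { 0.0f, -1.0f, 0.0f, 5.0f },
            { 1.0f,  0.0f, 0.0f, 6.0f },
            { 0.0f,  0.0f, 1.0f, 7.0f },
            { 0.0f,  0.0f, 0.0f, 1.0f },
        };

        float q[4];
        quat_from_rotation(xf, q);
        printf("(%f, %f, %f, %f)\n", q[0], q[1], q[2], q[3]); // ~(0, 0, 0.7071, 0.7071)
    }

Because only the 3x3 block is read, a full transform can be passed directly without extracting its rotation part at the call site.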
warp/native/rand.h
CHANGED

@@ -53,6 +53,9 @@ inline CUDA_CALLABLE uint32 rand_init(int seed, int offset) { return rand_pcg(ui
 inline CUDA_CALLABLE int randi(uint32& state) { state = rand_pcg(state); return int(state); }
 inline CUDA_CALLABLE int randi(uint32& state, int min, int max) { state = rand_pcg(state); return state % (max - min) + min; }
 
+inline CUDA_CALLABLE uint32 randu(uint32& state) { state = rand_pcg(state); return state; }
+inline CUDA_CALLABLE uint32 randu(uint32& state, uint32 min, uint32 max) { state = rand_pcg(state); return state % (max - min) + min; }
+
 /*
  * We want to ensure randf adheres to a uniform distribution over [0,1). The set of all possible float32 (IEEE 754 standard) values is not uniformly distributed however.
  * On the other hand, for a given sign and exponent, the mantissa of the float32 representation is uniformly distributed.
@@ -74,6 +77,9 @@ inline CUDA_CALLABLE void adj_rand_init(int seed, int offset, int& adj_seed, int
 inline CUDA_CALLABLE void adj_randi(uint32& state, uint32& adj_state, float adj_ret) {}
 inline CUDA_CALLABLE void adj_randi(uint32& state, int min, int max, uint32& adj_state, int& adj_min, int& adj_max, float adj_ret) {}
 
+inline CUDA_CALLABLE void adj_randu(uint32& state, uint32& adj_state, float adj_ret) {}
+inline CUDA_CALLABLE void adj_randu(uint32& state, uint32 min, uint32 max, uint32& adj_state, uint32& adj_min, uint32& adj_max, float adj_ret) {}
+
 inline CUDA_CALLABLE void adj_randf(uint32& state, uint32& adj_state, float adj_ret) {}
 inline CUDA_CALLABLE void adj_randf(uint32& state, float min, float max, uint32& adj_state, float& adj_min, float& adj_max, float adj_ret) {}
 
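Note: the new randu() overloads mirror randi() but stay in the unsigned 32-bit domain. As with randi(), the ranged overload maps the generator output with a modulo, so ranges that do not evenly divide 2^32 carry a small modulo bias. A self-contained host-side sketch of the same pattern; the rand_pcg() below is a stand-in PCG-style hash written for this note, not the one in warp/native/rand.h:

    #include <cstdint>
    #include <cstdio>

    using uint32 = uint32_t;

    // Stand-in PCG-style hash (state advance + output permutation).
    inline uint32 rand_pcg(uint32 state)
    {
        uint32 b = state * 747796405u + 2891336453u;
        uint32 word = ((b >> ((b >> 28u) + 4u)) ^ b) * 277803737u;
        return (word >> 22u) ^ word;
    }

    // Same shape as the new builtins above.
    inline uint32 randu(uint32& state) { state = rand_pcg(state); return state; }
    inline uint32 randu(uint32& state, uint32 min, uint32 max) { state = rand_pcg(state); return state % (max - min) + min; }

    int main()
    {
        uint32 state = rand_pcg(42u); // seeding, analogous to rand_init(seed)
        for (int i = 0; i < 4; ++i)
            printf("%u\n", randu(state, 10u, 20u)); // values in [10, 20)
    }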
warp/native/sort.cpp
CHANGED

@@ -21,69 +21,75 @@
 
 #include <cstdint>
 
-
+//Only integer keys (bit count 32 or 64) are supported. Floats need to get converted into int first. see radix_float_to_int.
+template <typename KeyType>
+void radix_sort_pairs_host(KeyType* keys, int* values, int n, int offset_to_scratch_memory)
 {
-
+    const int numPasses = sizeof(KeyType) / 2;
+    static int tables[numPasses][1 << 16];
     memset(tables, 0, sizeof(tables));
-
-    int* auxKeys = keys + n;
-    int* auxValues = values + n;
-
+
     // build histograms
-    for (int
-
-
-
-
-
-
+    for (int p = 0; p < numPasses; ++p)
+    {
+        for (int i=0; i < n; ++i)
+        {
+            const int shift = p * 16;
+            const int b = (keys[i] >> shift) & 0xffff;
+
+            ++tables[p][b];
+        }
     }
 
-    // convert histograms to offset tables in-place
-    int
-    int offhigh = 0;
-
-    for (int i=0; i < 65536; ++i)
+    // convert histograms to offset tables in-place
+    for (int p = 0; p < numPasses; ++p)
     {
-
-
-
-
-
-
-
-
+        int off = 0;
+        for (int i = 0; i < 65536; ++i)
+        {
+            const int newoff = off + tables[p][i];
+
+            tables[p][i] = off;
+
+            off = newoff;
+        }
     }
-
-
-
-
-
-
-
-
-
-    //
-
-
-
-
-
-
-
-
-
-
-
+
+    for (int p = 0; p < numPasses; ++p)
+    {
+        int flipFlop = p % 2;
+        KeyType* readKeys = keys + offset_to_scratch_memory * flipFlop;
+        int* readValues = values + offset_to_scratch_memory * flipFlop;
+        KeyType* writeKeys = keys + offset_to_scratch_memory * (1 - flipFlop);
+        int* writeValues = values + offset_to_scratch_memory * (1 - flipFlop);
+
+        // pass 1 - sort by low 16 bits
+        for (int i=0; i < n; ++i)
+        {
+            // lookup offset of input
+            const KeyType k = readKeys[i];
+            const int v = readValues[i];
+
+            const int shift = p * 16;
+            const int b = (k >> shift) & 0xffff;
+
+            // find offset and increment
+            const int offset = tables[p][b]++;
+
+            writeKeys[offset] = k;
+            writeValues[offset] = v;
+        }
+    }
+}
 
-
-
-
-
-
-
-
+void radix_sort_pairs_host(int* keys, int* values, int n)
+{
+    radix_sort_pairs_host<int>(keys, values, n, n);
+}
+
+void radix_sort_pairs_host(int64_t* keys, int* values, int n)
+{
+    radix_sort_pairs_host<int64_t>(keys, values, n, n);
 }
 
 //http://stereopsis.com/radix.html
@@ -94,13 +100,13 @@ inline unsigned int radix_float_to_int(float f)
     return i ^ mask;
 }
 
-void radix_sort_pairs_host(float* keys, int* values, int n)
+void radix_sort_pairs_host(float* keys, int* values, int n, int offset_to_scratch_memory)
 {
     static unsigned int tables[2][1 << 16];
     memset(tables, 0, sizeof(tables));
 
-    float* auxKeys = keys +
-    int* auxValues = values +
+    float* auxKeys = keys + offset_to_scratch_memory;
+    int* auxValues = values + offset_to_scratch_memory;
 
     // build histograms
     for (int i=0; i < n; ++i)
@@ -162,14 +168,46 @@ void radix_sort_pairs_host(float* keys, int* values, int n)
     }
 }
 
+void radix_sort_pairs_host(float* keys, int* values, int n)
+{
+    radix_sort_pairs_host(keys, values, n, n);
+}
+
+void segmented_sort_pairs_host(float* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments)
+{
+    for (int i = 0; i < num_segments; ++i)
+    {
+        const int start = segment_start_indices[i];
+        const int end = segment_end_indices[i];
+        radix_sort_pairs_host(keys + start, values + start, end - start, n);
+    }
+}
+
+void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments)
+{
+    for (int i = 0; i < num_segments; ++i)
+    {
+        const int start = segment_start_indices[i];
+        const int end = segment_end_indices[i];
+        radix_sort_pairs_host(keys + start, values + start, end - start, n);
+    }
+}
+
+
 #if !WP_ENABLE_CUDA
 
 void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
 
 void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
 
+void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
+
 void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
 
+void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
+
+void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
+
 #endif // !WP_ENABLE_CUDA
 
 
@@ -180,9 +218,34 @@ void radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
         reinterpret_cast<int *>(values), n);
 }
 
+void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
+{
+    radix_sort_pairs_host(
+        reinterpret_cast<int64_t *>(keys),
+        reinterpret_cast<int *>(values), n);
+}
+
 void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
 {
     radix_sort_pairs_host(
         reinterpret_cast<float *>(keys),
         reinterpret_cast<int *>(values), n);
-}
+}
+
+void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
+{
+    segmented_sort_pairs_host(
+        reinterpret_cast<float *>(keys),
+        reinterpret_cast<int *>(values), n,
+        reinterpret_cast<int *>(segment_start_indices),
+        reinterpret_cast<int *>(segment_end_indices), num_segments);
+}
+
+void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
+{
+    segmented_sort_pairs_host(
+        reinterpret_cast<int *>(keys),
+        reinterpret_cast<int *>(values), n,
+        reinterpret_cast<int *>(segment_start_indices),
+        reinterpret_cast<int *>(segment_end_indices), num_segments);
}
warp/native/sort.cu
CHANGED

@@ -36,11 +36,12 @@ struct RadixSortTemp
 static std::map<void*, RadixSortTemp> g_radix_sort_temp_map;
 
 
-
+template <typename KeyType>
+void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* size_out)
 {
     ContextGuard guard(context);
 
-    cub::DoubleBuffer<
+    cub::DoubleBuffer<KeyType> d_keys;
     cub::DoubleBuffer<int> d_values;
 
     // compute temporary memory required
@@ -50,7 +51,7 @@ void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
         sort_temp_size,
         d_keys,
         d_values,
-        n, 0,
+        n, 0, sizeof(KeyType)*8,
         (cudaStream_t)cuda_stream_get_current()));
 
     if (!context)
@@ -71,15 +72,21 @@ void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
         *size_out = temp.size;
 }
 
-void
+void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
+{
+    radix_sort_reserve_internal<int>(context, n, mem_out, size_out);
+}
+
+template <typename KeyType>
+void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
 {
     ContextGuard guard(context);
 
-    cub::DoubleBuffer<
+    cub::DoubleBuffer<KeyType> d_keys(keys, keys + n);
     cub::DoubleBuffer<int> d_values(values, values + n);
 
     RadixSortTemp temp;
-
+    radix_sort_reserve_internal<KeyType>(WP_CURRENT_CONTEXT, n, &temp.mem, &temp.size);
 
     // sort
     check_cuda(cub::DeviceRadixSort::SortPairs(
@@ -87,16 +94,31 @@ void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
         temp.size,
         d_keys,
         d_values,
-        n, 0,
+        n, 0, sizeof(KeyType)*8,
         (cudaStream_t)cuda_stream_get_current()));
 
     if (d_keys.Current() != keys)
-        memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(
+        memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
 
     if (d_values.Current() != values)
         memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
 }
 
+void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
+{
+    radix_sort_pairs_device<int>(context, keys, values, n);
+}
+
+void radix_sort_pairs_device(void* context, float* keys, int* values, int n)
+{
+    radix_sort_pairs_device<float>(context, keys, values, n);
+}
+
+void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
+{
+    radix_sort_pairs_device<int64_t>(context, keys, values, n);
+}
+
 void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
 {
     radix_sort_pairs_device(
@@ -105,7 +127,69 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
         reinterpret_cast<int *>(values), n);
 }
 
-void
+void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
+{
+    radix_sort_pairs_device(
+        WP_CURRENT_CONTEXT,
+        reinterpret_cast<float *>(keys),
+        reinterpret_cast<int *>(values), n);
+}
+
+void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
+{
+    radix_sort_pairs_device(
+        WP_CURRENT_CONTEXT,
+        reinterpret_cast<int64_t *>(keys),
+        reinterpret_cast<int *>(values), n);
+}
+
+void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_out, size_t* size_out)
+{
+    ContextGuard guard(context);
+
+    cub::DoubleBuffer<int> d_keys;
+    cub::DoubleBuffer<int> d_values;
+
+    int* start_indices = NULL;
+    int* end_indices = NULL;
+
+    // compute temporary memory required
+    size_t sort_temp_size;
+    check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
+        NULL,
+        sort_temp_size,
+        d_keys,
+        d_values,
+        n,
+        num_segments,
+        start_indices,
+        end_indices,
+        0,
+        32,
+        (cudaStream_t)cuda_stream_get_current()));
+
+    if (!context)
+        context = cuda_context_get_current();
+
+    RadixSortTemp& temp = g_radix_sort_temp_map[context];
+
+    if (sort_temp_size > temp.size)
+    {
+        free_device(WP_CURRENT_CONTEXT, temp.mem);
+        temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
+        temp.size = sort_temp_size;
+    }
+
+    if (mem_out)
+        *mem_out = temp.mem;
+    if (size_out)
+        *size_out = temp.size;
+}
+
+// segment_start_indices and segment_end_indices are arrays of length num_segments, where segment_start_indices[i] is the index of the first element
+// in the i-th segment and segment_end_indices[i] is the index after the last element in the i-th segment
+// https://nvidia.github.io/cccl/cub/api/structcub_1_1DeviceSegmentedRadixSort.html
+void segmented_sort_pairs_device(void* context, float* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments)
 {
     ContextGuard guard(context);
 
@@ -113,15 +197,20 @@ void radix_sort_pairs_device(void* context, float* keys, int* values, int n)
     cub::DoubleBuffer<int> d_values(values, values + n);
 
     RadixSortTemp temp;
-
+    segmented_sort_reserve(WP_CURRENT_CONTEXT, n, num_segments, &temp.mem, &temp.size);
 
     // sort
-    check_cuda(cub::
+    check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
         temp.mem,
         temp.size,
         d_keys,
         d_values,
-        n,
+        n,
+        num_segments,
+        segment_start_indices,
+        segment_end_indices,
+        0,
+        32,
         (cudaStream_t)cuda_stream_get_current()));
 
     if (d_keys.Current() != keys)
@@ -131,10 +220,58 @@ void radix_sort_pairs_device(void* context, float* keys, int* values, int n)
         memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
 }
 
-void
+void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
 {
-
+    segmented_sort_pairs_device(
         WP_CURRENT_CONTEXT,
         reinterpret_cast<float *>(keys),
-        reinterpret_cast<int *>(values), n
+        reinterpret_cast<int *>(values), n,
+        reinterpret_cast<int *>(segment_start_indices),
+        reinterpret_cast<int *>(segment_end_indices),
+        num_segments);
+}
+
+// segment_indices is an array of length num_segments + 1, where segment_indices[i] is the index of the first element in the i-th segment
+// The end of a segment is given by segment_indices[i+1]
+// https://nvidia.github.io/cccl/cub/api/structcub_1_1DeviceSegmentedSort.html#a-simple-example
+void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments)
+{
+    ContextGuard guard(context);
+
+    cub::DoubleBuffer<int> d_keys(keys, keys + n);
+    cub::DoubleBuffer<int> d_values(values, values + n);
+
+    RadixSortTemp temp;
+    segmented_sort_reserve(WP_CURRENT_CONTEXT, n, num_segments, &temp.mem, &temp.size);
+
+    // sort
+    check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
+        temp.mem,
+        temp.size,
+        d_keys,
+        d_values,
+        n,
+        num_segments,
+        segment_start_indices,
+        segment_end_indices,
+        0,
+        32,
+        (cudaStream_t)cuda_stream_get_current()));
+
+    if (d_keys.Current() != keys)
+        memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(float)*n);
+
+    if (d_values.Current() != values)
+        memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
+}
+
+void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
+{
+    segmented_sort_pairs_device(
+        WP_CURRENT_CONTEXT,
+        reinterpret_cast<int *>(keys),
+        reinterpret_cast<int *>(values), n,
+        reinterpret_cast<int *>(segment_start_indices),
+        reinterpret_cast<int *>(segment_end_indices),
+        num_segments);
 }
warp/native/sort.h
CHANGED

@@ -22,5 +22,12 @@
 void radix_sort_reserve(void* context, int n, void** mem_out=NULL, size_t* size_out=NULL);
 void radix_sort_pairs_host(int* keys, int* values, int n);
 void radix_sort_pairs_host(float* keys, int* values, int n);
+void radix_sort_pairs_host(int64_t* keys, int* values, int n);
 void radix_sort_pairs_device(void* context, int* keys, int* values, int n);
-void radix_sort_pairs_device(void* context, float* keys, int* values, int n);
+void radix_sort_pairs_device(void* context, float* keys, int* values, int n);
+void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n);
+
+void segmented_sort_pairs_host(float* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments);
+void segmented_sort_pairs_device(void* context, float* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments);
+void segmented_sort_pairs_host(void* context, int* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments);
+void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments);