PyPI - warp-lang - Versions diffs - 1.6.1__py3-none-win_amd64.whl → 1.7.0__py3-none-win_amd64.whl - Mend

warp-lang 1.6.1__py3-none-win_amd64.whl → 1.7.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (401)

warp/__init__.py +21 -7
warp/autograd.py +14 -6
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +424 -6
warp/build_dll.py +20 -20
warp/builtins.py +467 -368
warp/codegen.py +193 -125
warp/config.py +56 -12
warp/constants.py +14 -6
warp/context.py +524 -277
warp/dlpack.py +22 -12
warp/examples/__init__.py +14 -6
warp/examples/assets/nonuniform.usd +0 -0
warp/examples/assets/nvidia_logo.png +0 -0
warp/examples/benchmarks/benchmark_api.py +14 -6
warp/examples/benchmarks/benchmark_cloth.py +14 -6
warp/examples/benchmarks/benchmark_cloth_cupy.py +14 -6
warp/examples/benchmarks/benchmark_cloth_jax.py +14 -6
warp/examples/benchmarks/benchmark_cloth_numba.py +15 -0
warp/examples/benchmarks/benchmark_cloth_numpy.py +14 -6
warp/examples/benchmarks/benchmark_cloth_paddle.py +14 -6
warp/examples/benchmarks/benchmark_cloth_pytorch.py +14 -6
warp/examples/benchmarks/benchmark_cloth_taichi.py +14 -6
warp/examples/benchmarks/benchmark_cloth_warp.py +14 -6
warp/examples/benchmarks/benchmark_gemm.py +82 -48
warp/examples/benchmarks/benchmark_interop_paddle.py +14 -6
warp/examples/benchmarks/benchmark_interop_torch.py +14 -6
warp/examples/benchmarks/benchmark_launches.py +14 -6
warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
warp/examples/browse.py +14 -6
warp/examples/core/example_cupy.py +14 -6
warp/examples/core/example_dem.py +14 -6
warp/examples/core/example_fluid.py +14 -6
warp/examples/core/example_graph_capture.py +14 -6
warp/examples/core/example_marching_cubes.py +14 -6
warp/examples/core/example_mesh.py +14 -6
warp/examples/core/example_mesh_intersect.py +14 -6
warp/examples/core/example_nvdb.py +14 -6
warp/examples/core/example_raycast.py +14 -6
warp/examples/core/example_raymarch.py +14 -6
warp/examples/core/example_render_opengl.py +14 -6
warp/examples/core/example_sample_mesh.py +300 -0
warp/examples/core/example_sph.py +14 -6
warp/examples/core/example_torch.py +14 -6
warp/examples/core/example_wave.py +14 -6
warp/examples/fem/example_adaptive_grid.py +14 -6
warp/examples/fem/example_apic_fluid.py +15 -7
warp/examples/fem/example_burgers.py +16 -8
warp/examples/fem/example_convection_diffusion.py +14 -6
warp/examples/fem/example_convection_diffusion_dg.py +14 -6
warp/examples/fem/example_deformed_geometry.py +15 -7
warp/examples/fem/example_diffusion.py +14 -6
warp/examples/fem/example_diffusion_3d.py +14 -6
warp/examples/fem/example_diffusion_mgpu.py +14 -6
warp/examples/fem/example_distortion_energy.py +15 -7
warp/examples/fem/example_magnetostatics.py +20 -12
warp/examples/fem/example_mixed_elasticity.py +14 -6
warp/examples/fem/example_navier_stokes.py +14 -6
warp/examples/fem/example_nonconforming_contact.py +14 -6
warp/examples/fem/example_stokes.py +14 -6
warp/examples/fem/example_stokes_transfer.py +14 -6
warp/examples/fem/example_streamlines.py +14 -6
warp/examples/fem/utils.py +24 -3
warp/examples/interop/example_jax_callable.py +116 -0
warp/examples/interop/example_jax_ffi_callback.py +132 -0
warp/examples/interop/example_jax_kernel.py +205 -0
warp/examples/optim/example_bounce.py +14 -6
warp/examples/optim/example_cloth_throw.py +14 -6
warp/examples/optim/example_diffray.py +14 -6
warp/examples/optim/example_drone.py +14 -6
warp/examples/optim/example_fluid_checkpoint.py +497 -0
warp/examples/optim/example_inverse_kinematics.py +14 -6
warp/examples/optim/example_inverse_kinematics_torch.py +14 -6
warp/examples/optim/example_softbody_properties.py +14 -6
warp/examples/optim/example_spring_cage.py +14 -6
warp/examples/optim/example_trajectory.py +14 -6
warp/examples/sim/example_cartpole.py +14 -6
warp/examples/sim/example_cloth.py +14 -6
warp/examples/sim/example_cloth_self_contact.py +14 -6
warp/examples/sim/example_granular.py +14 -6
warp/examples/sim/example_granular_collision_sdf.py +14 -6
warp/examples/sim/example_jacobian_ik.py +14 -6
warp/examples/sim/example_particle_chain.py +14 -6
warp/examples/sim/example_quadruped.py +14 -6
warp/examples/sim/example_rigid_chain.py +14 -6
warp/examples/sim/example_rigid_contact.py +14 -6
warp/examples/sim/example_rigid_force.py +14 -6
warp/examples/sim/example_rigid_gyroscopic.py +14 -6
warp/examples/sim/example_rigid_soft_contact.py +14 -6
warp/examples/sim/example_soft_body.py +14 -6
warp/examples/tile/example_tile_cholesky.py +14 -6
warp/examples/tile/example_tile_convolution.py +14 -6
warp/examples/tile/example_tile_fft.py +14 -6
warp/examples/tile/example_tile_filtering.py +14 -6
warp/examples/tile/example_tile_matmul.py +16 -10
warp/examples/tile/example_tile_mlp.py +14 -6
warp/examples/tile/example_tile_nbody.py +14 -6
warp/examples/tile/example_tile_walker.py +14 -6
warp/fabric.py +15 -0
warp/fem/__init__.py +26 -1
warp/fem/adaptivity.py +19 -4
warp/fem/cache.py +15 -0
warp/fem/dirichlet.py +15 -0
warp/fem/domain.py +15 -0
warp/fem/field/__init__.py +15 -0
warp/fem/field/field.py +15 -0
warp/fem/field/nodal_field.py +37 -68
warp/fem/field/restriction.py +15 -0
warp/fem/field/virtual.py +77 -23
warp/fem/geometry/__init__.py +15 -0
warp/fem/geometry/adaptive_nanogrid.py +24 -10
warp/fem/geometry/closest_point.py +16 -1
warp/fem/geometry/deformed_geometry.py +20 -2
warp/fem/geometry/element.py +15 -0
warp/fem/geometry/geometry.py +20 -0
warp/fem/geometry/grid_2d.py +27 -12
warp/fem/geometry/grid_3d.py +27 -15
warp/fem/geometry/hexmesh.py +20 -7
warp/fem/geometry/nanogrid.py +24 -11
warp/fem/geometry/partition.py +15 -0
warp/fem/geometry/quadmesh.py +28 -13
warp/fem/geometry/tetmesh.py +18 -4
warp/fem/geometry/trimesh.py +18 -8
warp/fem/integrate.py +277 -93
warp/fem/linalg.py +20 -5
warp/fem/operator.py +15 -0
warp/fem/polynomial.py +15 -0
warp/fem/quadrature/__init__.py +15 -0
warp/fem/quadrature/pic_quadrature.py +52 -22
warp/fem/quadrature/quadrature.py +209 -25
warp/fem/space/__init__.py +16 -1
warp/fem/space/basis_function_space.py +19 -2
warp/fem/space/basis_space.py +40 -18
warp/fem/space/dof_mapper.py +15 -0
warp/fem/space/function_space.py +15 -0
warp/fem/space/grid_2d_function_space.py +15 -0
warp/fem/space/grid_3d_function_space.py +15 -0
warp/fem/space/hexmesh_function_space.py +17 -2
warp/fem/space/nanogrid_function_space.py +15 -0
warp/fem/space/partition.py +21 -2
warp/fem/space/quadmesh_function_space.py +23 -8
warp/fem/space/restriction.py +15 -0
warp/fem/space/shape/__init__.py +15 -0
warp/fem/space/shape/cube_shape_function.py +38 -23
warp/fem/space/shape/shape_function.py +15 -0
warp/fem/space/shape/square_shape_function.py +27 -12
warp/fem/space/shape/tet_shape_function.py +15 -0
warp/fem/space/shape/triangle_shape_function.py +16 -1
warp/fem/space/tetmesh_function_space.py +18 -3
warp/fem/space/topology.py +15 -0
warp/fem/space/trimesh_function_space.py +17 -2
warp/fem/types.py +15 -0
warp/fem/utils.py +27 -6
warp/jax.py +28 -7
warp/jax_experimental/__init__.py +16 -0
warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -33
warp/jax_experimental/ffi.py +698 -0
warp/jax_experimental/xla_ffi.py +602 -0
warp/math.py +103 -6
warp/native/array.h +28 -6
warp/native/builtin.h +44 -9
warp/native/bvh.cpp +18 -7
warp/native/bvh.cu +57 -20
warp/native/bvh.h +17 -7
warp/native/clang/clang.cpp +45 -9
warp/native/coloring.cpp +15 -6
warp/native/crt.cpp +15 -6
warp/native/crt.h +15 -6
warp/native/cuda_crt.h +15 -6
warp/native/cuda_util.cpp +29 -6
warp/native/cuda_util.h +17 -6
warp/native/error.cpp +15 -6
warp/native/error.h +15 -6
warp/native/exports.h +85 -63
warp/native/fabric.h +15 -6
warp/native/hashgrid.cpp +15 -6
warp/native/hashgrid.cu +15 -6
warp/native/hashgrid.h +15 -6
warp/native/initializer_array.h +15 -6
warp/native/intersect.h +41 -32
warp/native/intersect_adj.h +48 -39
warp/native/intersect_tri.h +17 -0
warp/native/marching.cpp +16 -0
warp/native/marching.cu +16 -7
warp/native/marching.h +17 -0
warp/native/mat.h +528 -15
warp/native/mathdx.cpp +15 -6
warp/native/matnn.h +15 -6
warp/native/mesh.cpp +15 -6
warp/native/mesh.cu +15 -6
warp/native/mesh.h +25 -16
warp/native/noise.h +15 -6
warp/native/quat.h +114 -17
warp/native/rand.h +21 -6
warp/native/range.h +15 -6
warp/native/reduce.cpp +15 -6
warp/native/reduce.cu +15 -6
warp/native/runlength_encode.cpp +15 -6
warp/native/runlength_encode.cu +15 -6
warp/native/scan.cpp +15 -6
warp/native/scan.cu +15 -6
warp/native/scan.h +15 -6
warp/native/solid_angle.h +17 -0
warp/native/sort.cpp +137 -65
warp/native/sort.cu +167 -21
warp/native/sort.h +23 -7
warp/native/sparse.cpp +58 -28
warp/native/sparse.cu +67 -23
warp/native/spatial.h +15 -6
warp/native/svd.h +131 -6
warp/native/temp_buffer.h +15 -6
warp/native/tile.h +316 -111
warp/native/tile_reduce.h +61 -9
warp/native/vec.h +83 -13
warp/native/volume.cpp +100 -119
warp/native/volume.cu +15 -6
warp/native/volume.h +15 -6
warp/native/volume_builder.cu +40 -16
warp/native/volume_builder.h +21 -6
warp/native/volume_impl.h +15 -6
warp/native/warp.cpp +20 -12
warp/native/warp.cu +114 -16
warp/native/warp.h +34 -16
warp/optim/__init__.py +14 -6
warp/optim/adam.py +14 -6
warp/optim/linear.py +25 -10
warp/optim/sgd.py +14 -6
warp/paddle.py +14 -6
warp/render/__init__.py +14 -6
warp/render/render_opengl.py +14 -6
warp/render/render_usd.py +14 -6
warp/render/utils.py +14 -6
warp/sim/__init__.py +14 -7
warp/sim/articulation.py +18 -10
warp/sim/collide.py +35 -16
warp/sim/graph_coloring.py +14 -6
warp/sim/import_mjcf.py +463 -162
warp/sim/import_snu.py +14 -7
warp/sim/import_urdf.py +46 -18
warp/sim/import_usd.py +14 -7
warp/sim/inertia.py +14 -6
warp/sim/integrator.py +14 -6
warp/sim/integrator_euler.py +19 -11
warp/sim/integrator_featherstone.py +17 -16
warp/sim/integrator_vbd.py +222 -8
warp/sim/integrator_xpbd.py +19 -11
warp/sim/model.py +56 -19
warp/sim/particles.py +14 -6
warp/sim/render.py +14 -6
warp/sim/utils.py +17 -2
warp/sparse.py +657 -555
warp/stubs.py +231 -19
warp/tape.py +14 -6
warp/tests/aux_test_class_kernel.py +14 -6
warp/tests/aux_test_compile_consts_dummy.py +14 -6
warp/tests/aux_test_conditional_unequal_types_kernels.py +14 -6
warp/tests/aux_test_dependent.py +14 -6
warp/tests/aux_test_grad_customs.py +14 -6
warp/tests/aux_test_instancing_gc.py +14 -6
warp/tests/aux_test_module_unload.py +14 -6
warp/tests/aux_test_name_clash1.py +14 -6
warp/tests/aux_test_name_clash2.py +14 -6
warp/tests/aux_test_unresolved_func.py +14 -6
warp/tests/aux_test_unresolved_symbol.py +14 -6
warp/tests/cuda/__init__.py +0 -0
warp/tests/{test_async.py → cuda/test_async.py} +14 -6
warp/tests/{test_ipc.py → cuda/test_ipc.py} +14 -6
warp/tests/{test_mempool.py → cuda/test_mempool.py} +53 -6
warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +14 -6
warp/tests/{test_peer.py → cuda/test_peer.py} +14 -6
warp/tests/{test_pinned.py → cuda/test_pinned.py} +14 -6
warp/tests/{test_streams.py → cuda/test_streams.py} +85 -6
warp/tests/geometry/__init__.py +0 -0
warp/tests/{test_bvh.py → geometry/test_bvh.py} +14 -6
warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +14 -6
warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +14 -6
warp/tests/{test_mesh.py → geometry/test_mesh.py} +14 -6
warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +14 -6
warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +80 -69
warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +15 -7
warp/tests/{test_volume.py → geometry/test_volume.py} +55 -12
warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +14 -6
warp/tests/interop/__init__.py +0 -0
warp/tests/{test_dlpack.py → interop/test_dlpack.py} +42 -11
warp/tests/{test_jax.py → interop/test_jax.py} +14 -6
warp/tests/{test_paddle.py → interop/test_paddle.py} +14 -6
warp/tests/{test_torch.py → interop/test_torch.py} +14 -6
warp/tests/run_coverage_serial.py +14 -6
warp/tests/sim/__init__.py +0 -0
warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +23 -16
warp/tests/{flaky_test_sim_grad.py → sim/flaky_test_sim_grad.py} +14 -6
warp/tests/{test_collision.py → sim/test_collision.py} +16 -8
warp/tests/{test_coloring.py → sim/test_coloring.py} +14 -7
warp/tests/{test_model.py → sim/test_model.py} +55 -7
warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +14 -6
warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +16 -7
warp/tests/sim/test_vbd.py +597 -0
warp/tests/test_adam.py +14 -6
warp/tests/test_arithmetic.py +14 -6
warp/tests/test_array.py +14 -6
warp/tests/test_array_reduce.py +14 -6
warp/tests/test_assert.py +14 -6
warp/tests/test_atomic.py +14 -6
warp/tests/test_bool.py +15 -7
warp/tests/test_builtins_resolution.py +14 -6
warp/tests/test_closest_point_edge_edge.py +14 -6
warp/tests/test_codegen.py +14 -6
warp/tests/test_codegen_instancing.py +14 -6
warp/tests/test_compile_consts.py +14 -6
warp/tests/test_conditional.py +14 -6
warp/tests/test_context.py +14 -6
warp/tests/test_copy.py +14 -6
warp/tests/test_ctypes.py +14 -6
warp/tests/test_dense.py +14 -6
warp/tests/test_devices.py +14 -6
warp/tests/test_examples.py +42 -42
warp/tests/test_fabricarray.py +14 -6
warp/tests/test_fast_math.py +14 -6
warp/tests/test_fem.py +37 -10
warp/tests/test_fp16.py +14 -6
warp/tests/test_func.py +14 -6
warp/tests/test_future_annotations.py +14 -6
warp/tests/test_generics.py +14 -6
warp/tests/test_grad.py +14 -6
warp/tests/test_grad_customs.py +14 -6
warp/tests/test_grad_debug.py +14 -6
warp/tests/test_implicit_init.py +14 -6
warp/tests/test_import.py +14 -6
warp/tests/test_indexedarray.py +14 -6
warp/tests/test_intersect.py +14 -6
warp/tests/test_iter.py +14 -6
warp/tests/test_large.py +14 -6
warp/tests/test_launch.py +14 -6
warp/tests/test_lerp.py +14 -6
warp/tests/test_linear_solvers.py +15 -11
warp/tests/test_lvalue.py +14 -6
warp/tests/test_mat.py +247 -85
warp/tests/test_mat_lite.py +14 -6
warp/tests/test_mat_scalar_ops.py +18 -10
warp/tests/test_math.py +14 -6
warp/tests/test_mlp.py +14 -6
warp/tests/test_module_hashing.py +14 -6
warp/tests/test_modules_lite.py +14 -6
warp/tests/test_noise.py +14 -6
warp/tests/test_operators.py +14 -6
warp/tests/test_options.py +14 -6
warp/tests/test_overwrite.py +15 -60
warp/tests/test_print.py +14 -6
warp/tests/test_quat.py +81 -52
warp/tests/test_rand.py +58 -43
warp/tests/test_reload.py +14 -6
warp/tests/test_rounding.py +14 -6
warp/tests/test_runlength_encode.py +14 -6
warp/tests/test_scalar_ops.py +14 -6
warp/tests/test_smoothstep.py +14 -6
warp/tests/test_snippet.py +15 -0
warp/tests/test_sparse.py +61 -12
warp/tests/test_spatial.py +89 -6
warp/tests/test_special_values.py +14 -6
warp/tests/test_static.py +15 -7
warp/tests/test_struct.py +14 -6
warp/tests/test_tape.py +14 -6
warp/tests/test_transient_module.py +14 -6
warp/tests/test_triangle_closest_point.py +14 -6
warp/tests/test_types.py +14 -6
warp/tests/test_utils.py +98 -10
warp/tests/test_vec.py +60 -40
warp/tests/test_vec_lite.py +14 -6
warp/tests/test_vec_scalar_ops.py +14 -6
warp/tests/test_verify_fp.py +14 -6
warp/tests/tile/__init__.py +0 -0
warp/tests/{test_tile.py → tile/test_tile.py} +150 -57
warp/tests/{test_tile_load.py → tile/test_tile_load.py} +15 -7
warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +23 -12
warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +39 -20
warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +74 -7
warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +14 -6
warp/tests/{test_tile_view.py → tile/test_tile_view.py} +15 -7
warp/tests/unittest_serial.py +15 -6
warp/tests/unittest_suites.py +59 -65
warp/tests/unittest_utils.py +16 -7
warp/tests/walkthrough_debug.py +14 -6
warp/thirdparty/unittest_parallel.py +15 -8
warp/torch.py +14 -6
warp/types.py +124 -664
warp/utils.py +151 -78
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/METADATA +39 -12
warp_lang-1.7.0.dist-info/RECORD +429 -0
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/WHEEL +1 -1
warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
warp/examples/optim/example_walker.py +0 -309
warp/native/cutlass_gemm.cpp +0 -34
warp/native/cutlass_gemm.cu +0 -373
warp/tests/test_matmul.py +0 -503
warp/tests/test_matmul_lite.py +0 -403
warp/tests/test_vbd.py +0 -378
warp/tests/unused_test_misc.py +0 -69
warp_lang-1.6.1.dist-info/LICENSE.md +0 -126
warp_lang-1.6.1.dist-info/RECORD +0 -419
{warp_lang-1.6.1.dist-info → warp_lang-1.7.0.dist-info}/top_level.txt +0 -0

warp/native/sort.cpp CHANGED Viewed

@@ -1,9 +1,18 @@
-/** Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #include "warp.h"
@@ -12,69 +21,75 @@
 #include <cstdint>
-void radix_sort_pairs_host(int* keys, int* values, int n)
+//Only integer keys (bit count 32 or 64) are supported. Floats need to get converted into int first. see radix_float_to_int.
+template <typename KeyType>
+void radix_sort_pairs_host(KeyType* keys, int* values, int n, int offset_to_scratch_memory)
 {
-	static int tables[2][1 << 16];
+	const int numPasses = sizeof(KeyType) / 2;
+	static int tables[numPasses][1 << 16];
 	memset(tables, 0, sizeof(tables));
-	int* auxKeys = keys + n;
-	int* auxValues = values + n;
 	// build histograms
-	for (int i=0; i < n; ++i)
-	{
-		const unsigned short low = keys[i] & 0xffff;
-		const unsigned short high = keys[i] >> 16;
-		++tables[0][low];
-		++tables[1][high];
+	for (int p = 0; p < numPasses; ++p)
+    {
+		for (int i=0; i < n; ++i)
+		{
+			const int shift = p * 16;
+			const int b = (keys[i] >> shift) & 0xffff;
+			++tables[p][b];
+		}
 	}
-	// convert histograms to offset tables in-place
-	int offlow = 0;
-	int offhigh = 0;
-	for (int i=0; i < 65536; ++i)
+	// convert histograms to offset tables in-place
+	for (int p = 0; p < numPasses; ++p)
 	{
-		const int newofflow = offlow + tables[0][i];
-		const int newoffhigh = offhigh + tables[1][i];
-		tables[0][i] = offlow;
-		tables[1][i] = offhigh;
-		offlow = newofflow;
-		offhigh = newoffhigh;
+		int off = 0;
+		for (int i = 0; i < 65536; ++i)
+		{
+			const int newoff = off + tables[p][i];
+			tables[p][i] = off;
+			off = newoff;
+		}
 	}
-	// pass 1 - sort by low 16 bits
-	for (int i=0; i < n; ++i)
-	{
-		// lookup offset of input
-		const int k = keys[i];
-		const int v = values[i];
-		const int b = k & 0xffff;
-		// find offset and increment
-		const int offset = tables[0][b]++;
-		auxKeys[offset] = k;
-		auxValues[offset] = v;
-	}
-	// pass 2 - sort by high 16 bits
-	for (int i=0; i < n; ++i)
-	{
-		// lookup offset of input
-		const int k = auxKeys[i];
-		const int v = auxValues[i];
+    for (int p = 0; p < numPasses; ++p)
+    {
+		int flipFlop = p % 2;
+		KeyType* readKeys = keys + offset_to_scratch_memory * flipFlop;
+		int* readValues = values + offset_to_scratch_memory * flipFlop;
+		KeyType* writeKeys = keys + offset_to_scratch_memory * (1 - flipFlop);
+		int* writeValues = values + offset_to_scratch_memory * (1 - flipFlop);
+		// pass 1 - sort by low 16 bits
+		for (int i=0; i < n; ++i)
+		{
+			// lookup offset of input
+			const KeyType k = readKeys[i];
+			const int v = readValues[i];
+			const int shift = p * 16;
+			const int b = (k >> shift) & 0xffff;
+			// find offset and increment
+			const int offset = tables[p][b]++;
+			writeKeys[offset] = k;
+			writeValues[offset] = v;
+		}
+	}
+}
-		const int b = k >> 16;
-		const int offset = tables[1][b]++;
-		keys[offset] = k;
-		values[offset] = v;
-	}
+void radix_sort_pairs_host(int* keys, int* values, int n)
+{
+	radix_sort_pairs_host<int>(keys, values, n, n);
+}
+void radix_sort_pairs_host(int64_t* keys, int* values, int n)
+{
+	radix_sort_pairs_host<int64_t>(keys, values, n, n);
 }
  //http://stereopsis.com/radix.html
@@ -85,13 +100,13 @@ inline unsigned int radix_float_to_int(float f)
 	return i ^ mask;
 }
-void radix_sort_pairs_host(float* keys, int* values, int n)
+void radix_sort_pairs_host(float* keys, int* values, int n, int offset_to_scratch_memory)
 {
 	static unsigned int tables[2][1 << 16];
 	memset(tables, 0, sizeof(tables));
-	float* auxKeys = keys + n;
-	int* auxValues = values + n;
+	float* auxKeys = keys + offset_to_scratch_memory;
+	int* auxValues = values + offset_to_scratch_memory;
 	// build histograms
 	for (int i=0; i < n; ++i)
@@ -153,14 +168,46 @@ void radix_sort_pairs_host(float* keys, int* values, int n)
 	}
 }
+void radix_sort_pairs_host(float* keys, int* values, int n)
+{
+	radix_sort_pairs_host(keys, values, n, n);
+}
+void segmented_sort_pairs_host(float* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments)
+{
+	for (int i = 0; i < num_segments; ++i)
+	{
+		const int start = segment_start_indices[i];
+		const int end = segment_end_indices[i];
+		radix_sort_pairs_host(keys + start, values + start, end - start, n);
+	}
+}
+void segmented_sort_pairs_host(int* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments)
+{
+	for (int i = 0; i < num_segments; ++i)
+	{
+		const int start = segment_start_indices[i];
+		const int end = segment_end_indices[i];
+		radix_sort_pairs_host(keys + start, values + start, end - start, n);
+	}
+}
 #if !WP_ENABLE_CUDA
 void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out) {}
 void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n) {}
+void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n) {}
 void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n) {}
+void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
+void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments) {}
 #endif // !WP_ENABLE_CUDA
@@ -171,9 +218,34 @@ void radix_sort_pairs_int_host(uint64_t keys, uint64_t values, int n)
         reinterpret_cast<int *>(values), n);
 }
+void radix_sort_pairs_int64_host(uint64_t keys, uint64_t values, int n)
+{
+    radix_sort_pairs_host(
+        reinterpret_cast<int64_t *>(keys),
+        reinterpret_cast<int *>(values), n);
+}
 void radix_sort_pairs_float_host(uint64_t keys, uint64_t values, int n)
 {
     radix_sort_pairs_host(
         reinterpret_cast<float *>(keys),
         reinterpret_cast<int *>(values), n);
-}
+}
+void segmented_sort_pairs_float_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
+{
+    segmented_sort_pairs_host(
+        reinterpret_cast<float *>(keys),
+        reinterpret_cast<int *>(values), n,
+        reinterpret_cast<int *>(segment_start_indices),
+        reinterpret_cast<int *>(segment_end_indices), num_segments);
+}
+void segmented_sort_pairs_int_host(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
+{
+    segmented_sort_pairs_host(
+        reinterpret_cast<int *>(keys),
+        reinterpret_cast<int *>(values), n,
+        reinterpret_cast<int *>(segment_start_indices),
+        reinterpret_cast<int *>(segment_end_indices), num_segments);
+}

warp/native/sort.cu CHANGED Viewed

@@ -1,9 +1,18 @@
-/** Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #include "warp.h"
@@ -27,11 +36,12 @@ struct RadixSortTemp
 static std::map<void*, RadixSortTemp> g_radix_sort_temp_map;
-void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
+template <typename KeyType>
+void radix_sort_reserve_internal(void* context, int n, void** mem_out, size_t* size_out)
 {
     ContextGuard guard(context);
-    cub::DoubleBuffer<int> d_keys;
+    cub::DoubleBuffer<KeyType> d_keys;
 	cub::DoubleBuffer<int> d_values;
     // compute temporary memory required
@@ -41,7 +51,7 @@ void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
         sort_temp_size,
         d_keys,
         d_values,
-        n, 0, 32,
+        n, 0, sizeof(KeyType)*8,
         (cudaStream_t)cuda_stream_get_current()));
     if (!context)
@@ -62,15 +72,21 @@ void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
         *size_out = temp.size;
 }
-void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
+void radix_sort_reserve(void* context, int n, void** mem_out, size_t* size_out)
+{
+    radix_sort_reserve_internal<int>(context, n, mem_out, size_out);
+}
+template <typename KeyType>
+void radix_sort_pairs_device(void* context, KeyType* keys, int* values, int n)
 {
     ContextGuard guard(context);
-    cub::DoubleBuffer<int> d_keys(keys, keys + n);
+    cub::DoubleBuffer<KeyType> d_keys(keys, keys + n);
 	cub::DoubleBuffer<int> d_values(values, values + n);
     RadixSortTemp temp;
-    radix_sort_reserve(WP_CURRENT_CONTEXT, n, &temp.mem, &temp.size);
+    radix_sort_reserve_internal<KeyType>(WP_CURRENT_CONTEXT, n, &temp.mem, &temp.size);
     // sort
     check_cuda(cub::DeviceRadixSort::SortPairs(
@@ -78,16 +94,31 @@ void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
         temp.size,
         d_keys,
         d_values,
-        n, 0, 32,
+        n, 0, sizeof(KeyType)*8,
         (cudaStream_t)cuda_stream_get_current()));
 	if (d_keys.Current() != keys)
-		memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(int)*n);
+		memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(KeyType)*n);
 	if (d_values.Current() != values)
 		memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
 }
+void radix_sort_pairs_device(void* context, int* keys, int* values, int n)
+{
+    radix_sort_pairs_device<int>(context, keys, values, n);
+}
+void radix_sort_pairs_device(void* context, float* keys, int* values, int n)
+{
+    radix_sort_pairs_device<float>(context, keys, values, n);
+}
+void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n)
+{
+    radix_sort_pairs_device<int64_t>(context, keys, values, n);
+}
 void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
 {
     radix_sort_pairs_device(
@@ -96,7 +127,69 @@ void radix_sort_pairs_int_device(uint64_t keys, uint64_t values, int n)
         reinterpret_cast<int *>(values), n);
 }
-void radix_sort_pairs_device(void* context, float* keys, int* values, int n)
+void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
+{
+    radix_sort_pairs_device(
+        WP_CURRENT_CONTEXT,
+        reinterpret_cast<float *>(keys),
+        reinterpret_cast<int *>(values), n);
+}
+void radix_sort_pairs_int64_device(uint64_t keys, uint64_t values, int n)
+{
+    radix_sort_pairs_device(
+        WP_CURRENT_CONTEXT,
+        reinterpret_cast<int64_t *>(keys),
+        reinterpret_cast<int *>(values), n);
+}
+void segmented_sort_reserve(void* context, int n, int num_segments, void** mem_out, size_t* size_out)
+{
+    ContextGuard guard(context);
+    cub::DoubleBuffer<int> d_keys;
+	cub::DoubleBuffer<int> d_values;
+    int* start_indices = NULL;
+    int* end_indices = NULL;
+    // compute temporary memory required
+	size_t sort_temp_size;
+    check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
+        NULL,
+        sort_temp_size,
+        d_keys,
+        d_values,
+        n,
+        num_segments,
+        start_indices,
+        end_indices,
+        0,
+        32,
+        (cudaStream_t)cuda_stream_get_current()));
+    if (!context)
+        context = cuda_context_get_current();
+    RadixSortTemp& temp = g_radix_sort_temp_map[context];
+    if (sort_temp_size > temp.size)
+    {
+	    free_device(WP_CURRENT_CONTEXT, temp.mem);
+        temp.mem = alloc_device(WP_CURRENT_CONTEXT, sort_temp_size);
+        temp.size = sort_temp_size;
+    }
+    if (mem_out)
+        *mem_out = temp.mem;
+    if (size_out)
+        *size_out = temp.size;
+}
+// segment_start_indices and segment_end_indices are arrays of length num_segments, where segment_start_indices[i] is the index of the first element
+// in the i-th segment and segment_end_indices[i] is the index after the last element in the i-th segment
+// https://nvidia.github.io/cccl/cub/api/structcub_1_1DeviceSegmentedRadixSort.html
+void segmented_sort_pairs_device(void* context, float* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments)
 {
     ContextGuard guard(context);
@@ -104,15 +197,20 @@ void radix_sort_pairs_device(void* context, float* keys, int* values, int n)
 	cub::DoubleBuffer<int> d_values(values, values + n);
     RadixSortTemp temp;
-    radix_sort_reserve(WP_CURRENT_CONTEXT, n, &temp.mem, &temp.size);
+    segmented_sort_reserve(WP_CURRENT_CONTEXT, n, num_segments, &temp.mem, &temp.size);
     // sort
-    check_cuda(cub::DeviceRadixSort::SortPairs(
+    check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
         temp.mem,
         temp.size,
         d_keys,
         d_values,
-        n, 0, 32,
+        n,
+        num_segments,
+        segment_start_indices,
+        segment_end_indices,
+        0,
+        32,
         (cudaStream_t)cuda_stream_get_current()));
 	if (d_keys.Current() != keys)
@@ -122,10 +220,58 @@ void radix_sort_pairs_device(void* context, float* keys, int* values, int n)
 		memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
 }
-void radix_sort_pairs_float_device(uint64_t keys, uint64_t values, int n)
+void segmented_sort_pairs_float_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
 {
-    radix_sort_pairs_device(
+    segmented_sort_pairs_device(
         WP_CURRENT_CONTEXT,
         reinterpret_cast<float *>(keys),
-        reinterpret_cast<int *>(values), n);
+        reinterpret_cast<int *>(values), n,
+        reinterpret_cast<int *>(segment_start_indices),
+        reinterpret_cast<int *>(segment_end_indices),
+        num_segments);
+}
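+// segment_start_indices and segment_end_indices are arrays of length num_segments, where segment_start_indices[i] is the index of the first element
+// in the i-th segment and segment_end_indices[i] is the index after the last element in the i-th segment.
+// A single offsets array of length num_segments + 1 can be used by passing it as the start indices and the same array advanced by one element as the end indices.
+// https://nvidia.github.io/cccl/cub/api/structcub_1_1DeviceSegmentedRadixSort.html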
+// Sorts (int key, int value) pairs independently within each segment using cub::DeviceSegmentedRadixSort.
+// Segment i covers the half-open range [segment_start_indices[i], segment_end_indices[i]).
+// keys and values are each wrapped in a cub::DoubleBuffer whose alternate half begins at element n,
+// so both arrays must provide storage for at least 2*n elements; the sorted result ends up in the first n.
+void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments)
+{
+    ContextGuard guard(context);
+    // the second half of each array is the alternate (ping-pong) buffer for CUB
+    cub::DoubleBuffer<int> d_keys(keys, keys + n);
+    cub::DoubleBuffer<int> d_values(values, values + n);
+    // obtain scratch storage sized for this segmented sort
+    RadixSortTemp temp;
+    segmented_sort_reserve(WP_CURRENT_CONTEXT, n, num_segments, &temp.mem, &temp.size);
+    // sort over key bits [0, 32), i.e. the whole 32-bit key
+    check_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
+        temp.mem,
+        temp.size,
+        d_keys,
+        d_values,
+        n,
+        num_segments,
+        segment_start_indices,
+        segment_end_indices,
+        0,
+        32,
+        (cudaStream_t)cuda_stream_get_current()));
+    // CUB may leave the result in the alternate half; copy it back to the front of each array.
+    // Keys are int in this overload, so size the copy by sizeof(int) rather than sizeof(float).
+    if (d_keys.Current() != keys)
+        memcpy_d2d(WP_CURRENT_CONTEXT, keys, d_keys.Current(), sizeof(int)*n);
+    if (d_values.Current() != values)
+        memcpy_d2d(WP_CURRENT_CONTEXT, values, d_values.Current(), sizeof(int)*n);
+}
+// Entry point taking integer-encoded pointers: reinterprets each uint64_t as an int pointer
+// and forwards to the int-key overload of segmented_sort_pairs_device on the current context.
+void segmented_sort_pairs_int_device(uint64_t keys, uint64_t values, int n, uint64_t segment_start_indices, uint64_t segment_end_indices, int num_segments)
+{
+    int* key_ptr = reinterpret_cast<int *>(keys);
+    int* value_ptr = reinterpret_cast<int *>(values);
+    int* segment_begin_ptr = reinterpret_cast<int *>(segment_start_indices);
+    int* segment_end_ptr = reinterpret_cast<int *>(segment_end_indices);
+    segmented_sort_pairs_device(WP_CURRENT_CONTEXT, key_ptr, value_ptr, n, segment_begin_ptr, segment_end_ptr, num_segments);
 }

warp/native/sort.h CHANGED Viewed

@@ -1,9 +1,18 @@
-/** Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 #pragma once
@@ -13,5 +22,12 @@
 void radix_sort_reserve(void* context, int n, void** mem_out=NULL, size_t* size_out=NULL);
 void radix_sort_pairs_host(int* keys, int* values, int n);
 void radix_sort_pairs_host(float* keys, int* values, int n);
+void radix_sort_pairs_host(int64_t* keys, int* values, int n);
 void radix_sort_pairs_device(void* context, int* keys, int* values, int n);
-void radix_sort_pairs_device(void* context, float* keys, int* values, int n);
+void radix_sort_pairs_device(void* context, float* keys, int* values, int n);
+void radix_sort_pairs_device(void* context, int64_t* keys, int* values, int n);
+void segmented_sort_pairs_host(float* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments);
+void segmented_sort_pairs_device(void* context, float* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments);
+void segmented_sort_pairs_host(void* context, int* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments);
+void segmented_sort_pairs_device(void* context, int* keys, int* values, int n, int* segment_start_indices, int* segment_end_indices, int num_segments);