cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +53 -0
- cuda/cccl/headers/include/cub/cub.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +800 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
- cuda/cccl/headers/include/cub/util_math.cuh +118 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
- cuda/cccl/headers/include/cuda/__cccl_config +37 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +27 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +267 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +37 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +35 -0
- cuda/cccl/headers/include/cuda/memory_resource +35 -0
- cuda/cccl/headers/include/cuda/numeric +29 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +129 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +518 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +154 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +346 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +628 -0
- cuda/cccl/headers/include/cuda/std/string_view +925 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +177 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +31 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +57 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +858 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +395 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +99 -0
- cuda/cccl/headers/include/thrust/partition.h +1391 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1113 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1088 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +79 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +475 -0
- cuda/compute/_bindings_impl.pyx +2273 -0
- cuda/compute/_caching.py +71 -0
- cuda/compute/_cccl_interop.py +422 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +132 -0
- cuda/compute/_utils/temp_storage_buffer.py +86 -0
- cuda/compute/algorithms/__init__.py +54 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_merge_sort.py +225 -0
- cuda/compute/algorithms/_radix_sort.py +312 -0
- cuda/compute/algorithms/_reduce.py +182 -0
- cuda/compute/algorithms/_scan.py +331 -0
- cuda/compute/algorithms/_segmented_reduce.py +257 -0
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/compute/algorithms/_transform.py +329 -0
- cuda/compute/algorithms/_unique_by_key.py +252 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/iterators/__init__.py +21 -0
- cuda/compute/iterators/_factories.py +219 -0
- cuda/compute/iterators/_iterators.py +817 -0
- cuda/compute/iterators/_zip_iterator.py +199 -0
- cuda/compute/numba_utils.py +53 -0
- cuda/compute/op.py +3 -0
- cuda/compute/struct.py +272 -0
- cuda/compute/typing.py +37 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.3.3.dist-info/METADATA +41 -0
- cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
- cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,3437 @@
|
|
|
1
|
+
/******************************************************************************
|
|
2
|
+
* Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
3
|
+
* Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* Redistribution and use in source and binary forms, with or without
|
|
6
|
+
* modification, are permitted provided that the following conditions are met:
|
|
7
|
+
* * Redistributions of source code must retain the above copyright
|
|
8
|
+
* notice, this list of conditions and the following disclaimer.
|
|
9
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
+
* documentation and/or other materials provided with the distribution.
|
|
12
|
+
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
+
* names of its contributors may be used to endorse or promote products
|
|
14
|
+
* derived from this software without specific prior written permission.
|
|
15
|
+
*
|
|
16
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
17
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
19
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
*
|
|
27
|
+
******************************************************************************/
|
|
28
|
+
|
|
29
|
+
//! @file
|
|
30
|
+
//! cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data
|
|
31
|
+
//! items residing within device-accessible memory.
|
|
32
|
+
|
|
33
|
+
#pragma once
|
|
34
|
+
|
|
35
|
+
#include <cub/config.cuh>
|
|
36
|
+
|
|
37
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
38
|
+
# pragma GCC system_header
|
|
39
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
40
|
+
# pragma clang system_header
|
|
41
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
42
|
+
# pragma system_header
|
|
43
|
+
#endif // no system header
|
|
44
|
+
|
|
45
|
+
#include <cub/detail/choose_offset.cuh>
|
|
46
|
+
#include <cub/device/dispatch/dispatch_radix_sort.cuh>
|
|
47
|
+
|
|
48
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
49
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
50
|
+
#include <cuda/std/__type_traits/is_convertible.h>
|
|
51
|
+
|
|
52
|
+
CUB_NAMESPACE_BEGIN
|
|
53
|
+
|
|
54
|
+
//! @rst
|
|
55
|
+
//! DeviceRadixSort provides device-wide, parallel operations for
|
|
56
|
+
//! computing a radix sort across a sequence of data items residing
|
|
57
|
+
//! within device-accessible memory.
|
|
58
|
+
//!
|
|
59
|
+
//! .. image:: ../../img/sorting_logo.png
|
|
60
|
+
//! :align: center
|
|
61
|
+
//!
|
|
62
|
+
//! Overview
|
|
63
|
+
//! --------------------------------------------------
|
|
64
|
+
//!
|
|
65
|
+
//! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_
|
|
66
|
+
//! arranges items into ascending (or descending) order. The algorithm relies
|
|
67
|
+
//! upon a positional representation for keys, i.e., each key is comprised of an
|
|
68
|
+
//! ordered sequence of symbols (e.g., digits, characters, etc.) specified from
|
|
69
|
+
//! least-significant to most-significant. For a given input sequence of keys
|
|
70
|
+
//! and a set of rules specifying a total ordering of the symbolic alphabet, the
|
|
71
|
+
//! radix sorting method produces a lexicographic ordering of those keys.
|
|
72
|
+
//!
|
|
73
|
+
//! @rowmajor
|
|
74
|
+
//!
|
|
75
|
+
//! Supported Types
|
|
76
|
+
//! --------------------------------------------------
|
|
77
|
+
//!
|
|
78
|
+
//! DeviceRadixSort can sort all of the built-in C++ numeric primitive types
|
|
79
|
+
//! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half``
|
|
80
|
+
//! and ``__nv_bfloat16`` 16-bit floating-point types. User-defined types are
|
|
81
|
+
//! supported as long as a decomposer object is provided.
|
|
82
|
+
//!
|
|
83
|
+
//! Floating-Point Special Cases
|
|
84
|
+
//! --------------------------------------------------
|
|
85
|
+
//!
|
|
86
|
+
//! - Positive and negative zeros are considered equivalent, and will be treated
|
|
87
|
+
//! as such in the output.
|
|
88
|
+
//! - No special handling is implemented for NaN values; these are sorted
|
|
89
|
+
//! according to their bit representations after any transformations.
|
|
90
|
+
//!
|
|
91
|
+
//! Transformations
|
|
92
|
+
//! --------------------------------------------------
|
|
93
|
+
//!
|
|
94
|
+
//! Although the direct radix sorting method can only be applied to unsigned
|
|
95
|
+
//! integral types, DeviceRadixSort is able to sort signed and floating-point
|
|
96
|
+
//! types via simple bit-wise transformations that ensure lexicographic key
|
|
97
|
+
//! ordering. Additional transformations occur for descending sorts. These
|
|
98
|
+
//! transformations must be considered when restricting the
|
|
99
|
+
//! ``[begin_bit, end_bit)`` range, as the bitwise transformations will occur
|
|
100
|
+
//! before the bit-range truncation.
|
|
101
|
+
//!
|
|
102
|
+
//! Any transformations applied to the keys prior to sorting are reversed
|
|
103
|
+
//! while writing to the final output buffer.
|
|
104
|
+
//!
|
|
105
|
+
//! Type Specific Bitwise Transformations
|
|
106
|
+
//! --------------------------------------------------
|
|
107
|
+
//!
|
|
108
|
+
//! To convert the input values into a radix-sortable bitwise representation,
|
|
109
|
+
//! the following transformations take place prior to sorting:
|
|
110
|
+
//!
|
|
111
|
+
//! - For unsigned integral values, the keys are used directly.
|
|
112
|
+
//! - For signed integral values, the sign bit is inverted.
|
|
113
|
+
//! - For positive floating point values, the sign bit is inverted.
|
|
114
|
+
//! - For negative floating point values, the full key is inverted.
|
|
115
|
+
//!
|
|
116
|
+
//! For floating point types, positive and negative zero are a special case and
|
|
117
|
+
//! will be considered equivalent during sorting.
|
|
118
|
+
//!
|
|
119
|
+
//! Descending Sort Bitwise Transformations
|
|
120
|
+
//! --------------------------------------------------
|
|
121
|
+
//!
|
|
122
|
+
//! If descending sort is used, the keys are inverted after performing any
|
|
123
|
+
//! type-specific transformations, and the resulting keys are sorted in ascending
|
|
124
|
+
//! order.
|
|
125
|
+
//!
|
|
126
|
+
//! Stability
|
|
127
|
+
//! --------------------------------------------------
|
|
128
|
+
//!
|
|
129
|
+
//! DeviceRadixSort is stable. For floating-point types, ``-0.0`` and ``+0.0`` are
|
|
130
|
+
//! considered equal and appear in the result in the same order as they appear in
|
|
131
|
+
//! the input.
|
|
132
|
+
//!
|
|
133
|
+
//! Usage Considerations
|
|
134
|
+
//! --------------------------------------------------
|
|
135
|
+
//!
|
|
136
|
+
//! @cdp_class{DeviceRadixSort}
|
|
137
|
+
//!
|
|
138
|
+
//! Performance
|
|
139
|
+
//! --------------------------------------------------
|
|
140
|
+
//!
|
|
141
|
+
//! @linear_performance{radix sort}
|
|
142
|
+
//!
|
|
143
|
+
//! @endrst
|
|
144
|
+
struct DeviceRadixSort
|
|
145
|
+
{
|
|
146
|
+
private:
|
|
147
|
+
template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
148
|
+
CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
|
|
149
|
+
::cuda::std::false_type,
|
|
150
|
+
void* d_temp_storage,
|
|
151
|
+
size_t& temp_storage_bytes,
|
|
152
|
+
bool is_overwrite_okay,
|
|
153
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
154
|
+
DoubleBuffer<ValueT>& d_values,
|
|
155
|
+
NumItemsT num_items,
|
|
156
|
+
DecomposerT decomposer,
|
|
157
|
+
int begin_bit,
|
|
158
|
+
int end_bit,
|
|
159
|
+
cudaStream_t stream);
|
|
160
|
+
|
|
161
|
+
template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
|
|
162
|
+
CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
|
|
163
|
+
::cuda::std::true_type,
|
|
164
|
+
void* d_temp_storage,
|
|
165
|
+
size_t& temp_storage_bytes,
|
|
166
|
+
bool is_overwrite_okay,
|
|
167
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
168
|
+
DoubleBuffer<ValueT>& d_values,
|
|
169
|
+
OffsetT num_items,
|
|
170
|
+
DecomposerT decomposer,
|
|
171
|
+
int begin_bit,
|
|
172
|
+
int end_bit,
|
|
173
|
+
cudaStream_t stream)
|
|
174
|
+
{
|
|
175
|
+
return DispatchRadixSort<Order, KeyT, ValueT, OffsetT, DecomposerT>::Dispatch(
|
|
176
|
+
d_temp_storage,
|
|
177
|
+
temp_storage_bytes,
|
|
178
|
+
d_keys,
|
|
179
|
+
d_values,
|
|
180
|
+
static_cast<OffsetT>(num_items),
|
|
181
|
+
begin_bit,
|
|
182
|
+
end_bit,
|
|
183
|
+
is_overwrite_okay,
|
|
184
|
+
stream,
|
|
185
|
+
decomposer);
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
template <SortOrder Order, typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
189
|
+
CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
|
|
190
|
+
::cuda::std::false_type,
|
|
191
|
+
void* d_temp_storage,
|
|
192
|
+
size_t& temp_storage_bytes,
|
|
193
|
+
bool is_overwrite_okay,
|
|
194
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
195
|
+
DoubleBuffer<ValueT>& d_values,
|
|
196
|
+
NumItemsT num_items,
|
|
197
|
+
DecomposerT decomposer,
|
|
198
|
+
cudaStream_t stream);
|
|
199
|
+
|
|
200
|
+
template <SortOrder Order, typename KeyT, typename ValueT, typename OffsetT, typename DecomposerT>
|
|
201
|
+
CUB_RUNTIME_FUNCTION static cudaError_t custom_radix_sort(
|
|
202
|
+
::cuda::std::true_type,
|
|
203
|
+
void* d_temp_storage,
|
|
204
|
+
size_t& temp_storage_bytes,
|
|
205
|
+
bool is_overwrite_okay,
|
|
206
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
207
|
+
DoubleBuffer<ValueT>& d_values,
|
|
208
|
+
OffsetT num_items,
|
|
209
|
+
DecomposerT decomposer,
|
|
210
|
+
cudaStream_t stream)
|
|
211
|
+
{
|
|
212
|
+
constexpr int begin_bit = 0;
|
|
213
|
+
const int end_bit = detail::radix::traits_t<KeyT>::default_end_bit(decomposer);
|
|
214
|
+
|
|
215
|
+
return DeviceRadixSort::custom_radix_sort<Order>(
|
|
216
|
+
::cuda::std::true_type{},
|
|
217
|
+
d_temp_storage,
|
|
218
|
+
temp_storage_bytes,
|
|
219
|
+
is_overwrite_okay,
|
|
220
|
+
d_keys,
|
|
221
|
+
d_values,
|
|
222
|
+
num_items,
|
|
223
|
+
decomposer,
|
|
224
|
+
begin_bit,
|
|
225
|
+
end_bit,
|
|
226
|
+
stream);
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// Name reported for NVTX ranges
|
|
230
|
+
_CCCL_HOST_DEVICE static constexpr auto GetName() -> const char*
|
|
231
|
+
{
|
|
232
|
+
return "cub::DeviceRadixSort";
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
public:
|
|
236
|
+
//! @name KeyT-value pairs
|
|
237
|
+
//! @{
|
|
238
|
+
|
|
239
|
+
//! @rst
|
|
240
|
+
//! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
|
|
241
|
+
//!
|
|
242
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
243
|
+
//! - Pointers to contiguous memory must be used; iterators are not currently
|
|
244
|
+
//! supported.
|
|
245
|
+
//! - In-place operations are not supported. There must be no overlap between
|
|
246
|
+
//! any of the provided ranges:
|
|
247
|
+
//!
|
|
248
|
+
//! - ``[d_keys_in, d_keys_in + num_items)``
|
|
249
|
+
//! - ``[d_keys_out, d_keys_out + num_items)``
|
|
250
|
+
//! - ``[d_values_in, d_values_in + num_items)``
|
|
251
|
+
//! - ``[d_values_out, d_values_out + num_items)``
|
|
252
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
253
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
254
|
+
//! yield a corresponding performance improvement.
|
|
255
|
+
//! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
|
|
256
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
257
|
+
//! - @devicestorage
|
|
258
|
+
//!
|
|
259
|
+
//! Snippet
|
|
260
|
+
//! --------------------------------------------------
|
|
261
|
+
//!
|
|
262
|
+
//! The code snippet below illustrates the sorting of a device vector of ``int``
|
|
263
|
+
//! keys with associated vector of ``int`` values.
|
|
264
|
+
//! @endrst
|
|
265
|
+
//!
|
|
266
|
+
//! @code{.cpp}
|
|
267
|
+
//! #include <cub/cub.cuh>
|
|
268
|
+
//! // or equivalently <cub/device/device_radix_sort.cuh>
|
|
269
|
+
//!
|
|
270
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
271
|
+
//! // for sorting data
|
|
272
|
+
//! int num_items; // e.g., 7
|
|
273
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
274
|
+
//! int *d_keys_out; // e.g., [ ... ]
|
|
275
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
276
|
+
//! int *d_values_out; // e.g., [ ... ]
|
|
277
|
+
//! ...
|
|
278
|
+
//!
|
|
279
|
+
//! // Determine temporary device storage requirements
|
|
280
|
+
//! void *d_temp_storage = nullptr;
|
|
281
|
+
//! size_t temp_storage_bytes = 0;
|
|
282
|
+
//! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
|
|
283
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
|
|
284
|
+
//!
|
|
285
|
+
//! // Allocate temporary storage
|
|
286
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
287
|
+
//!
|
|
288
|
+
//! // Run sorting operation
|
|
289
|
+
//! cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
|
|
290
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
|
|
291
|
+
//!
|
|
292
|
+
//! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
|
|
293
|
+
//! // d_values_out <-- [5, 4, 3, 1, 2, 0, 6]
|
|
294
|
+
//! @endcode
|
|
295
|
+
//!
|
|
296
|
+
//! @tparam KeyT
|
|
297
|
+
//! **[inferred]** KeyT type
|
|
298
|
+
//!
|
|
299
|
+
//! @tparam ValueT
|
|
300
|
+
//! **[inferred]** ValueT type
|
|
301
|
+
//!
|
|
302
|
+
//! @tparam NumItemsT
|
|
303
|
+
//! **[inferred]** Type of num_items
|
|
304
|
+
//!
|
|
305
|
+
//! @param[in] d_temp_storage
|
|
306
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
307
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
308
|
+
//! is done.
|
|
309
|
+
//!
|
|
310
|
+
//! @param[in,out] temp_storage_bytes
|
|
311
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
312
|
+
//!
|
|
313
|
+
//! @param[in] d_keys_in
|
|
314
|
+
//! Pointer to the input data of key data to sort
|
|
315
|
+
//!
|
|
316
|
+
//! @param[out] d_keys_out
|
|
317
|
+
//! Pointer to the sorted output sequence of key data
|
|
318
|
+
//!
|
|
319
|
+
//! @param[in] d_values_in
|
|
320
|
+
//! Pointer to the corresponding input sequence of associated value items
|
|
321
|
+
//!
|
|
322
|
+
//! @param[out] d_values_out
|
|
323
|
+
//! Pointer to the correspondingly-reordered output sequence of associated
|
|
324
|
+
//! value items
|
|
325
|
+
//!
|
|
326
|
+
//! @param[in] num_items
|
|
327
|
+
//! Number of items to sort
|
|
328
|
+
//!
|
|
329
|
+
//! @param[in] begin_bit
|
|
330
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
331
|
+
//! key comparison
|
|
332
|
+
//!
|
|
333
|
+
//! @param[in] end_bit
|
|
334
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
335
|
+
//! comparison (e.g., ``sizeof(unsigned int) * 8``)
|
|
336
|
+
//!
|
|
337
|
+
//! @param[in] stream
|
|
338
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
339
|
+
//! Default is stream<sub>0</sub>.
|
|
340
|
+
template <typename KeyT, typename ValueT, typename NumItemsT>
|
|
341
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
|
|
342
|
+
void* d_temp_storage,
|
|
343
|
+
size_t& temp_storage_bytes,
|
|
344
|
+
const KeyT* d_keys_in,
|
|
345
|
+
KeyT* d_keys_out,
|
|
346
|
+
const ValueT* d_values_in,
|
|
347
|
+
ValueT* d_values_out,
|
|
348
|
+
NumItemsT num_items,
|
|
349
|
+
int begin_bit = 0,
|
|
350
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
351
|
+
cudaStream_t stream = 0)
|
|
352
|
+
{
|
|
353
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
354
|
+
// Unsigned integer type for global offsets.
|
|
355
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
356
|
+
|
|
357
|
+
// TODO API that doesn't accept decomposer should also contain a static
|
|
358
|
+
// assert that the key type is fundamental.
|
|
359
|
+
|
|
360
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
361
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
362
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
363
|
+
// is not set.
|
|
364
|
+
constexpr bool is_overwrite_okay = false;
|
|
365
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
366
|
+
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
|
|
367
|
+
|
|
368
|
+
return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch(
|
|
369
|
+
d_temp_storage,
|
|
370
|
+
temp_storage_bytes,
|
|
371
|
+
d_keys,
|
|
372
|
+
d_values,
|
|
373
|
+
static_cast<OffsetT>(num_items),
|
|
374
|
+
begin_bit,
|
|
375
|
+
end_bit,
|
|
376
|
+
is_overwrite_okay,
|
|
377
|
+
stream);
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
//! @rst
|
|
381
|
+
//! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
|
|
382
|
+
//!
|
|
383
|
+
//! * The contents of the input data are not altered by the sorting operation.
|
|
384
|
+
//! * Pointers to contiguous memory must be used; iterators are not currently
|
|
385
|
+
//! supported.
|
|
386
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
387
|
+
//! any of the provided ranges:
|
|
388
|
+
//!
|
|
389
|
+
//! * ``[d_keys_in, d_keys_in + num_items)``
|
|
390
|
+
//! * ``[d_keys_out, d_keys_out + num_items)``
|
|
391
|
+
//! * ``[d_values_in, d_values_in + num_items)``
|
|
392
|
+
//! * ``[d_values_out, d_values_out + num_items)``
|
|
393
|
+
//!
|
|
394
|
+
//! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
|
|
395
|
+
//! differentiating key bits. This can reduce overall sorting overhead and
|
|
396
|
+
//! yield a corresponding performance improvement.
|
|
397
|
+
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
|
|
398
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
399
|
+
//! * @devicestorage
|
|
400
|
+
//!
|
|
401
|
+
//! Snippet
|
|
402
|
+
//! --------------------------------------------------
|
|
403
|
+
//!
|
|
404
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
405
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
406
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
407
|
+
//! tuple of references to relevant members of the key.
|
|
408
|
+
//!
|
|
409
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
410
|
+
//! :language: c++
|
|
411
|
+
//! :dedent:
|
|
412
|
+
//! :start-after: example-begin custom-type
|
|
413
|
+
//! :end-before: example-end custom-type
|
|
414
|
+
//!
|
|
415
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
416
|
+
//! using ``cub::DeviceRadixSort::SortPairs``:
|
|
417
|
+
//!
|
|
418
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
419
|
+
//! :language: c++
|
|
420
|
+
//! :dedent:
|
|
421
|
+
//! :start-after: example-begin pairs-bits
|
|
422
|
+
//! :end-before: example-end pairs-bits
|
|
423
|
+
//!
|
|
424
|
+
//! @endrst
|
|
425
|
+
//!
|
|
426
|
+
//! @tparam KeyT
|
|
427
|
+
//! **[inferred]** KeyT type
|
|
428
|
+
//!
|
|
429
|
+
//! @tparam ValueT
|
|
430
|
+
//! **[inferred]** ValueT type
|
|
431
|
+
//!
|
|
432
|
+
//! @tparam NumItemsT
|
|
433
|
+
//! **[inferred]** Type of num_items
|
|
434
|
+
//!
|
|
435
|
+
//! @tparam DecomposerT
|
|
436
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
437
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
438
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
439
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
440
|
+
//! The call operator must not modify members of the key.
|
|
441
|
+
//!
|
|
442
|
+
//! @param[in] d_temp_storage
|
|
443
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
444
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
445
|
+
//! is done.
|
|
446
|
+
//!
|
|
447
|
+
//! @param[in,out] temp_storage_bytes
|
|
448
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
449
|
+
//!
|
|
450
|
+
//! @param[in] d_keys_in
|
|
451
|
+
//! Pointer to the input data of key data to sort
|
|
452
|
+
//!
|
|
453
|
+
//! @param[out] d_keys_out
|
|
454
|
+
//! Pointer to the sorted output sequence of key data
|
|
455
|
+
//!
|
|
456
|
+
//! @param[in] d_values_in
|
|
457
|
+
//! Pointer to the corresponding input sequence of associated value items
|
|
458
|
+
//!
|
|
459
|
+
//! @param[out] d_values_out
|
|
460
|
+
//! Pointer to the correspondingly-reordered output sequence of associated
|
|
461
|
+
//! value items
|
|
462
|
+
//!
|
|
463
|
+
//! @param[in] num_items
|
|
464
|
+
//! Number of items to sort
|
|
465
|
+
//!
|
|
466
|
+
//! @param decomposer
|
|
467
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
468
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
469
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
470
|
+
//! modify members of the key.
|
|
471
|
+
//!
|
|
472
|
+
//! @param[in] begin_bit
|
|
473
|
+
//! The least-significant bit index (inclusive) needed for
|
|
474
|
+
//! key comparison
|
|
475
|
+
//!
|
|
476
|
+
//! @param[in] end_bit
|
|
477
|
+
//! The most-significant bit index (exclusive) needed for key
|
|
478
|
+
//! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
|
|
479
|
+
//!
|
|
480
|
+
//! @param[in] stream
|
|
481
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
482
|
+
//! Default is stream<sub>0</sub>.
|
|
483
|
+
template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
484
|
+
CUB_RUNTIME_FUNCTION static //
|
|
485
|
+
::cuda::std::enable_if_t< //
|
|
486
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
487
|
+
cudaError_t>
|
|
488
|
+
SortPairs(void* d_temp_storage,
|
|
489
|
+
size_t& temp_storage_bytes,
|
|
490
|
+
const KeyT* d_keys_in,
|
|
491
|
+
KeyT* d_keys_out,
|
|
492
|
+
const ValueT* d_values_in,
|
|
493
|
+
ValueT* d_values_out,
|
|
494
|
+
NumItemsT num_items,
|
|
495
|
+
DecomposerT decomposer,
|
|
496
|
+
int begin_bit,
|
|
497
|
+
int end_bit,
|
|
498
|
+
cudaStream_t stream = 0)
|
|
499
|
+
{
|
|
500
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
501
|
+
// unsigned integer type for global offsets
|
|
502
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
503
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
504
|
+
|
|
505
|
+
static_assert(decomposer_check_t::value,
|
|
506
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
507
|
+
"arithmetic types");
|
|
508
|
+
|
|
509
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
510
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
511
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
512
|
+
// is not set.
|
|
513
|
+
constexpr bool is_overwrite_okay = false;
|
|
514
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
515
|
+
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
|
|
516
|
+
|
|
517
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
|
|
518
|
+
decomposer_check_t{},
|
|
519
|
+
d_temp_storage,
|
|
520
|
+
temp_storage_bytes,
|
|
521
|
+
is_overwrite_okay,
|
|
522
|
+
d_keys,
|
|
523
|
+
d_values,
|
|
524
|
+
static_cast<offset_t>(num_items),
|
|
525
|
+
decomposer,
|
|
526
|
+
begin_bit,
|
|
527
|
+
end_bit,
|
|
528
|
+
stream);
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
//! @rst
|
|
532
|
+
//! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage.
|
|
533
|
+
//!
|
|
534
|
+
//! * The contents of the input data are not altered by the sorting operation.
|
|
535
|
+
//! * Pointers to contiguous memory must be used; iterators are not currently
|
|
536
|
+
//! supported.
|
|
537
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
538
|
+
//! any of the provided ranges:
|
|
539
|
+
//!
|
|
540
|
+
//! * ``[d_keys_in, d_keys_in + num_items)``
|
|
541
|
+
//! * ``[d_keys_out, d_keys_out + num_items)``
|
|
542
|
+
//! * ``[d_values_in, d_values_in + num_items)``
|
|
543
|
+
//! * ``[d_values_out, d_values_out + num_items)``
|
|
544
|
+
//!
|
|
545
|
+
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
|
|
546
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
547
|
+
//! * @devicestorage
|
|
548
|
+
//!
|
|
549
|
+
//! Snippet
|
|
550
|
+
//! --------------------------------------------------
|
|
551
|
+
//!
|
|
552
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
553
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
554
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
555
|
+
//! tuple of references to relevant members of the key.
|
|
556
|
+
//!
|
|
557
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
558
|
+
//! :language: c++
|
|
559
|
+
//! :dedent:
|
|
560
|
+
//! :start-after: example-begin custom-type
|
|
561
|
+
//! :end-before: example-end custom-type
|
|
562
|
+
//!
|
|
563
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
564
|
+
//! using ``cub::DeviceRadixSort::SortPairs``:
|
|
565
|
+
//!
|
|
566
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
567
|
+
//! :language: c++
|
|
568
|
+
//! :dedent:
|
|
569
|
+
//! :start-after: example-begin pairs
|
|
570
|
+
//! :end-before: example-end pairs
|
|
571
|
+
//!
|
|
572
|
+
//! @endrst
|
|
573
|
+
//!
|
|
574
|
+
//! @tparam KeyT
|
|
575
|
+
//! **[inferred]** KeyT type
|
|
576
|
+
//!
|
|
577
|
+
//! @tparam ValueT
|
|
578
|
+
//! **[inferred]** ValueT type
|
|
579
|
+
//!
|
|
580
|
+
//! @tparam NumItemsT
|
|
581
|
+
//! **[inferred]** Type of num_items
|
|
582
|
+
//!
|
|
583
|
+
//! @tparam DecomposerT
|
|
584
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
585
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
586
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
587
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
588
|
+
//! The call operator must not modify members of the key.
|
|
589
|
+
//!
|
|
590
|
+
//! @param[in] d_temp_storage
|
|
591
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
592
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
593
|
+
//! is done.
|
|
594
|
+
//!
|
|
595
|
+
//! @param[in,out] temp_storage_bytes
|
|
596
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
597
|
+
//!
|
|
598
|
+
//! @param[in] d_keys_in
|
|
599
|
+
//! Pointer to the input sequence of key data to sort
|
|
600
|
+
//!
|
|
601
|
+
//! @param[out] d_keys_out
|
|
602
|
+
//! Pointer to the sorted output sequence of key data
|
|
603
|
+
//!
|
|
604
|
+
//! @param[in] d_values_in
|
|
605
|
+
//! Pointer to the corresponding input sequence of associated value items
|
|
606
|
+
//!
|
|
607
|
+
//! @param[out] d_values_out
|
|
608
|
+
//! Pointer to the correspondingly-reordered output sequence of associated
|
|
609
|
+
//! value items
|
|
610
|
+
//!
|
|
611
|
+
//! @param[in] num_items
|
|
612
|
+
//! Number of items to sort
|
|
613
|
+
//!
|
|
614
|
+
//! @param decomposer
|
|
615
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
616
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
617
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
618
|
+
//! modify members of the key.
|
|
619
|
+
//!
|
|
620
|
+
//! @param[in] stream
|
|
621
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
622
|
+
//! Default is stream<sub>0</sub>.
|
|
623
|
+
template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
624
|
+
CUB_RUNTIME_FUNCTION static //
|
|
625
|
+
::cuda::std::enable_if_t< //
|
|
626
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
627
|
+
cudaError_t>
|
|
628
|
+
SortPairs(void* d_temp_storage,
|
|
629
|
+
size_t& temp_storage_bytes,
|
|
630
|
+
const KeyT* d_keys_in,
|
|
631
|
+
KeyT* d_keys_out,
|
|
632
|
+
const ValueT* d_values_in,
|
|
633
|
+
ValueT* d_values_out,
|
|
634
|
+
NumItemsT num_items,
|
|
635
|
+
DecomposerT decomposer,
|
|
636
|
+
cudaStream_t stream = 0)
|
|
637
|
+
{
|
|
638
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
639
|
+
// unsigned integer type for global offsets
|
|
640
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
641
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
642
|
+
|
|
643
|
+
static_assert(decomposer_check_t::value,
|
|
644
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
645
|
+
"arithmetic types");
|
|
646
|
+
|
|
647
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
648
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
649
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
650
|
+
// is not set.
|
|
651
|
+
constexpr bool is_overwrite_okay = false;
|
|
652
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
653
|
+
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
|
|
654
|
+
|
|
655
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
|
|
656
|
+
decomposer_check_t{},
|
|
657
|
+
d_temp_storage,
|
|
658
|
+
temp_storage_bytes,
|
|
659
|
+
is_overwrite_okay,
|
|
660
|
+
d_keys,
|
|
661
|
+
d_values,
|
|
662
|
+
static_cast<offset_t>(num_items),
|
|
663
|
+
decomposer,
|
|
664
|
+
stream);
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
//! @rst
|
|
668
|
+
//! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
|
|
669
|
+
//!
|
|
670
|
+
//! - The sorting operation is given a pair of key buffers and a corresponding
|
|
671
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
672
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
673
|
+
//! contains the input data to be sorted).
|
|
674
|
+
//! - The contents of both buffers within each pair may be altered by the
|
|
675
|
+
//! sorting operation.
|
|
676
|
+
//! - In-place operations are not supported. There must be no overlap between
|
|
677
|
+
//! any of the provided ranges:
|
|
678
|
+
//!
|
|
679
|
+
//! - ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
680
|
+
//! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
681
|
+
//! - ``[d_values.Current(), d_values.Current() + num_items)``
|
|
682
|
+
//! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
|
|
683
|
+
//!
|
|
684
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
685
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
686
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
687
|
+
//! number of key bits specified and the targeted device architecture).
|
|
688
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
689
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
690
|
+
//! yield a corresponding performance improvement.
|
|
691
|
+
//! - @devicestorageP
|
|
692
|
+
//! - @devicestorage
|
|
693
|
+
//!
|
|
694
|
+
//! Snippet
|
|
695
|
+
//! --------------------------------------------------
|
|
696
|
+
//!
|
|
697
|
+
//! The code snippet below illustrates the sorting of a device vector of ``int``
|
|
698
|
+
//! keys with associated vector of ``int`` values.
|
|
699
|
+
//! @endrst
|
|
700
|
+
//!
|
|
701
|
+
//! @code
|
|
702
|
+
//! #include <cub/cub.cuh>
|
|
703
|
+
//! // or equivalently <cub/device/device_radix_sort.cuh>
|
|
704
|
+
//!
|
|
705
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
706
|
+
//! // sorting data
|
|
707
|
+
//! int num_items; // e.g., 7
|
|
708
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
709
|
+
//! int *d_key_alt_buf; // e.g., [ ... ]
|
|
710
|
+
//! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
711
|
+
//! int *d_value_alt_buf; // e.g., [ ... ]
|
|
712
|
+
//! ...
|
|
713
|
+
//!
|
|
714
|
+
//! // Create a set of DoubleBuffers to wrap pairs of device pointers
|
|
715
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
716
|
+
//! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
|
|
717
|
+
//!
|
|
718
|
+
//! // Determine temporary device storage requirements
|
|
719
|
+
//! void *d_temp_storage = nullptr;
|
|
720
|
+
//! size_t temp_storage_bytes = 0;
|
|
721
|
+
//! cub::DeviceRadixSort::SortPairs(
|
|
722
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
|
|
723
|
+
//!
|
|
724
|
+
//! // Allocate temporary storage
|
|
725
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
726
|
+
//!
|
|
727
|
+
//! // Run sorting operation
|
|
728
|
+
//! cub::DeviceRadixSort::SortPairs(
|
|
729
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
|
|
730
|
+
//!
|
|
731
|
+
//! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
|
|
732
|
+
//! // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
|
|
733
|
+
//!
|
|
734
|
+
//! @endcode
|
|
735
|
+
//!
|
|
736
|
+
//! @tparam KeyT
|
|
737
|
+
//! **[inferred]** KeyT type
|
|
738
|
+
//!
|
|
739
|
+
//! @tparam ValueT
|
|
740
|
+
//! **[inferred]** ValueT type
|
|
741
|
+
//!
|
|
742
|
+
//! @tparam NumItemsT
|
|
743
|
+
//! **[inferred]** Type of num_items
|
|
744
|
+
//!
|
|
745
|
+
//! @param[in] d_temp_storage
|
|
746
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
747
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work is done.
|
|
748
|
+
//!
|
|
749
|
+
//! @param[in,out] temp_storage_bytes
|
|
750
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
751
|
+
//!
|
|
752
|
+
//! @param[in,out] d_keys
|
|
753
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
754
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
755
|
+
//! point to the sorted output keys
|
|
756
|
+
//!
|
|
757
|
+
//! @param[in,out] d_values
|
|
758
|
+
//! Double-buffer of values whose "current" device-accessible buffer
|
|
759
|
+
//! contains the unsorted input values and, upon return, is updated to point
|
|
760
|
+
//! to the sorted output values
|
|
761
|
+
//!
|
|
762
|
+
//! @param[in] num_items
|
|
763
|
+
//! Number of items to sort
|
|
764
|
+
//!
|
|
765
|
+
//! @param[in] begin_bit
|
|
766
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
767
|
+
//! key comparison
|
|
768
|
+
//!
|
|
769
|
+
//! @param[in] end_bit
|
|
770
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
771
|
+
//! comparison (e.g., ``sizeof(unsigned int) * 8``)
|
|
772
|
+
//!
|
|
773
|
+
//! @param[in] stream
|
|
774
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
775
|
+
//! Default is stream<sub>0</sub>.
|
|
776
|
+
template <typename KeyT, typename ValueT, typename NumItemsT>
|
|
777
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
|
|
778
|
+
void* d_temp_storage,
|
|
779
|
+
size_t& temp_storage_bytes,
|
|
780
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
781
|
+
DoubleBuffer<ValueT>& d_values,
|
|
782
|
+
NumItemsT num_items,
|
|
783
|
+
int begin_bit = 0,
|
|
784
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
785
|
+
cudaStream_t stream = 0)
|
|
786
|
+
{
|
|
787
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
788
|
+
|
|
789
|
+
// Unsigned integer type for global offsets.
|
|
790
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
791
|
+
|
|
792
|
+
constexpr bool is_overwrite_okay = true;
|
|
793
|
+
|
|
794
|
+
return DispatchRadixSort<SortOrder::Ascending, KeyT, ValueT, OffsetT>::Dispatch(
|
|
795
|
+
d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
//! @rst
|
|
799
|
+
//! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
|
|
800
|
+
//!
|
|
801
|
+
//! * The sorting operation is given a pair of key buffers and a corresponding
|
|
802
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
803
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
804
|
+
//! contains the input data to be sorted).
|
|
805
|
+
//! * The contents of both buffers within each pair may be altered by the
|
|
806
|
+
//! sorting operation.
|
|
807
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
808
|
+
//! any of the provided ranges:
|
|
809
|
+
//!
|
|
810
|
+
//! - ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
811
|
+
//! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
812
|
+
//! - ``[d_values.Current(), d_values.Current() + num_items)``
|
|
813
|
+
//! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
|
|
814
|
+
//!
|
|
815
|
+
//! * Upon completion, the sorting operation will update the "current"
|
|
816
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
817
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
818
|
+
//! number of key bits specified and the targeted device architecture).
|
|
819
|
+
//! * @devicestorageP
|
|
820
|
+
//! * @devicestorage
|
|
821
|
+
//!
|
|
822
|
+
//! Snippet
|
|
823
|
+
//! --------------------------------------------------
|
|
824
|
+
//!
|
|
825
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
826
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
827
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
828
|
+
//! tuple of references to relevant members of the key.
|
|
829
|
+
//!
|
|
830
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
831
|
+
//! :language: c++
|
|
832
|
+
//! :dedent:
|
|
833
|
+
//! :start-after: example-begin custom-type
|
|
834
|
+
//! :end-before: example-end custom-type
|
|
835
|
+
//!
|
|
836
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
837
|
+
//! using ``cub::DeviceRadixSort::SortPairs``:
|
|
838
|
+
//!
|
|
839
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
840
|
+
//! :language: c++
|
|
841
|
+
//! :dedent:
|
|
842
|
+
//! :start-after: example-begin pairs-db
|
|
843
|
+
//! :end-before: example-end pairs-db
|
|
844
|
+
//!
|
|
845
|
+
//! @endrst
|
|
846
|
+
//!
|
|
847
|
+
//! @tparam KeyT
|
|
848
|
+
//! **[inferred]** KeyT type
|
|
849
|
+
//!
|
|
850
|
+
//! @tparam ValueT
|
|
851
|
+
//! **[inferred]** ValueT type
|
|
852
|
+
//!
|
|
853
|
+
//! @tparam NumItemsT
|
|
854
|
+
//! **[inferred]** Type of num_items
|
|
855
|
+
//!
|
|
856
|
+
//! @tparam DecomposerT
|
|
857
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
858
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
859
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
860
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
861
|
+
//! The call operator must not modify members of the key.
|
|
862
|
+
//!
|
|
863
|
+
//! @param[in] d_temp_storage
|
|
864
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
865
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
866
|
+
//! is done.
|
|
867
|
+
//!
|
|
868
|
+
//! @param[in,out] temp_storage_bytes
|
|
869
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
870
|
+
//!
|
|
871
|
+
//! @param[in,out] d_keys
|
|
872
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
873
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
874
|
+
//! point to the sorted output keys
|
|
875
|
+
//!
|
|
876
|
+
//! @param[in,out] d_values
|
|
877
|
+
//! Double-buffer of values whose "current" device-accessible buffer
|
|
878
|
+
//! contains the unsorted input values and, upon return, is updated to point
|
|
879
|
+
//! to the sorted output values
|
|
880
|
+
//!
|
|
881
|
+
//! @param[in] num_items
|
|
882
|
+
//! Number of items to sort
|
|
883
|
+
//!
|
|
884
|
+
//! @param decomposer
|
|
885
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
886
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
887
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
888
|
+
//! modify members of the key.
|
|
889
|
+
//!
|
|
890
|
+
//! @param[in] stream
|
|
891
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
892
|
+
//! Default is stream<sub>0</sub>.
|
|
893
|
+
template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
894
|
+
CUB_RUNTIME_FUNCTION static //
|
|
895
|
+
::cuda::std::enable_if_t< //
|
|
896
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
897
|
+
cudaError_t>
|
|
898
|
+
SortPairs(void* d_temp_storage,
|
|
899
|
+
size_t& temp_storage_bytes,
|
|
900
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
901
|
+
DoubleBuffer<ValueT>& d_values,
|
|
902
|
+
NumItemsT num_items,
|
|
903
|
+
DecomposerT decomposer,
|
|
904
|
+
cudaStream_t stream = 0)
|
|
905
|
+
{
|
|
906
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
907
|
+
|
|
908
|
+
// unsigned integer type for global offsets
|
|
909
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
910
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
911
|
+
|
|
912
|
+
static_assert(decomposer_check_t::value,
|
|
913
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
914
|
+
"arithmetic types");
|
|
915
|
+
|
|
916
|
+
constexpr bool is_overwrite_okay = true;
|
|
917
|
+
|
|
918
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
|
|
919
|
+
decomposer_check_t{},
|
|
920
|
+
d_temp_storage,
|
|
921
|
+
temp_storage_bytes,
|
|
922
|
+
is_overwrite_okay,
|
|
923
|
+
d_keys,
|
|
924
|
+
d_values,
|
|
925
|
+
static_cast<offset_t>(num_items),
|
|
926
|
+
decomposer,
|
|
927
|
+
stream);
|
|
928
|
+
}
|
|
929
|
+
|
|
930
|
+
//! @rst
|
|
931
|
+
//! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage.
|
|
932
|
+
//!
|
|
933
|
+
//! * The sorting operation is given a pair of key buffers and a corresponding
|
|
934
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
935
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
936
|
+
//! contains the input data to be sorted).
|
|
937
|
+
//! * The contents of both buffers within each pair may be altered by the
|
|
938
|
+
//! sorting operation.
|
|
939
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
940
|
+
//! any of the provided ranges:
|
|
941
|
+
//!
|
|
942
|
+
//! - ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
943
|
+
//! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
944
|
+
//! - ``[d_values.Current(), d_values.Current() + num_items)``
|
|
945
|
+
//! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
|
|
946
|
+
//!
|
|
947
|
+
//! * Upon completion, the sorting operation will update the "current"
|
|
948
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
949
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
950
|
+
//! number of key bits specified and the targeted device architecture).
|
|
951
|
+
//! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
952
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
953
|
+
//! yield a corresponding performance improvement.
|
|
954
|
+
//! * @devicestorageP
|
|
955
|
+
//! * @devicestorage
|
|
956
|
+
//!
|
|
957
|
+
//! Snippet
|
|
958
|
+
//! --------------------------------------------------
|
|
959
|
+
//!
|
|
960
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
961
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
962
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
963
|
+
//! tuple of references to relevant members of the key.
|
|
964
|
+
//!
|
|
965
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
966
|
+
//! :language: c++
|
|
967
|
+
//! :dedent:
|
|
968
|
+
//! :start-after: example-begin custom-type
|
|
969
|
+
//! :end-before: example-end custom-type
|
|
970
|
+
//!
|
|
971
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
972
|
+
//! using ``cub::DeviceRadixSort::SortPairs``:
|
|
973
|
+
//!
|
|
974
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
975
|
+
//! :language: c++
|
|
976
|
+
//! :dedent:
|
|
977
|
+
//! :start-after: example-begin pairs-bits-db
|
|
978
|
+
//! :end-before: example-end pairs-bits-db
|
|
979
|
+
//!
|
|
980
|
+
//! @endrst
|
|
981
|
+
//!
|
|
982
|
+
//! @tparam KeyT
|
|
983
|
+
//! **[inferred]** KeyT type
|
|
984
|
+
//!
|
|
985
|
+
//! @tparam ValueT
|
|
986
|
+
//! **[inferred]** ValueT type
|
|
987
|
+
//!
|
|
988
|
+
//! @tparam NumItemsT
|
|
989
|
+
//! **[inferred]** Type of num_items
|
|
990
|
+
//!
|
|
991
|
+
//! @tparam DecomposerT
|
|
992
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
993
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
994
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
995
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
996
|
+
//! The call operator must not modify members of the key.
|
|
997
|
+
//!
|
|
998
|
+
//! @param[in] d_temp_storage
|
|
999
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
1000
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
1001
|
+
//! is done.
|
|
1002
|
+
//!
|
|
1003
|
+
//! @param[in,out] temp_storage_bytes
|
|
1004
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
1005
|
+
//!
|
|
1006
|
+
//! @param[in,out] d_keys
|
|
1007
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1008
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1009
|
+
//! point to the sorted output keys
|
|
1010
|
+
//!
|
|
1011
|
+
//! @param[in,out] d_values
|
|
1012
|
+
//! Double-buffer of values whose "current" device-accessible buffer
|
|
1013
|
+
//! contains the unsorted input values and, upon return, is updated to point
|
|
1014
|
+
//! to the sorted output values
|
|
1015
|
+
//!
|
|
1016
|
+
//! @param[in] num_items
|
|
1017
|
+
//! Number of items to sort
|
|
1018
|
+
//!
|
|
1019
|
+
//! @param decomposer
|
|
1020
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
1021
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
1022
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
1023
|
+
//! modify members of the key.
|
|
1024
|
+
//!
|
|
1025
|
+
//! @param[in] begin_bit
|
|
1026
|
+
//! The least-significant bit index (inclusive) needed for
|
|
1027
|
+
//! key comparison
|
|
1028
|
+
//!
|
|
1029
|
+
//! @param[in] end_bit
|
|
1030
|
+
//! The most-significant bit index (exclusive) needed for key
|
|
1031
|
+
//! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
|
|
1032
|
+
//!
|
|
1033
|
+
//! @param[in] stream
|
|
1034
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
1035
|
+
//! Default is stream<sub>0</sub>.
|
|
1036
|
+
template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
1037
|
+
CUB_RUNTIME_FUNCTION static //
|
|
1038
|
+
::cuda::std::enable_if_t< //
|
|
1039
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
1040
|
+
cudaError_t>
|
|
1041
|
+
SortPairs(void* d_temp_storage,
|
|
1042
|
+
size_t& temp_storage_bytes,
|
|
1043
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
1044
|
+
DoubleBuffer<ValueT>& d_values,
|
|
1045
|
+
NumItemsT num_items,
|
|
1046
|
+
DecomposerT decomposer,
|
|
1047
|
+
int begin_bit,
|
|
1048
|
+
int end_bit,
|
|
1049
|
+
cudaStream_t stream = 0)
|
|
1050
|
+
{
|
|
1051
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1052
|
+
|
|
1053
|
+
// unsigned integer type for global offsets
|
|
1054
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
1055
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
1056
|
+
|
|
1057
|
+
static_assert(decomposer_check_t::value,
|
|
1058
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
1059
|
+
"arithmetic types");
|
|
1060
|
+
|
|
1061
|
+
constexpr bool is_overwrite_okay = true;
|
|
1062
|
+
|
|
1063
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
|
|
1064
|
+
decomposer_check_t{},
|
|
1065
|
+
d_temp_storage,
|
|
1066
|
+
temp_storage_bytes,
|
|
1067
|
+
is_overwrite_okay,
|
|
1068
|
+
d_keys,
|
|
1069
|
+
d_values,
|
|
1070
|
+
static_cast<offset_t>(num_items),
|
|
1071
|
+
decomposer,
|
|
1072
|
+
begin_bit,
|
|
1073
|
+
end_bit,
|
|
1074
|
+
stream);
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
//! @rst
|
|
1078
|
+
//! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
|
|
1079
|
+
//!
|
|
1080
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
1081
|
+
//! - Pointers to contiguous memory must be used; iterators are not currently
|
|
1082
|
+
//! supported.
|
|
1083
|
+
//! - In-place operations are not supported. There must be no overlap between
|
|
1084
|
+
//! any of the provided ranges:
|
|
1085
|
+
//!
|
|
1086
|
+
//! - ``[d_keys_in, d_keys_in + num_items)``
|
|
1087
|
+
//! - ``[d_keys_out, d_keys_out + num_items)``
|
|
1088
|
+
//! - ``[d_values_in, d_values_in + num_items)``
|
|
1089
|
+
//! - ``[d_values_out, d_values_out + num_items)``
|
|
1090
|
+
//!
|
|
1091
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
1092
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
1093
|
+
//! yield a corresponding performance improvement.
|
|
1094
|
+
//! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
|
|
1095
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
1096
|
+
//! - @devicestorage
|
|
1097
|
+
//!
|
|
1098
|
+
//! Snippet
|
|
1099
|
+
//! --------------------------------------------------
|
|
1100
|
+
//!
|
|
1101
|
+
//! The code snippet below illustrates the sorting of a device vector of ``int``
|
|
1102
|
+
//! keys with associated vector of ``int`` values.
|
|
1103
|
+
//! @endrst
|
|
1104
|
+
//!
|
|
1105
|
+
//! @code{.cpp}
|
|
1106
|
+
//! #include <cub/cub.cuh>
|
|
1107
|
+
//! // or equivalently <cub/device/device_radix_sort.cuh>
|
|
1108
|
+
//!
|
|
1109
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1110
|
+
//! // for sorting data
|
|
1111
|
+
//! int num_items; // e.g., 7
|
|
1112
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1113
|
+
//! int *d_keys_out; // e.g., [ ... ]
|
|
1114
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
1115
|
+
//! int *d_values_out; // e.g., [ ... ]
|
|
1116
|
+
//! ...
|
|
1117
|
+
//!
|
|
1118
|
+
//! // Determine temporary device storage requirements
|
|
1119
|
+
//! void *d_temp_storage = nullptr;
|
|
1120
|
+
//! size_t temp_storage_bytes = 0;
|
|
1121
|
+
//! cub::DeviceRadixSort::SortPairsDescending(
|
|
1122
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1123
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
|
|
1124
|
+
//!
|
|
1125
|
+
//! // Allocate temporary storage
|
|
1126
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1127
|
+
//!
|
|
1128
|
+
//! // Run sorting operation
|
|
1129
|
+
//! cub::DeviceRadixSort::SortPairsDescending(
|
|
1130
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1131
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
|
|
1132
|
+
//!
|
|
1133
|
+
//! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
|
|
1134
|
+
//! // d_values_out <-- [6, 0, 2, 1, 3, 4, 5]
|
|
1135
|
+
//! @endcode
|
|
1136
|
+
//!
|
|
1137
|
+
//! @tparam KeyT
|
|
1138
|
+
//! **[inferred]** KeyT type
|
|
1139
|
+
//!
|
|
1140
|
+
//! @tparam ValueT
|
|
1141
|
+
//! **[inferred]** ValueT type
|
|
1142
|
+
//!
|
|
1143
|
+
//! @tparam NumItemsT
|
|
1144
|
+
//! **[inferred]** Type of num_items
|
|
1145
|
+
//!
|
|
1146
|
+
//! @param[in] d_temp_storage
|
|
1147
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
1148
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
1149
|
+
//! is done.
|
|
1150
|
+
//!
|
|
1151
|
+
//! @param[in,out] temp_storage_bytes
|
|
1152
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
1153
|
+
//!
|
|
1154
|
+
//! @param[in] d_keys_in
|
|
1155
|
+
//! Pointer to the input sequence of key data to sort
|
|
1156
|
+
//!
|
|
1157
|
+
//! @param[out] d_keys_out
|
|
1158
|
+
//! Pointer to the sorted output sequence of key data
|
|
1159
|
+
//!
|
|
1160
|
+
//! @param[in] d_values_in
|
|
1161
|
+
//! Pointer to the corresponding input sequence of associated value items
|
|
1162
|
+
//!
|
|
1163
|
+
//! @param[out] d_values_out
|
|
1164
|
+
//! Pointer to the correspondingly-reordered output sequence of associated
|
|
1165
|
+
//! value items
|
|
1166
|
+
//!
|
|
1167
|
+
//! @param[in] num_items
|
|
1168
|
+
//! Number of items to sort
|
|
1169
|
+
//!
|
|
1170
|
+
//! @param[in] begin_bit
|
|
1171
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
1172
|
+
//! key comparison
|
|
1173
|
+
//!
|
|
1174
|
+
//! @param[in] end_bit
|
|
1175
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
1176
|
+
//! comparison (e.g., ``sizeof(unsigned int) * 8``)
|
|
1177
|
+
//!
|
|
1178
|
+
//! @param[in] stream
|
|
1179
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
1180
|
+
//! Default is stream<sub>0</sub>.
|
|
1181
|
+
template <typename KeyT, typename ValueT, typename NumItemsT>
|
|
1182
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
|
|
1183
|
+
void* d_temp_storage,
|
|
1184
|
+
size_t& temp_storage_bytes,
|
|
1185
|
+
const KeyT* d_keys_in,
|
|
1186
|
+
KeyT* d_keys_out,
|
|
1187
|
+
const ValueT* d_values_in,
|
|
1188
|
+
ValueT* d_values_out,
|
|
1189
|
+
NumItemsT num_items,
|
|
1190
|
+
int begin_bit = 0,
|
|
1191
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
1192
|
+
cudaStream_t stream = 0)
|
|
1193
|
+
{
|
|
1194
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1195
|
+
|
|
1196
|
+
// Unsigned integer type for global offsets.
|
|
1197
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
1198
|
+
|
|
1199
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
1200
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
1201
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
1202
|
+
// is not set.
|
|
1203
|
+
constexpr bool is_overwrite_okay = false;
|
|
1204
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
1205
|
+
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
|
|
1206
|
+
|
|
1207
|
+
return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
|
|
1208
|
+
d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
|
|
1209
|
+
}
|
|
1210
|
+
|
|
1211
|
+
//! @rst
|
|
1212
|
+
//! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
|
|
1213
|
+
//!
|
|
1214
|
+
//! * The contents of the input data are not altered by the sorting operation.
|
|
1215
|
+
//! * Pointers to contiguous memory must be used; iterators are not currently
|
|
1216
|
+
//! supported.
|
|
1217
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
1218
|
+
//! any of the provided ranges:
|
|
1219
|
+
//!
|
|
1220
|
+
//! * ``[d_keys_in, d_keys_in + num_items)``
|
|
1221
|
+
//! * ``[d_keys_out, d_keys_out + num_items)``
|
|
1222
|
+
//! * ``[d_values_in, d_values_in + num_items)``
|
|
1223
|
+
//! * ``[d_values_out, d_values_out + num_items)``
|
|
1224
|
+
//!
|
|
1225
|
+
//! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
|
|
1226
|
+
//! differentiating key bits. This can reduce overall sorting overhead and
|
|
1227
|
+
//! yield a corresponding performance improvement.
|
|
1228
|
+
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
|
|
1229
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
1230
|
+
//! * @devicestorage
|
|
1231
|
+
//!
|
|
1232
|
+
//! Snippet
|
|
1233
|
+
//! --------------------------------------------------
|
|
1234
|
+
//!
|
|
1235
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
1236
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
1237
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
1238
|
+
//! tuple of references to relevant members of the key.
|
|
1239
|
+
//!
|
|
1240
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
1241
|
+
//! :language: c++
|
|
1242
|
+
//! :dedent:
|
|
1243
|
+
//! :start-after: example-begin custom-type
|
|
1244
|
+
//! :end-before: example-end custom-type
|
|
1245
|
+
//!
|
|
1246
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
1247
|
+
//! using ``cub::DeviceRadixSort::SortPairsDescending``:
|
|
1248
|
+
//!
|
|
1249
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
1250
|
+
//! :language: c++
|
|
1251
|
+
//! :dedent:
|
|
1252
|
+
//! :start-after: example-begin pairs-descending-bits
|
|
1253
|
+
//! :end-before: example-end pairs-descending-bits
|
|
1254
|
+
//!
|
|
1255
|
+
//! @endrst
|
|
1256
|
+
//!
|
|
1257
|
+
//! @tparam KeyT
|
|
1258
|
+
//! **[inferred]** KeyT type
|
|
1259
|
+
//!
|
|
1260
|
+
//! @tparam ValueT
|
|
1261
|
+
//! **[inferred]** ValueT type
|
|
1262
|
+
//!
|
|
1263
|
+
//! @tparam NumItemsT
|
|
1264
|
+
//! **[inferred]** Type of num_items
|
|
1265
|
+
//!
|
|
1266
|
+
//! @tparam DecomposerT
|
|
1267
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
1268
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
1269
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
1270
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
1271
|
+
//! The call operator must not modify members of the key.
|
|
1272
|
+
//!
|
|
1273
|
+
//! @param[in] d_temp_storage
|
|
1274
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
1275
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
1276
|
+
//! is done.
|
|
1277
|
+
//!
|
|
1278
|
+
//! @param[in,out] temp_storage_bytes
|
|
1279
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
1280
|
+
//!
|
|
1281
|
+
//! @param[in] d_keys_in
|
|
1282
|
+
//! Pointer to the input sequence of key data to sort
|
|
1283
|
+
//!
|
|
1284
|
+
//! @param[out] d_keys_out
|
|
1285
|
+
//! Pointer to the sorted output sequence of key data
|
|
1286
|
+
//!
|
|
1287
|
+
//! @param[in] d_values_in
|
|
1288
|
+
//! Pointer to the corresponding input sequence of associated value items
|
|
1289
|
+
//!
|
|
1290
|
+
//! @param[out] d_values_out
|
|
1291
|
+
//! Pointer to the correspondingly-reordered output sequence of associated
|
|
1292
|
+
//! value items
|
|
1293
|
+
//!
|
|
1294
|
+
//! @param[in] num_items
|
|
1295
|
+
//! Number of items to sort
|
|
1296
|
+
//!
|
|
1297
|
+
//! @param decomposer
|
|
1298
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
1299
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
1300
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
1301
|
+
//! modify members of the key.
|
|
1302
|
+
//!
|
|
1303
|
+
//! @param[in] begin_bit
|
|
1304
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
1305
|
+
//! key comparison
|
|
1306
|
+
//!
|
|
1307
|
+
//! @param[in] end_bit
|
|
1308
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
1309
|
+
//! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
|
|
1310
|
+
//!
|
|
1311
|
+
//! @param[in] stream
|
|
1312
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
1313
|
+
//! Default is stream<sub>0</sub>.
|
|
1314
|
+
template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
1315
|
+
CUB_RUNTIME_FUNCTION static //
|
|
1316
|
+
::cuda::std::enable_if_t< //
|
|
1317
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
1318
|
+
cudaError_t>
|
|
1319
|
+
SortPairsDescending(
|
|
1320
|
+
void* d_temp_storage,
|
|
1321
|
+
size_t& temp_storage_bytes,
|
|
1322
|
+
const KeyT* d_keys_in,
|
|
1323
|
+
KeyT* d_keys_out,
|
|
1324
|
+
const ValueT* d_values_in,
|
|
1325
|
+
ValueT* d_values_out,
|
|
1326
|
+
NumItemsT num_items,
|
|
1327
|
+
DecomposerT decomposer,
|
|
1328
|
+
int begin_bit,
|
|
1329
|
+
int end_bit,
|
|
1330
|
+
cudaStream_t stream = 0)
|
|
1331
|
+
{
|
|
1332
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1333
|
+
|
|
1334
|
+
// unsigned integer type for global offsets
|
|
1335
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
1336
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
1337
|
+
|
|
1338
|
+
static_assert(decomposer_check_t::value,
|
|
1339
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
1340
|
+
"arithmetic types");
|
|
1341
|
+
|
|
1342
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
1343
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
1344
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
1345
|
+
// is not set.
|
|
1346
|
+
constexpr bool is_overwrite_okay = false;
|
|
1347
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
1348
|
+
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
|
|
1349
|
+
|
|
1350
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
|
|
1351
|
+
decomposer_check_t{},
|
|
1352
|
+
d_temp_storage,
|
|
1353
|
+
temp_storage_bytes,
|
|
1354
|
+
is_overwrite_okay,
|
|
1355
|
+
d_keys,
|
|
1356
|
+
d_values,
|
|
1357
|
+
static_cast<offset_t>(num_items),
|
|
1358
|
+
decomposer,
|
|
1359
|
+
begin_bit,
|
|
1360
|
+
end_bit,
|
|
1361
|
+
stream);
|
|
1362
|
+
}
|
|
1363
|
+
|
|
1364
|
+
//! @rst
|
|
1365
|
+
//! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage.
|
|
1366
|
+
//!
|
|
1367
|
+
//! * The contents of the input data are not altered by the sorting operation.
|
|
1368
|
+
//! * Pointers to contiguous memory must be used; iterators are not currently
|
|
1369
|
+
//! supported.
|
|
1370
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
1371
|
+
//! any of the provided ranges:
|
|
1372
|
+
//!
|
|
1373
|
+
//! * ``[d_keys_in, d_keys_in + num_items)``
|
|
1374
|
+
//! * ``[d_keys_out, d_keys_out + num_items)``
|
|
1375
|
+
//! * ``[d_values_in, d_values_in + num_items)``
|
|
1376
|
+
//! * ``[d_values_out, d_values_out + num_items)``
|
|
1377
|
+
//!
|
|
1378
|
+
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
|
|
1379
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
1380
|
+
//! * @devicestorage
|
|
1381
|
+
//!
|
|
1382
|
+
//! Snippet
|
|
1383
|
+
//! --------------------------------------------------
|
|
1384
|
+
//!
|
|
1385
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
1386
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
1387
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
1388
|
+
//! tuple of references to relevant members of the key.
|
|
1389
|
+
//!
|
|
1390
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
1391
|
+
//! :language: c++
|
|
1392
|
+
//! :dedent:
|
|
1393
|
+
//! :start-after: example-begin custom-type
|
|
1394
|
+
//! :end-before: example-end custom-type
|
|
1395
|
+
//!
|
|
1396
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
1397
|
+
//! using ``cub::DeviceRadixSort::SortPairsDescending``:
|
|
1398
|
+
//!
|
|
1399
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
1400
|
+
//! :language: c++
|
|
1401
|
+
//! :dedent:
|
|
1402
|
+
//! :start-after: example-begin pairs-descending
|
|
1403
|
+
//! :end-before: example-end pairs-descending
|
|
1404
|
+
//!
|
|
1405
|
+
//! @endrst
|
|
1406
|
+
//!
|
|
1407
|
+
//! @tparam KeyT
|
|
1408
|
+
//! **[inferred]** KeyT type
|
|
1409
|
+
//!
|
|
1410
|
+
//! @tparam ValueT
|
|
1411
|
+
//! **[inferred]** ValueT type
|
|
1412
|
+
//!
|
|
1413
|
+
//! @tparam NumItemsT
|
|
1414
|
+
//! **[inferred]** Type of num_items
|
|
1415
|
+
//!
|
|
1416
|
+
//! @tparam DecomposerT
|
|
1417
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
1418
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
1419
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
1420
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
1421
|
+
//! The call operator must not modify members of the key.
|
|
1422
|
+
//!
|
|
1423
|
+
//! @param[in] d_temp_storage
|
|
1424
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
1425
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
1426
|
+
//! is done.
|
|
1427
|
+
//!
|
|
1428
|
+
//! @param[in,out] temp_storage_bytes
|
|
1429
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
1430
|
+
//!
|
|
1431
|
+
//! @param[in] d_keys_in
|
|
1432
|
+
//! Pointer to the input sequence of key data to sort
|
|
1433
|
+
//!
|
|
1434
|
+
//! @param[out] d_keys_out
|
|
1435
|
+
//! Pointer to the sorted output sequence of key data
|
|
1436
|
+
//!
|
|
1437
|
+
//! @param[in] d_values_in
|
|
1438
|
+
//! Pointer to the corresponding input sequence of associated value items
|
|
1439
|
+
//!
|
|
1440
|
+
//! @param[out] d_values_out
|
|
1441
|
+
//! Pointer to the correspondingly-reordered output sequence of associated
|
|
1442
|
+
//! value items
|
|
1443
|
+
//!
|
|
1444
|
+
//! @param[in] num_items
|
|
1445
|
+
//! Number of items to sort
|
|
1446
|
+
//!
|
|
1447
|
+
//! @param decomposer
|
|
1448
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
1449
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
1450
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
1451
|
+
//! modify members of the key.
|
|
1452
|
+
//!
|
|
1453
|
+
//! @param[in] stream
|
|
1454
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
1455
|
+
//! Default is stream<sub>0</sub>.
|
|
1456
|
+
template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
1457
|
+
CUB_RUNTIME_FUNCTION static //
|
|
1458
|
+
::cuda::std::enable_if_t< //
|
|
1459
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
1460
|
+
cudaError_t>
|
|
1461
|
+
SortPairsDescending(
|
|
1462
|
+
void* d_temp_storage,
|
|
1463
|
+
size_t& temp_storage_bytes,
|
|
1464
|
+
const KeyT* d_keys_in,
|
|
1465
|
+
KeyT* d_keys_out,
|
|
1466
|
+
const ValueT* d_values_in,
|
|
1467
|
+
ValueT* d_values_out,
|
|
1468
|
+
NumItemsT num_items,
|
|
1469
|
+
DecomposerT decomposer,
|
|
1470
|
+
cudaStream_t stream = 0)
|
|
1471
|
+
{
|
|
1472
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1473
|
+
|
|
1474
|
+
// unsigned integer type for global offsets
|
|
1475
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
1476
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
1477
|
+
|
|
1478
|
+
static_assert(decomposer_check_t::value,
|
|
1479
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
1480
|
+
"arithmetic types");
|
|
1481
|
+
|
|
1482
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
1483
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
1484
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
1485
|
+
// is not set.
|
|
1486
|
+
constexpr bool is_overwrite_okay = false;
|
|
1487
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
1488
|
+
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
|
|
1489
|
+
|
|
1490
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
|
|
1491
|
+
decomposer_check_t{},
|
|
1492
|
+
d_temp_storage,
|
|
1493
|
+
temp_storage_bytes,
|
|
1494
|
+
is_overwrite_okay,
|
|
1495
|
+
d_keys,
|
|
1496
|
+
d_values,
|
|
1497
|
+
static_cast<offset_t>(num_items),
|
|
1498
|
+
decomposer,
|
|
1499
|
+
stream);
|
|
1500
|
+
}
|
|
1501
|
+
|
|
1502
|
+
//! @rst
|
|
1503
|
+
//! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
|
|
1504
|
+
//!
|
|
1505
|
+
//! - The sorting operation is given a pair of key buffers and a corresponding
|
|
1506
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
1507
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
1508
|
+
//! contains the input data to be sorted).
|
|
1509
|
+
//! - The contents of both buffers within each pair may be altered by the
|
|
1510
|
+
//! sorting operation.
|
|
1511
|
+
//! - In-place operations are not supported. There must be no overlap between
|
|
1512
|
+
//! any of the provided ranges:
|
|
1513
|
+
//!
|
|
1514
|
+
//! - ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
1515
|
+
//! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
1516
|
+
//! - ``[d_values.Current(), d_values.Current() + num_items)``
|
|
1517
|
+
//! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
|
|
1518
|
+
//!
|
|
1519
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
1520
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
1521
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
1522
|
+
//! of key bits specified and the targeted device architecture).
|
|
1523
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
1524
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
1525
|
+
//! yield a corresponding performance improvement.
|
|
1526
|
+
//! - @devicestorageP
|
|
1527
|
+
//! - @devicestorage
|
|
1528
|
+
//!
|
|
1529
|
+
//! Snippet
|
|
1530
|
+
//! --------------------------------------------------
|
|
1531
|
+
//!
|
|
1532
|
+
//! The code snippet below illustrates the sorting of a device vector of ``int``
|
|
1533
|
+
//! keys with associated vector of ``int`` values.
|
|
1534
|
+
//! @endrst
|
|
1535
|
+
//!
|
|
1536
|
+
//! @code{.cpp}
|
|
1537
|
+
//! #include <cub/cub.cuh>
|
|
1538
|
+
//! // or equivalently <cub/device/device_radix_sort.cuh>
|
|
1539
|
+
//!
|
|
1540
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1541
|
+
//! // for sorting data
|
|
1542
|
+
//! int num_items; // e.g., 7
|
|
1543
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1544
|
+
//! int *d_key_alt_buf; // e.g., [ ... ]
|
|
1545
|
+
//! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
1546
|
+
//! int *d_value_alt_buf; // e.g., [ ... ]
|
|
1547
|
+
//! ...
|
|
1548
|
+
//!
|
|
1549
|
+
//! // Create a set of DoubleBuffers to wrap pairs of device pointers
|
|
1550
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
1551
|
+
//! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
|
|
1552
|
+
//!
|
|
1553
|
+
//! // Determine temporary device storage requirements
|
|
1554
|
+
//! void *d_temp_storage = nullptr;
|
|
1555
|
+
//! size_t temp_storage_bytes = 0;
|
|
1556
|
+
//! cub::DeviceRadixSort::SortPairsDescending(
|
|
1557
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
|
|
1558
|
+
//!
|
|
1559
|
+
//! // Allocate temporary storage
|
|
1560
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1561
|
+
//!
|
|
1562
|
+
//! // Run sorting operation
|
|
1563
|
+
//! cub::DeviceRadixSort::SortPairsDescending(
|
|
1564
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
|
|
1565
|
+
//!
|
|
1566
|
+
//! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
|
|
1567
|
+
//! // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5]
|
|
1568
|
+
//! @endcode
|
|
1569
|
+
//!
|
|
1570
|
+
//! @tparam KeyT
|
|
1571
|
+
//! **[inferred]** KeyT type
|
|
1572
|
+
//!
|
|
1573
|
+
//! @tparam ValueT
|
|
1574
|
+
//! **[inferred]** ValueT type
|
|
1575
|
+
//!
|
|
1576
|
+
//! @tparam NumItemsT
|
|
1577
|
+
//! **[inferred]** Type of num_items
|
|
1578
|
+
//!
|
|
1579
|
+
//! @param[in] d_temp_storage
|
|
1580
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
1581
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
1582
|
+
//! is done.
|
|
1583
|
+
//!
|
|
1584
|
+
//! @param[in,out] temp_storage_bytes
|
|
1585
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
1586
|
+
//!
|
|
1587
|
+
//! @param[in,out] d_keys
|
|
1588
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1589
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1590
|
+
//! point to the sorted output keys
|
|
1591
|
+
//!
|
|
1592
|
+
//! @param[in,out] d_values
|
|
1593
|
+
//! Double-buffer of values whose "current" device-accessible buffer
|
|
1594
|
+
//! contains the unsorted input values and, upon return, is updated to point
|
|
1595
|
+
//! to the sorted output values
|
|
1596
|
+
//!
|
|
1597
|
+
//! @param[in] num_items
|
|
1598
|
+
//! Number of items to sort
|
|
1599
|
+
//!
|
|
1600
|
+
//! @param[in] begin_bit
|
|
1601
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
1602
|
+
//! key comparison
|
|
1603
|
+
//!
|
|
1604
|
+
//! @param[in] end_bit
|
|
1605
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
1606
|
+
//! comparison (e.g., ``sizeof(unsigned int) * 8``)
|
|
1607
|
+
//!
|
|
1608
|
+
//! @param[in] stream
|
|
1609
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
1610
|
+
//! Default is stream<sub>0</sub>.
|
|
1611
|
+
template <typename KeyT, typename ValueT, typename NumItemsT>
|
|
1612
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
|
|
1613
|
+
void* d_temp_storage,
|
|
1614
|
+
size_t& temp_storage_bytes,
|
|
1615
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
1616
|
+
DoubleBuffer<ValueT>& d_values,
|
|
1617
|
+
NumItemsT num_items,
|
|
1618
|
+
int begin_bit = 0,
|
|
1619
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
1620
|
+
cudaStream_t stream = 0)
|
|
1621
|
+
{
|
|
1622
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1623
|
+
|
|
1624
|
+
// Unsigned integer type for global offsets.
|
|
1625
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
1626
|
+
|
|
1627
|
+
constexpr bool is_overwrite_okay = true;
|
|
1628
|
+
|
|
1629
|
+
return DispatchRadixSort<SortOrder::Descending, KeyT, ValueT, OffsetT>::Dispatch(
|
|
1630
|
+
d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
|
|
1631
|
+
}
|
|
1632
|
+
|
|
1633
|
+
//! @rst
|
|
1634
|
+
//! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
|
|
1635
|
+
//!
|
|
1636
|
+
//! * The sorting operation is given a pair of key buffers and a corresponding
|
|
1637
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
1638
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
1639
|
+
//! contains the input data to be sorted).
|
|
1640
|
+
//! * The contents of both buffers within each pair may be altered by the
|
|
1641
|
+
//! sorting operation.
|
|
1642
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
1643
|
+
//! any of the provided ranges:
|
|
1644
|
+
//!
|
|
1645
|
+
//! - ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
1646
|
+
//! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
1647
|
+
//! - ``[d_values.Current(), d_values.Current() + num_items)``
|
|
1648
|
+
//! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
|
|
1649
|
+
//!
|
|
1650
|
+
//! * Upon completion, the sorting operation will update the "current"
|
|
1651
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
1652
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
1653
|
+
//! number of key bits specified and the targeted device architecture).
|
|
1654
|
+
//! * @devicestorageP
|
|
1655
|
+
//! * @devicestorage
|
|
1656
|
+
//!
|
|
1657
|
+
//! Snippet
|
|
1658
|
+
//! --------------------------------------------------
|
|
1659
|
+
//!
|
|
1660
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
1661
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
1662
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
1663
|
+
//! tuple of references to relevant members of the key.
|
|
1664
|
+
//!
|
|
1665
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
1666
|
+
//! :language: c++
|
|
1667
|
+
//! :dedent:
|
|
1668
|
+
//! :start-after: example-begin custom-type
|
|
1669
|
+
//! :end-before: example-end custom-type
|
|
1670
|
+
//!
|
|
1671
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
1672
|
+
//! using ``cub::DeviceRadixSort::SortPairsDescending``:
|
|
1673
|
+
//!
|
|
1674
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
1675
|
+
//! :language: c++
|
|
1676
|
+
//! :dedent:
|
|
1677
|
+
//! :start-after: example-begin pairs-descending-db
|
|
1678
|
+
//! :end-before: example-end pairs-descending-db
|
|
1679
|
+
//!
|
|
1680
|
+
//! @endrst
|
|
1681
|
+
//!
|
|
1682
|
+
//! @tparam KeyT
|
|
1683
|
+
//! **[inferred]** KeyT type
|
|
1684
|
+
//!
|
|
1685
|
+
//! @tparam ValueT
|
|
1686
|
+
//! **[inferred]** ValueT type
|
|
1687
|
+
//!
|
|
1688
|
+
//! @tparam NumItemsT
|
|
1689
|
+
//! **[inferred]** Type of num_items
|
|
1690
|
+
//!
|
|
1691
|
+
//! @tparam DecomposerT
|
|
1692
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
1693
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
1694
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
1695
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
1696
|
+
//! The call operator must not modify members of the key.
|
|
1697
|
+
//!
|
|
1698
|
+
//! @param[in] d_temp_storage
|
|
1699
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
1700
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
1701
|
+
//! is done.
|
|
1702
|
+
//!
|
|
1703
|
+
//! @param[in,out] temp_storage_bytes
|
|
1704
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
1705
|
+
//!
|
|
1706
|
+
//! @param[in,out] d_keys
|
|
1707
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1708
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1709
|
+
//! point to the sorted output keys
|
|
1710
|
+
//!
|
|
1711
|
+
//! @param[in,out] d_values
|
|
1712
|
+
//! Double-buffer of values whose "current" device-accessible buffer
|
|
1713
|
+
//! contains the unsorted input values and, upon return, is updated to point
|
|
1714
|
+
//! to the sorted output values
|
|
1715
|
+
//!
|
|
1716
|
+
//! @param[in] num_items
|
|
1717
|
+
//! Number of items to sort
|
|
1718
|
+
//!
|
|
1719
|
+
//! @param decomposer
|
|
1720
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
1721
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
1722
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
1723
|
+
//! modify members of the key.
|
|
1724
|
+
//!
|
|
1725
|
+
//! @param[in] stream
|
|
1726
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
1727
|
+
//! Default is stream<sub>0</sub>.
|
|
1728
|
+
template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
1729
|
+
CUB_RUNTIME_FUNCTION static //
|
|
1730
|
+
::cuda::std::enable_if_t< //
|
|
1731
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
1732
|
+
cudaError_t>
|
|
1733
|
+
SortPairsDescending(
|
|
1734
|
+
void* d_temp_storage,
|
|
1735
|
+
size_t& temp_storage_bytes,
|
|
1736
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
1737
|
+
DoubleBuffer<ValueT>& d_values,
|
|
1738
|
+
NumItemsT num_items,
|
|
1739
|
+
DecomposerT decomposer,
|
|
1740
|
+
cudaStream_t stream = 0)
|
|
1741
|
+
{
|
|
1742
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1743
|
+
|
|
1744
|
+
// unsigned integer type for global offsets
|
|
1745
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
1746
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
1747
|
+
|
|
1748
|
+
static_assert(decomposer_check_t::value,
|
|
1749
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
1750
|
+
"arithmetic types");
|
|
1751
|
+
|
|
1752
|
+
constexpr bool is_overwrite_okay = true;
|
|
1753
|
+
|
|
1754
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
|
|
1755
|
+
decomposer_check_t{},
|
|
1756
|
+
d_temp_storage,
|
|
1757
|
+
temp_storage_bytes,
|
|
1758
|
+
is_overwrite_okay,
|
|
1759
|
+
d_keys,
|
|
1760
|
+
d_values,
|
|
1761
|
+
static_cast<offset_t>(num_items),
|
|
1762
|
+
decomposer,
|
|
1763
|
+
stream);
|
|
1764
|
+
}
|
|
1765
|
+
|
|
1766
|
+
//! @rst
|
|
1767
|
+
//! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage.
|
|
1768
|
+
//!
|
|
1769
|
+
//! * The sorting operation is given a pair of key buffers and a corresponding
|
|
1770
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
1771
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
1772
|
+
//! contains the input data to be sorted).
|
|
1773
|
+
//! * The contents of both buffers within each pair may be altered by the
|
|
1774
|
+
//! sorting operation.
|
|
1775
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
1776
|
+
//! any of the provided ranges:
|
|
1777
|
+
//!
|
|
1778
|
+
//! - ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
1779
|
+
//! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
1780
|
+
//! - ``[d_values.Current(), d_values.Current() + num_items)``
|
|
1781
|
+
//! - ``[d_values.Alternate(), d_values.Alternate() + num_items)``
|
|
1782
|
+
//!
|
|
1783
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
1784
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
1785
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
1786
|
+
//! number of key bits specified and the targeted device architecture).
|
|
1787
|
+
//! - A bit subrange ``[begin_bit, end_bit)`` is provided to specify
//! differentiating key bits. This can reduce overall sorting overhead and
|
|
1789
|
+
//! yield a corresponding performance improvement.
|
|
1790
|
+
//! - @devicestorageP
|
|
1791
|
+
//! - @devicestorage
|
|
1792
|
+
//!
|
|
1793
|
+
//! Snippet
|
|
1794
|
+
//! --------------------------------------------------
|
|
1795
|
+
//!
|
|
1796
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
1797
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
1798
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
1799
|
+
//! tuple of references to relevant members of the key.
|
|
1800
|
+
//!
|
|
1801
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
1802
|
+
//! :language: c++
|
|
1803
|
+
//! :dedent:
|
|
1804
|
+
//! :start-after: example-begin custom-type
|
|
1805
|
+
//! :end-before: example-end custom-type
|
|
1806
|
+
//!
|
|
1807
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
1808
|
+
//! using ``cub::DeviceRadixSort::SortPairsDescending``:
|
|
1809
|
+
//!
|
|
1810
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
1811
|
+
//! :language: c++
|
|
1812
|
+
//! :dedent:
|
|
1813
|
+
//! :start-after: example-begin pairs-descending-bits-db
|
|
1814
|
+
//! :end-before: example-end pairs-descending-bits-db
|
|
1815
|
+
//!
|
|
1816
|
+
//! @endrst
|
|
1817
|
+
//!
|
|
1818
|
+
//! @tparam KeyT
|
|
1819
|
+
//! **[inferred]** KeyT type
|
|
1820
|
+
//!
|
|
1821
|
+
//! @tparam ValueT
|
|
1822
|
+
//! **[inferred]** ValueT type
|
|
1823
|
+
//!
|
|
1824
|
+
//! @tparam NumItemsT
|
|
1825
|
+
//! **[inferred]** Type of num_items
|
|
1826
|
+
//!
|
|
1827
|
+
//! @tparam DecomposerT
|
|
1828
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
1829
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
1830
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
1831
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
1832
|
+
//! The call operator must not modify members of the key.
|
|
1833
|
+
//!
|
|
1834
|
+
//! @param[in] d_temp_storage
|
|
1835
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
1836
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
1837
|
+
//! is done.
|
|
1838
|
+
//!
|
|
1839
|
+
//! @param[in,out] temp_storage_bytes
|
|
1840
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
1841
|
+
//!
|
|
1842
|
+
//! @param[in,out] d_keys
|
|
1843
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1844
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1845
|
+
//! point to the sorted output keys
|
|
1846
|
+
//!
|
|
1847
|
+
//! @param[in,out] d_values
|
|
1848
|
+
//! Double-buffer of values whose "current" device-accessible buffer
|
|
1849
|
+
//! contains the unsorted input values and, upon return, is updated to point
|
|
1850
|
+
//! to the sorted output values
|
|
1851
|
+
//!
|
|
1852
|
+
//! @param[in] num_items
|
|
1853
|
+
//! Number of items to sort
|
|
1854
|
+
//!
|
|
1855
|
+
//! @param decomposer
|
|
1856
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
1857
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
1858
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
1859
|
+
//! modify members of the key.
|
|
1860
|
+
//!
|
|
1861
|
+
//! @param[in] begin_bit
|
|
1862
|
+
//! The least-significant bit index (inclusive) needed for
|
|
1863
|
+
//! key comparison
|
|
1864
|
+
//!
|
|
1865
|
+
//! @param[in] end_bit
|
|
1866
|
+
//! The most-significant bit index (exclusive) needed for key
|
|
1867
|
+
//! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
|
|
1868
|
+
//!
|
|
1869
|
+
//! @param[in] stream
|
|
1870
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
1871
|
+
//! Default is stream<sub>0</sub>.
|
|
1872
|
+
template <typename KeyT, typename ValueT, typename NumItemsT, typename DecomposerT>
|
|
1873
|
+
CUB_RUNTIME_FUNCTION static //
|
|
1874
|
+
::cuda::std::enable_if_t< //
|
|
1875
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
1876
|
+
cudaError_t>
|
|
1877
|
+
SortPairsDescending(
|
|
1878
|
+
void* d_temp_storage,
|
|
1879
|
+
size_t& temp_storage_bytes,
|
|
1880
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
1881
|
+
DoubleBuffer<ValueT>& d_values,
|
|
1882
|
+
NumItemsT num_items,
|
|
1883
|
+
DecomposerT decomposer,
|
|
1884
|
+
int begin_bit,
|
|
1885
|
+
int end_bit,
|
|
1886
|
+
cudaStream_t stream = 0)
|
|
1887
|
+
{
|
|
1888
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1889
|
+
|
|
1890
|
+
// unsigned integer type for global offsets
|
|
1891
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
1892
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
1893
|
+
|
|
1894
|
+
static_assert(decomposer_check_t::value,
|
|
1895
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
1896
|
+
"arithmetic types");
|
|
1897
|
+
|
|
1898
|
+
constexpr bool is_overwrite_okay = true;
|
|
1899
|
+
|
|
1900
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
|
|
1901
|
+
decomposer_check_t{},
|
|
1902
|
+
d_temp_storage,
|
|
1903
|
+
temp_storage_bytes,
|
|
1904
|
+
is_overwrite_okay,
|
|
1905
|
+
d_keys,
|
|
1906
|
+
d_values,
|
|
1907
|
+
static_cast<offset_t>(num_items),
|
|
1908
|
+
decomposer,
|
|
1909
|
+
begin_bit,
|
|
1910
|
+
end_bit,
|
|
1911
|
+
stream);
|
|
1912
|
+
}
|
|
1913
|
+
|
|
1914
|
+
//! @} end member group
|
|
1915
|
+
//! @name Keys-only
|
|
1916
|
+
//! @{
|
|
1917
|
+
|
|
1918
|
+
//! @rst
|
|
1919
|
+
//! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
|
|
1920
|
+
//!
|
|
1921
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
1922
|
+
//! - Pointers to contiguous memory must be used; iterators are not currently
|
|
1923
|
+
//! supported.
|
|
1924
|
+
//! - In-place operations are not supported. There must be no overlap between
|
|
1925
|
+
//! any of the provided ranges:
|
|
1926
|
+
//!
|
|
1927
|
+
//! - ``[d_keys_in, d_keys_in + num_items)``
|
|
1928
|
+
//! - ``[d_keys_out, d_keys_out + num_items)``
|
|
1929
|
+
//!
|
|
1930
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
1931
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
1932
|
+
//! yield a corresponding performance improvement.
|
|
1933
|
+
//! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
|
|
1934
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
1935
|
+
//! - @devicestorage
|
|
1936
|
+
//!
|
|
1937
|
+
//! Snippet
|
|
1938
|
+
//! --------------------------------------------------
|
|
1939
|
+
//!
|
|
1940
|
+
//! The code snippet below illustrates the sorting of a device vector of
|
|
1941
|
+
//! ``int`` keys.
|
|
1942
|
+
//! @endrst
|
|
1943
|
+
//!
|
|
1944
|
+
//! @code{.cpp}
|
|
1945
|
+
//! #include <cub/cub.cuh>
|
|
1946
|
+
//! // or equivalently <cub/device/device_radix_sort.cuh>
|
|
1947
|
+
//!
|
|
1948
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1949
|
+
//! // for sorting data
|
|
1950
|
+
//! int num_items; // e.g., 7
|
|
1951
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1952
|
+
//! int *d_keys_out; // e.g., [ ... ]
|
|
1953
|
+
//! ...
|
|
1954
|
+
//!
|
|
1955
|
+
//! // Determine temporary device storage requirements
|
|
1956
|
+
//! void *d_temp_storage = nullptr;
|
|
1957
|
+
//! size_t temp_storage_bytes = 0;
|
|
1958
|
+
//! cub::DeviceRadixSort::SortKeys(
|
|
1959
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
|
|
1960
|
+
//!
|
|
1961
|
+
//! // Allocate temporary storage
|
|
1962
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1963
|
+
//!
|
|
1964
|
+
//! // Run sorting operation
|
|
1965
|
+
//! cub::DeviceRadixSort::SortKeys(
|
|
1966
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
|
|
1967
|
+
//!
|
|
1968
|
+
//! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9]
|
|
1969
|
+
//! @endcode
|
|
1970
|
+
//!
|
|
1971
|
+
//! @tparam KeyT
|
|
1972
|
+
//! **[inferred]** KeyT type
|
|
1973
|
+
//!
|
|
1974
|
+
//! @tparam NumItemsT
//! **[inferred]** Type of num_items
|
|
1979
|
+
//!
|
|
1980
|
+
//! @param[in] d_temp_storage
|
|
1981
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
1982
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
1983
|
+
//! is done.
|
|
1984
|
+
//!
|
|
1985
|
+
//! @param[in,out] temp_storage_bytes
|
|
1986
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
1987
|
+
//!
|
|
1988
|
+
//! @param[in] d_keys_in
|
|
1989
|
+
//! Pointer to the input sequence of key data to sort
|
|
1990
|
+
//!
|
|
1991
|
+
//! @param[out] d_keys_out
|
|
1992
|
+
//! Pointer to the sorted output sequence of key data
|
|
1993
|
+
//!
|
|
1994
|
+
//! @param[in] num_items
|
|
1995
|
+
//! Number of items to sort
|
|
1996
|
+
//!
|
|
1997
|
+
//! @param[in] begin_bit
|
|
1998
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
1999
|
+
//! key comparison
|
|
2000
|
+
//!
|
|
2001
|
+
//! @param[in] end_bit
|
|
2002
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
2003
|
+
//! comparison (e.g., ``sizeof(unsigned int) * 8``)
|
|
2004
|
+
//!
|
|
2005
|
+
//! @param[in] stream
|
|
2006
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
2007
|
+
//! Default is stream<sub>0</sub>.
|
|
2008
|
+
template <typename KeyT, typename NumItemsT>
|
|
2009
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
|
|
2010
|
+
void* d_temp_storage,
|
|
2011
|
+
size_t& temp_storage_bytes,
|
|
2012
|
+
const KeyT* d_keys_in,
|
|
2013
|
+
KeyT* d_keys_out,
|
|
2014
|
+
NumItemsT num_items,
|
|
2015
|
+
int begin_bit = 0,
|
|
2016
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
2017
|
+
cudaStream_t stream = 0)
|
|
2018
|
+
{
|
|
2019
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
2020
|
+
|
|
2021
|
+
// Unsigned integer type for global offsets.
|
|
2022
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
2023
|
+
|
|
2024
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
2025
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
2026
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
2027
|
+
// is not set.
|
|
2028
|
+
constexpr bool is_overwrite_okay = false;
|
|
2029
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
2030
|
+
// Null value type
|
|
2031
|
+
DoubleBuffer<NullType> d_values;
|
|
2032
|
+
|
|
2033
|
+
return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
|
|
2034
|
+
d_temp_storage,
|
|
2035
|
+
temp_storage_bytes,
|
|
2036
|
+
d_keys,
|
|
2037
|
+
d_values,
|
|
2038
|
+
static_cast<OffsetT>(num_items),
|
|
2039
|
+
begin_bit,
|
|
2040
|
+
end_bit,
|
|
2041
|
+
is_overwrite_okay,
|
|
2042
|
+
stream);
|
|
2043
|
+
}
|
|
2044
|
+
|
|
2045
|
+
//! @rst
|
|
2046
|
+
//! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
|
|
2047
|
+
//!
|
|
2048
|
+
//! * The contents of the input data are not altered by the sorting operation.
|
|
2049
|
+
//! * Pointers to contiguous memory must be used; iterators are not currently
|
|
2050
|
+
//! supported.
|
|
2051
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
2052
|
+
//! any of the provided ranges:
|
|
2053
|
+
//!
|
|
2054
|
+
//! * ``[d_keys_in, d_keys_in + num_items)``
|
|
2055
|
+
//! * ``[d_keys_out, d_keys_out + num_items)``
|
|
2056
|
+
//!
|
|
2057
|
+
//! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
|
|
2058
|
+
//! differentiating key bits. This can reduce overall sorting overhead and
|
|
2059
|
+
//! yield a corresponding performance improvement.
|
|
2060
|
+
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
|
|
2061
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
2062
|
+
//! * @devicestorage
|
|
2063
|
+
//!
|
|
2064
|
+
//! Snippet
|
|
2065
|
+
//! --------------------------------------------------
|
|
2066
|
+
//!
|
|
2067
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
2068
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
2069
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
2070
|
+
//! tuple of references to relevant members of the key.
|
|
2071
|
+
//!
|
|
2072
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2073
|
+
//! :language: c++
|
|
2074
|
+
//! :dedent:
|
|
2075
|
+
//! :start-after: example-begin custom-type
|
|
2076
|
+
//! :end-before: example-end custom-type
|
|
2077
|
+
//!
|
|
2078
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
2079
|
+
//! using ``cub::DeviceRadixSort::SortKeys``:
|
|
2080
|
+
//!
|
|
2081
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2082
|
+
//! :language: c++
|
|
2083
|
+
//! :dedent:
|
|
2084
|
+
//! :start-after: example-begin keys-bits
|
|
2085
|
+
//! :end-before: example-end keys-bits
|
|
2086
|
+
//!
|
|
2087
|
+
//! @endrst
|
|
2088
|
+
//!
|
|
2089
|
+
//! @tparam KeyT
|
|
2090
|
+
//! **[inferred]** KeyT type
|
|
2091
|
+
//!
|
|
2092
|
+
//! @tparam NumItemsT
|
|
2093
|
+
//! **[inferred]** Type of num_items
|
|
2094
|
+
//!
|
|
2095
|
+
//! @tparam DecomposerT
|
|
2096
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
2097
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
2098
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
2099
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
2100
|
+
//! The call operator must not modify members of the key.
|
|
2101
|
+
//!
|
|
2102
|
+
//! @param[in] d_temp_storage
|
|
2103
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
2104
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
2105
|
+
//! is done.
|
|
2106
|
+
//!
|
|
2107
|
+
//! @param[in,out] temp_storage_bytes
|
|
2108
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
2109
|
+
//!
|
|
2110
|
+
//! @param[in] d_keys_in
|
|
2111
|
+
//! Pointer to the input sequence of key data to sort
|
|
2112
|
+
//!
|
|
2113
|
+
//! @param[out] d_keys_out
|
|
2114
|
+
//! Pointer to the sorted output sequence of key data
|
|
2115
|
+
//!
|
|
2116
|
+
//! @param[in] num_items
|
|
2117
|
+
//! Number of items to sort
|
|
2118
|
+
//!
|
|
2119
|
+
//! @param decomposer
|
|
2120
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
2121
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
2122
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
2123
|
+
//! modify members of the key.
|
|
2124
|
+
//!
|
|
2125
|
+
//! @param[in] begin_bit
|
|
2126
|
+
//! The least-significant bit index (inclusive) needed for
|
|
2127
|
+
//! key comparison
|
|
2128
|
+
//!
|
|
2129
|
+
//! @param[in] end_bit
|
|
2130
|
+
//! The most-significant bit index (exclusive) needed for key
|
|
2131
|
+
//! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
|
|
2132
|
+
//!
|
|
2133
|
+
//! @param[in] stream
|
|
2134
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
2135
|
+
//! Default is stream<sub>0</sub>.
|
|
2136
|
+
template <typename KeyT, typename NumItemsT, typename DecomposerT>
|
|
2137
|
+
CUB_RUNTIME_FUNCTION static //
|
|
2138
|
+
::cuda::std::enable_if_t< //
|
|
2139
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
2140
|
+
cudaError_t>
|
|
2141
|
+
SortKeys(void* d_temp_storage,
|
|
2142
|
+
size_t& temp_storage_bytes,
|
|
2143
|
+
const KeyT* d_keys_in,
|
|
2144
|
+
KeyT* d_keys_out,
|
|
2145
|
+
NumItemsT num_items,
|
|
2146
|
+
DecomposerT decomposer,
|
|
2147
|
+
int begin_bit,
|
|
2148
|
+
int end_bit,
|
|
2149
|
+
cudaStream_t stream = 0)
|
|
2150
|
+
{
|
|
2151
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
2152
|
+
|
|
2153
|
+
// unsigned integer type for global offsets
|
|
2154
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
2155
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
2156
|
+
|
|
2157
|
+
static_assert(decomposer_check_t::value,
|
|
2158
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
2159
|
+
"arithmetic types");
|
|
2160
|
+
|
|
2161
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
2162
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
2163
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
2164
|
+
// is not set.
|
|
2165
|
+
constexpr bool is_overwrite_okay = false;
|
|
2166
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
2167
|
+
DoubleBuffer<NullType> d_values;
|
|
2168
|
+
|
|
2169
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
|
|
2170
|
+
decomposer_check_t{},
|
|
2171
|
+
d_temp_storage,
|
|
2172
|
+
temp_storage_bytes,
|
|
2173
|
+
is_overwrite_okay,
|
|
2174
|
+
d_keys,
|
|
2175
|
+
d_values,
|
|
2176
|
+
static_cast<offset_t>(num_items),
|
|
2177
|
+
decomposer,
|
|
2178
|
+
begin_bit,
|
|
2179
|
+
end_bit,
|
|
2180
|
+
stream);
|
|
2181
|
+
}
|
|
2182
|
+
|
|
2183
|
+
//! @rst
|
|
2184
|
+
//! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage.
|
|
2185
|
+
//!
|
|
2186
|
+
//! * The contents of the input data are not altered by the sorting operation.
|
|
2187
|
+
//! * Pointers to contiguous memory must be used; iterators are not currently
|
|
2188
|
+
//! supported.
|
|
2189
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
2190
|
+
//! any of the provided ranges:
|
|
2191
|
+
//!
|
|
2192
|
+
//! * ``[d_keys_in, d_keys_in + num_items)``
|
|
2193
|
+
//! * ``[d_keys_out, d_keys_out + num_items)``
|
|
2194
|
+
//!
|
|
2195
|
+
//! * This overload takes no ``begin_bit``/``end_bit`` arguments. To restrict
//! sorting to a bit subrange ``[begin_bit, end_bit)`` of differentiating key
//! bits, use the ``SortKeys`` overload that accepts ``begin_bit`` and ``end_bit``.
|
|
2198
|
+
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
|
|
2199
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
2200
|
+
//! * @devicestorage
|
|
2201
|
+
//!
|
|
2202
|
+
//! Snippet
|
|
2203
|
+
//! --------------------------------------------------
|
|
2204
|
+
//!
|
|
2205
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
2206
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
2207
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
2208
|
+
//! tuple of references to relevant members of the key.
|
|
2209
|
+
//!
|
|
2210
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2211
|
+
//! :language: c++
|
|
2212
|
+
//! :dedent:
|
|
2213
|
+
//! :start-after: example-begin custom-type
|
|
2214
|
+
//! :end-before: example-end custom-type
|
|
2215
|
+
//!
|
|
2216
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
2217
|
+
//! using ``cub::DeviceRadixSort::SortKeys``:
|
|
2218
|
+
//!
|
|
2219
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2220
|
+
//! :language: c++
|
|
2221
|
+
//! :dedent:
|
|
2222
|
+
//! :start-after: example-begin keys
|
|
2223
|
+
//! :end-before: example-end keys
|
|
2224
|
+
//!
|
|
2225
|
+
//! @endrst
|
|
2226
|
+
//!
|
|
2227
|
+
//! @tparam KeyT
|
|
2228
|
+
//! **[inferred]** KeyT type
|
|
2229
|
+
//!
|
|
2230
|
+
//! @tparam NumItemsT
|
|
2231
|
+
//! **[inferred]** Type of num_items
|
|
2232
|
+
//!
|
|
2233
|
+
//! @tparam DecomposerT
|
|
2234
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
2235
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
2236
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
2237
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
2238
|
+
//! The call operator must not modify members of the key.
|
|
2239
|
+
//!
|
|
2240
|
+
//! @param[in] d_temp_storage
|
|
2241
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
2242
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
2243
|
+
//! is done.
|
|
2244
|
+
//!
|
|
2245
|
+
//! @param[in,out] temp_storage_bytes
|
|
2246
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
2247
|
+
//!
|
|
2248
|
+
//! @param[in] d_keys_in
|
|
2249
|
+
//! Pointer to the input sequence of key data to sort
|
|
2250
|
+
//!
|
|
2251
|
+
//! @param[out] d_keys_out
|
|
2252
|
+
//! Pointer to the sorted output sequence of key data
|
|
2253
|
+
//!
|
|
2254
|
+
//! @param[in] num_items
|
|
2255
|
+
//! Number of items to sort
|
|
2256
|
+
//!
|
|
2257
|
+
//! @param decomposer
|
|
2258
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
2259
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
2260
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
2261
|
+
//! modify members of the key.
|
|
2262
|
+
//!
|
|
2263
|
+
//! @param[in] stream
|
|
2264
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
2265
|
+
//! Default is stream<sub>0</sub>.
|
|
2266
|
+
template <typename KeyT, typename NumItemsT, typename DecomposerT>
|
|
2267
|
+
CUB_RUNTIME_FUNCTION static //
|
|
2268
|
+
::cuda::std::enable_if_t< //
|
|
2269
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
2270
|
+
cudaError_t>
|
|
2271
|
+
SortKeys(void* d_temp_storage,
|
|
2272
|
+
size_t& temp_storage_bytes,
|
|
2273
|
+
const KeyT* d_keys_in,
|
|
2274
|
+
KeyT* d_keys_out,
|
|
2275
|
+
NumItemsT num_items,
|
|
2276
|
+
DecomposerT decomposer,
|
|
2277
|
+
cudaStream_t stream = 0)
|
|
2278
|
+
{
|
|
2279
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
2280
|
+
|
|
2281
|
+
// unsigned integer type for global offsets
|
|
2282
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
2283
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
2284
|
+
|
|
2285
|
+
static_assert(decomposer_check_t::value,
|
|
2286
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
2287
|
+
"arithmetic types");
|
|
2288
|
+
|
|
2289
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
2290
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
2291
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
2292
|
+
// is not set.
|
|
2293
|
+
constexpr bool is_overwrite_okay = false;
|
|
2294
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
2295
|
+
DoubleBuffer<NullType> d_values;
|
|
2296
|
+
|
|
2297
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
|
|
2298
|
+
decomposer_check_t{},
|
|
2299
|
+
d_temp_storage,
|
|
2300
|
+
temp_storage_bytes,
|
|
2301
|
+
is_overwrite_okay,
|
|
2302
|
+
d_keys,
|
|
2303
|
+
d_values,
|
|
2304
|
+
static_cast<offset_t>(num_items),
|
|
2305
|
+
decomposer,
|
|
2306
|
+
stream);
|
|
2307
|
+
}
|
|
2308
|
+
|
|
2309
|
+
//! @rst
|
|
2310
|
+
//! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
|
|
2311
|
+
//!
|
|
2312
|
+
//! - The sorting operation is given a pair of key buffers managed by a
|
|
2313
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
2314
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
2315
|
+
//! - The contents of both buffers may be altered by the sorting operation.
|
|
2316
|
+
//! - In-place operations are not supported. There must be no overlap between
|
|
2317
|
+
//! any of the provided ranges:
|
|
2318
|
+
//!
|
|
2319
|
+
//! - ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
2320
|
+
//! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
2321
|
+
//!
|
|
2322
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
2323
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
2324
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
2325
|
+
//! number of key bits specified and the targeted device architecture).
|
|
2326
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
2327
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
2328
|
+
//! yield a corresponding performance improvement.
|
|
2329
|
+
//! - @devicestorageP
|
|
2330
|
+
//! - @devicestorage
|
|
2331
|
+
//!
|
|
2332
|
+
//! Snippet
|
|
2333
|
+
//! --------------------------------------------------
|
|
2334
|
+
//!
|
|
2335
|
+
//! The code snippet below illustrates the sorting of a device vector of
|
|
2336
|
+
//! ``int`` keys.
|
|
2337
|
+
//! @endrst
|
|
2338
|
+
//!
|
|
2339
|
+
//! @code{.cpp}
|
|
2340
|
+
//! #include <cub/cub.cuh>
|
|
2341
|
+
//! // or equivalently <cub/device/device_radix_sort.cuh>
|
|
2342
|
+
//!
|
|
2343
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
2344
|
+
//! // for sorting data
|
|
2345
|
+
//! int num_items; // e.g., 7
|
|
2346
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
2347
|
+
//! int *d_key_alt_buf; // e.g., [ ... ]
|
|
2348
|
+
//! ...
|
|
2349
|
+
//!
|
|
2350
|
+
//! // Create a DoubleBuffer to wrap the pair of device pointers
|
|
2351
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
2352
|
+
//!
|
|
2353
|
+
//! // Determine temporary device storage requirements
|
|
2354
|
+
//! void *d_temp_storage = nullptr;
|
|
2355
|
+
//! size_t temp_storage_bytes = 0;
|
|
2356
|
+
//! cub::DeviceRadixSort::SortKeys(
|
|
2357
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, num_items);
|
|
2358
|
+
//!
|
|
2359
|
+
//! // Allocate temporary storage
|
|
2360
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
2361
|
+
//!
|
|
2362
|
+
//! // Run sorting operation
|
|
2363
|
+
//! cub::DeviceRadixSort::SortKeys(
|
|
2364
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, num_items);
|
|
2365
|
+
//!
|
|
2366
|
+
//! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
|
|
2367
|
+
//! @endcode
|
|
2368
|
+
//!
|
|
2369
|
+
//! @tparam KeyT
|
|
2370
|
+
//! **[inferred]** KeyT type
|
|
2371
|
+
//!
|
|
2372
|
+
//! @tparam NumItemsT
|
|
2373
|
+
//! **[inferred]** Type of num_items
|
|
2374
|
+
//!
|
|
2375
|
+
//! @param[in] d_temp_storage
|
|
2376
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
2377
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
2378
|
+
//! is done.
|
|
2379
|
+
//!
|
|
2380
|
+
//! @param[in,out] temp_storage_bytes
|
|
2381
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
2382
|
+
//!
|
|
2383
|
+
//! @param[in,out] d_keys
|
|
2384
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
2385
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
2386
|
+
//! point to the sorted output keys
|
|
2387
|
+
//!
|
|
2388
|
+
//! @param[in] num_items
|
|
2389
|
+
//! Number of items to sort
|
|
2390
|
+
//!
|
|
2391
|
+
//! @param[in] begin_bit
|
|
2392
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
2393
|
+
//! key comparison
|
|
2394
|
+
//!
|
|
2395
|
+
//! @param[in] end_bit
|
|
2396
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
2397
|
+
//! comparison (e.g., ``sizeof(unsigned int) * 8``)
|
|
2398
|
+
//!
|
|
2399
|
+
//! @param[in] stream
|
|
2400
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
2401
|
+
//! Default is stream<sub>0</sub>.
|
|
2402
|
+
template <typename KeyT, typename NumItemsT>
|
|
2403
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
|
|
2404
|
+
void* d_temp_storage,
|
|
2405
|
+
size_t& temp_storage_bytes,
|
|
2406
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
2407
|
+
NumItemsT num_items,
|
|
2408
|
+
int begin_bit = 0,
|
|
2409
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
2410
|
+
cudaStream_t stream = 0)
|
|
2411
|
+
{
|
|
2412
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
2413
|
+
|
|
2414
|
+
// Unsigned integer type for global offsets.
|
|
2415
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
2416
|
+
|
|
2417
|
+
constexpr bool is_overwrite_okay = true;
|
|
2418
|
+
|
|
2419
|
+
// Null value type
|
|
2420
|
+
DoubleBuffer<NullType> d_values;
|
|
2421
|
+
|
|
2422
|
+
return DispatchRadixSort<SortOrder::Ascending, KeyT, NullType, OffsetT>::Dispatch(
|
|
2423
|
+
d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
|
|
2424
|
+
}
|
|
2425
|
+
|
|
2426
|
+
//! @rst
|
|
2427
|
+
//! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
|
|
2428
|
+
//!
|
|
2429
|
+
//! * The sorting operation is given a pair of key buffers managed by a
|
|
2430
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
2431
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
2432
|
+
//! * The contents of both buffers may be altered by the sorting operation.
|
|
2433
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
2434
|
+
//! any of the provided ranges:
|
|
2435
|
+
//!
|
|
2436
|
+
//! * ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
2437
|
+
//! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
2438
|
+
//!
|
|
2439
|
+
//! * Upon completion, the sorting operation will update the "current"
|
|
2440
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
2441
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
2442
|
+
//! number of key bits specified and the targeted device architecture).
|
|
2443
|
+
//! * @devicestorageP
|
|
2444
|
+
//! * @devicestorage
|
|
2445
|
+
//!
|
|
2446
|
+
//! Snippet
|
|
2447
|
+
//! --------------------------------------------------
|
|
2448
|
+
//!
|
|
2449
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
2450
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
2451
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
2452
|
+
//! tuple of references to relevant members of the key.
|
|
2453
|
+
//!
|
|
2454
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2455
|
+
//! :language: c++
|
|
2456
|
+
//! :dedent:
|
|
2457
|
+
//! :start-after: example-begin custom-type
|
|
2458
|
+
//! :end-before: example-end custom-type
|
|
2459
|
+
//!
|
|
2460
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
2461
|
+
//! using ``cub::DeviceRadixSort::SortKeys``:
|
|
2462
|
+
//!
|
|
2463
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2464
|
+
//! :language: c++
|
|
2465
|
+
//! :dedent:
|
|
2466
|
+
//! :start-after: example-begin keys-db
|
|
2467
|
+
//! :end-before: example-end keys-db
|
|
2468
|
+
//!
|
|
2469
|
+
//! @endrst
|
|
2470
|
+
//!
|
|
2471
|
+
//! @tparam KeyT
|
|
2472
|
+
//! **[inferred]** KeyT type
|
|
2473
|
+
//!
|
|
2474
|
+
//! @tparam NumItemsT
|
|
2475
|
+
//! **[inferred]** Type of num_items
|
|
2476
|
+
//!
|
|
2477
|
+
//! @tparam DecomposerT
|
|
2478
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
2479
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
2480
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
2481
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
2482
|
+
//! The call operator must not modify members of the key.
|
|
2483
|
+
//!
|
|
2484
|
+
//! @param[in] d_temp_storage
|
|
2485
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
2486
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
2487
|
+
//! is done.
|
|
2488
|
+
//!
|
|
2489
|
+
//! @param[in,out] temp_storage_bytes
|
|
2490
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
2491
|
+
//!
|
|
2492
|
+
//! @param[in,out] d_keys
|
|
2493
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
2494
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
2495
|
+
//! point to the sorted output keys
|
|
2496
|
+
//!
|
|
2497
|
+
//! @param[in] num_items
|
|
2498
|
+
//! Number of items to sort
|
|
2499
|
+
//!
|
|
2500
|
+
//! @param decomposer
|
|
2501
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
2502
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
2503
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
2504
|
+
//! modify members of the key.
|
|
2505
|
+
//!
|
|
2506
|
+
//! @param[in] stream
|
|
2507
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
2508
|
+
//! Default is stream<sub>0</sub>.
|
|
2509
|
+
template <typename KeyT, typename NumItemsT, typename DecomposerT>
|
|
2510
|
+
CUB_RUNTIME_FUNCTION static //
|
|
2511
|
+
::cuda::std::enable_if_t< //
|
|
2512
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
2513
|
+
cudaError_t>
|
|
2514
|
+
SortKeys(void* d_temp_storage,
|
|
2515
|
+
size_t& temp_storage_bytes,
|
|
2516
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
2517
|
+
NumItemsT num_items,
|
|
2518
|
+
DecomposerT decomposer,
|
|
2519
|
+
cudaStream_t stream = 0)
|
|
2520
|
+
{
|
|
2521
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
2522
|
+
|
|
2523
|
+
// unsigned integer type for global offsets
|
|
2524
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
2525
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
2526
|
+
|
|
2527
|
+
static_assert(decomposer_check_t::value,
|
|
2528
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
2529
|
+
"arithmetic types");
|
|
2530
|
+
|
|
2531
|
+
constexpr bool is_overwrite_okay = true;
|
|
2532
|
+
DoubleBuffer<NullType> d_values;
|
|
2533
|
+
|
|
2534
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
|
|
2535
|
+
decomposer_check_t{},
|
|
2536
|
+
d_temp_storage,
|
|
2537
|
+
temp_storage_bytes,
|
|
2538
|
+
is_overwrite_okay,
|
|
2539
|
+
d_keys,
|
|
2540
|
+
d_values,
|
|
2541
|
+
static_cast<offset_t>(num_items),
|
|
2542
|
+
decomposer,
|
|
2543
|
+
stream);
|
|
2544
|
+
}
|
|
2545
|
+
|
|
2546
|
+
//! @rst
|
|
2547
|
+
//! Sorts keys into ascending order using :math:`\approx N` auxiliary storage.
|
|
2548
|
+
//!
|
|
2549
|
+
//! * The sorting operation is given a pair of key buffers managed by a
|
|
2550
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
2551
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
2552
|
+
//! * The contents of both buffers may be altered by the sorting operation.
|
|
2553
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
2554
|
+
//! any of the provided ranges:
|
|
2555
|
+
//!
|
|
2556
|
+
//! * ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
2557
|
+
//! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
2558
|
+
//!
|
|
2559
|
+
//! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
|
|
2560
|
+
//! differentiating key bits. This can reduce overall sorting overhead and
|
|
2561
|
+
//! yield a corresponding performance improvement.
|
|
2562
|
+
//! * Upon completion, the sorting operation will update the "current"
|
|
2563
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
2564
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
2565
|
+
//! number of key bits specified and the targeted device architecture).
|
|
2566
|
+
//! * @devicestorageP
|
|
2567
|
+
//! * @devicestorage
|
|
2568
|
+
//!
|
|
2569
|
+
//! Snippet
|
|
2570
|
+
//! --------------------------------------------------
|
|
2571
|
+
//!
|
|
2572
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
2573
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
2574
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
2575
|
+
//! tuple of references to relevant members of the key.
|
|
2576
|
+
//!
|
|
2577
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2578
|
+
//! :language: c++
|
|
2579
|
+
//! :dedent:
|
|
2580
|
+
//! :start-after: example-begin custom-type
|
|
2581
|
+
//! :end-before: example-end custom-type
|
|
2582
|
+
//!
|
|
2583
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
2584
|
+
//! using ``cub::DeviceRadixSort::SortKeys``:
|
|
2585
|
+
//!
|
|
2586
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2587
|
+
//! :language: c++
|
|
2588
|
+
//! :dedent:
|
|
2589
|
+
//! :start-after: example-begin keys-bits-db
|
|
2590
|
+
//! :end-before: example-end keys-bits-db
|
|
2591
|
+
//!
|
|
2592
|
+
//! @endrst
|
|
2593
|
+
//!
|
|
2594
|
+
//! @tparam KeyT
|
|
2595
|
+
//! **[inferred]** KeyT type
|
|
2596
|
+
//!
|
|
2597
|
+
//! @tparam NumItemsT
|
|
2598
|
+
//! **[inferred]** Type of num_items
|
|
2599
|
+
//!
|
|
2600
|
+
//! @tparam DecomposerT
|
|
2601
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
2602
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
2603
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
2604
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
2605
|
+
//! The call operator must not modify members of the key.
|
|
2606
|
+
//!
|
|
2607
|
+
//! @param[in] d_temp_storage
|
|
2608
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
2609
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
2610
|
+
//! is done.
|
|
2611
|
+
//!
|
|
2612
|
+
//! @param[in,out] temp_storage_bytes
|
|
2613
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
2614
|
+
//!
|
|
2615
|
+
//! @param[in,out] d_keys
|
|
2616
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
2617
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
2618
|
+
//! point to the sorted output keys
|
|
2619
|
+
//!
|
|
2620
|
+
//! @param[in] num_items
|
|
2621
|
+
//! Number of items to sort
|
|
2622
|
+
//!
|
|
2623
|
+
//! @param decomposer
|
|
2624
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
2625
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
2626
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
2627
|
+
//! modify members of the key.
|
|
2628
|
+
//!
|
|
2629
|
+
//! @param[in] begin_bit
|
|
2630
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
2631
|
+
//! key comparison
|
|
2632
|
+
//!
|
|
2633
|
+
//! @param[in] end_bit
|
|
2634
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
2635
|
+
//! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
|
|
2636
|
+
//!
|
|
2637
|
+
//! @param[in] stream
|
|
2638
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
2639
|
+
//! Default is stream<sub>0</sub>.
|
|
2640
|
+
template <typename KeyT, typename NumItemsT, typename DecomposerT>
|
|
2641
|
+
CUB_RUNTIME_FUNCTION static //
|
|
2642
|
+
::cuda::std::enable_if_t< //
|
|
2643
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
2644
|
+
cudaError_t>
|
|
2645
|
+
SortKeys(void* d_temp_storage,
|
|
2646
|
+
size_t& temp_storage_bytes,
|
|
2647
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
2648
|
+
NumItemsT num_items,
|
|
2649
|
+
DecomposerT decomposer,
|
|
2650
|
+
int begin_bit,
|
|
2651
|
+
int end_bit,
|
|
2652
|
+
cudaStream_t stream = 0)
|
|
2653
|
+
{
|
|
2654
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
2655
|
+
|
|
2656
|
+
// unsigned integer type for global offsets
|
|
2657
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
2658
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
2659
|
+
|
|
2660
|
+
static_assert(decomposer_check_t::value,
|
|
2661
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
2662
|
+
"arithmetic types");
|
|
2663
|
+
|
|
2664
|
+
constexpr bool is_overwrite_okay = true;
|
|
2665
|
+
DoubleBuffer<NullType> d_values;
|
|
2666
|
+
|
|
2667
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Ascending>(
|
|
2668
|
+
decomposer_check_t{},
|
|
2669
|
+
d_temp_storage,
|
|
2670
|
+
temp_storage_bytes,
|
|
2671
|
+
is_overwrite_okay,
|
|
2672
|
+
d_keys,
|
|
2673
|
+
d_values,
|
|
2674
|
+
static_cast<offset_t>(num_items),
|
|
2675
|
+
decomposer,
|
|
2676
|
+
begin_bit,
|
|
2677
|
+
end_bit,
|
|
2678
|
+
stream);
|
|
2679
|
+
}
|
|
2680
|
+
|
|
2681
|
+
//! @rst Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
|
|
2682
|
+
//!
|
|
2683
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
2684
|
+
//! - Pointers to contiguous memory must be used; iterators are not currently
|
|
2685
|
+
//! supported.
|
|
2686
|
+
//! - In-place operations are not supported. There must be no overlap between
|
|
2687
|
+
//! any of the provided ranges:
|
|
2688
|
+
//!
|
|
2689
|
+
//! - ``[d_keys_in, d_keys_in + num_items)``
|
|
2690
|
+
//! - ``[d_keys_out, d_keys_out + num_items)``
|
|
2691
|
+
//!
|
|
2692
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
2693
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
2694
|
+
//! yield a corresponding performance improvement.
|
|
2695
|
+
//! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
|
|
2696
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
2697
|
+
//! - @devicestorage
|
|
2698
|
+
//!
|
|
2699
|
+
//! Snippet
|
|
2700
|
+
//! --------------------------------------------------
|
|
2701
|
+
//!
|
|
2702
|
+
//! The code snippet below illustrates the sorting of a device vector of
|
|
2703
|
+
//! ``int`` keys.
|
|
2704
|
+
//! @endrst
|
|
2705
|
+
//!
|
|
2706
|
+
//! @code{.cpp}
|
|
2707
|
+
//! #include <cub/cub.cuh>
|
|
2708
|
+
//! // or equivalently <cub/device/device_radix_sort.cuh>
|
|
2709
|
+
//!
|
|
2710
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
2711
|
+
//! // for sorting data
|
|
2712
|
+
//! int num_items; // e.g., 7
|
|
2713
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
2714
|
+
//! int *d_keys_out; // e.g., [ ... ]
|
|
2715
|
+
//! ...
|
|
2716
|
+
//!
|
|
2717
|
+
//! // No DoubleBuffer wrapper is needed: this overload takes the input and
|
|
2718
|
+
//! // output pointers (d_keys_in and d_keys_out) directly.
|
|
2719
|
+
//!
|
|
2720
|
+
//! // Determine temporary device storage requirements
|
|
2721
|
+
//! void *d_temp_storage = nullptr;
|
|
2722
|
+
//! size_t temp_storage_bytes = 0;
|
|
2723
|
+
//! cub::DeviceRadixSort::SortKeysDescending(
|
|
2724
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
|
|
2725
|
+
//!
|
|
2726
|
+
//! // Allocate temporary storage
|
|
2727
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
2728
|
+
//!
|
|
2729
|
+
//! // Run sorting operation
|
|
2730
|
+
//! cub::DeviceRadixSort::SortKeysDescending(
|
|
2731
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
|
|
2732
|
+
//!
|
|
2733
|
+
//! // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
|
|
2734
|
+
//! @endcode
|
|
2735
|
+
//!
|
|
2736
|
+
//! @tparam KeyT
|
|
2737
|
+
//! **[inferred]** KeyT type
|
|
2738
|
+
//!
|
|
2739
|
+
//! @tparam NumItemsT
|
|
2740
|
+
//! **[inferred]** Type of num_items
|
|
2741
|
+
//!
|
|
2742
|
+
//! @param[in] d_temp_storage
|
|
2743
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
2744
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
2745
|
+
//! is done.
|
|
2746
|
+
//!
|
|
2747
|
+
//! @param[in,out] temp_storage_bytes
|
|
2748
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
2749
|
+
//!
|
|
2750
|
+
//! @param[in] d_keys_in
|
|
2751
|
+
//! Pointer to the input sequence of key data to sort
|
|
2752
|
+
//!
|
|
2753
|
+
//! @param[out] d_keys_out
|
|
2754
|
+
//! Pointer to the sorted output sequence of key data
|
|
2755
|
+
//!
|
|
2756
|
+
//! @param[in] num_items
|
|
2757
|
+
//! Number of items to sort
|
|
2758
|
+
//!
|
|
2759
|
+
//! @param[in] begin_bit
|
|
2760
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
2761
|
+
//! key comparison
|
|
2762
|
+
//!
|
|
2763
|
+
//! @param[in] end_bit
|
|
2764
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
2765
|
+
//! comparison (e.g., ``sizeof(unsigned int) * 8``)
|
|
2766
|
+
//!
|
|
2767
|
+
//! @param[in] stream
|
|
2768
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
2769
|
+
//! Default is stream<sub>0</sub>.
|
|
2770
|
+
template <typename KeyT, typename NumItemsT>
|
|
2771
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
|
|
2772
|
+
void* d_temp_storage,
|
|
2773
|
+
size_t& temp_storage_bytes,
|
|
2774
|
+
const KeyT* d_keys_in,
|
|
2775
|
+
KeyT* d_keys_out,
|
|
2776
|
+
NumItemsT num_items,
|
|
2777
|
+
int begin_bit = 0,
|
|
2778
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
2779
|
+
cudaStream_t stream = 0)
|
|
2780
|
+
{
|
|
2781
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
2782
|
+
|
|
2783
|
+
// Unsigned integer type for global offsets.
|
|
2784
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
2785
|
+
|
|
2786
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
2787
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
2788
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
2789
|
+
// is not set.
|
|
2790
|
+
constexpr bool is_overwrite_okay = false;
|
|
2791
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
2792
|
+
DoubleBuffer<NullType> d_values;
|
|
2793
|
+
|
|
2794
|
+
return DispatchRadixSort<SortOrder::Descending, KeyT, NullType, OffsetT>::Dispatch(
|
|
2795
|
+
d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
|
|
2796
|
+
}
|
|
2797
|
+
|
|
2798
|
+
//! @rst
|
|
2799
|
+
//! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
|
|
2800
|
+
//!
|
|
2801
|
+
//! * The contents of the input data are not altered by the sorting operation.
|
|
2802
|
+
//! * Pointers to contiguous memory must be used; iterators are not currently
|
|
2803
|
+
//! supported.
|
|
2804
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
2805
|
+
//! any of the provided ranges:
|
|
2806
|
+
//!
|
|
2807
|
+
//! * ``[d_keys_in, d_keys_in + num_items)``
|
|
2808
|
+
//! * ``[d_keys_out, d_keys_out + num_items)``
|
|
2809
|
+
//!
|
|
2810
|
+
//! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
2811
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
2812
|
+
//! yield a corresponding performance improvement.
|
|
2813
|
+
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
|
|
2814
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
2815
|
+
//! * @devicestorage
|
|
2816
|
+
//!
|
|
2817
|
+
//! Snippet
|
|
2818
|
+
//! --------------------------------------------------
|
|
2819
|
+
//!
|
|
2820
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
2821
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
2822
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
2823
|
+
//! tuple of references to relevant members of the key.
|
|
2824
|
+
//!
|
|
2825
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2826
|
+
//! :language: c++
|
|
2827
|
+
//! :dedent:
|
|
2828
|
+
//! :start-after: example-begin custom-type
|
|
2829
|
+
//! :end-before: example-end custom-type
|
|
2830
|
+
//!
|
|
2831
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
2832
|
+
//! using ``cub::DeviceRadixSort::SortKeysDescending``:
|
|
2833
|
+
//!
|
|
2834
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2835
|
+
//! :language: c++
|
|
2836
|
+
//! :dedent:
|
|
2837
|
+
//! :start-after: example-begin keys-descending-bits
|
|
2838
|
+
//! :end-before: example-end keys-descending-bits
|
|
2839
|
+
//!
|
|
2840
|
+
//! @endrst
|
|
2841
|
+
//!
|
|
2842
|
+
//! @tparam KeyT
|
|
2843
|
+
//! **[inferred]** KeyT type
|
|
2844
|
+
//!
|
|
2845
|
+
//! @tparam NumItemsT
|
|
2846
|
+
//! **[inferred]** Type of num_items
|
|
2847
|
+
//!
|
|
2848
|
+
//! @tparam DecomposerT
|
|
2849
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
2850
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
2851
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
2852
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
2853
|
+
//! The call operator must not modify members of the key.
|
|
2854
|
+
//!
|
|
2855
|
+
//! @param[in] d_temp_storage
|
|
2856
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
2857
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
2858
|
+
//! is done.
|
|
2859
|
+
//!
|
|
2860
|
+
//! @param[in,out] temp_storage_bytes
|
|
2861
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
2862
|
+
//!
|
|
2863
|
+
//! @param[in] d_keys_in
|
|
2864
|
+
//! Pointer to the input sequence of key data to sort
|
|
2865
|
+
//!
|
|
2866
|
+
//! @param[out] d_keys_out
|
|
2867
|
+
//! Pointer to the sorted output sequence of key data
|
|
2868
|
+
//!
|
|
2869
|
+
//! @param[in] num_items
|
|
2870
|
+
//! Number of items to sort
|
|
2871
|
+
//!
|
|
2872
|
+
//! @param decomposer
|
|
2873
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
2874
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
2875
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
2876
|
+
//! modify members of the key.
|
|
2877
|
+
//!
|
|
2878
|
+
//! @param[in] begin_bit
|
|
2879
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
2880
|
+
//! key comparison
|
|
2881
|
+
//!
|
|
2882
|
+
//! @param[in] end_bit
|
|
2883
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
2884
|
+
//! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
|
|
2885
|
+
//!
|
|
2886
|
+
//! @param[in] stream
|
|
2887
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
2888
|
+
//! Default is stream<sub>0</sub>.
|
|
2889
|
+
template <typename KeyT, typename NumItemsT, typename DecomposerT>
|
|
2890
|
+
CUB_RUNTIME_FUNCTION static //
|
|
2891
|
+
::cuda::std::enable_if_t< //
|
|
2892
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
2893
|
+
cudaError_t>
|
|
2894
|
+
SortKeysDescending(
|
|
2895
|
+
void* d_temp_storage,
|
|
2896
|
+
size_t& temp_storage_bytes,
|
|
2897
|
+
const KeyT* d_keys_in,
|
|
2898
|
+
KeyT* d_keys_out,
|
|
2899
|
+
NumItemsT num_items,
|
|
2900
|
+
DecomposerT decomposer,
|
|
2901
|
+
int begin_bit,
|
|
2902
|
+
int end_bit,
|
|
2903
|
+
cudaStream_t stream = 0)
|
|
2904
|
+
{
|
|
2905
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
2906
|
+
|
|
2907
|
+
// unsigned integer type for global offsets
|
|
2908
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
2909
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
2910
|
+
|
|
2911
|
+
static_assert(decomposer_check_t::value,
|
|
2912
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
2913
|
+
"arithmetic types");
|
|
2914
|
+
|
|
2915
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
2916
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
2917
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
2918
|
+
// is not set.
|
|
2919
|
+
constexpr bool is_overwrite_okay = false;
|
|
2920
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
2921
|
+
DoubleBuffer<NullType> d_values;
|
|
2922
|
+
|
|
2923
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
|
|
2924
|
+
decomposer_check_t{},
|
|
2925
|
+
d_temp_storage,
|
|
2926
|
+
temp_storage_bytes,
|
|
2927
|
+
is_overwrite_okay,
|
|
2928
|
+
d_keys,
|
|
2929
|
+
d_values,
|
|
2930
|
+
static_cast<offset_t>(num_items),
|
|
2931
|
+
decomposer,
|
|
2932
|
+
begin_bit,
|
|
2933
|
+
end_bit,
|
|
2934
|
+
stream);
|
|
2935
|
+
}
|
|
2936
|
+
|
|
2937
|
+
//! @rst
|
|
2938
|
+
//! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage.
|
|
2939
|
+
//!
|
|
2940
|
+
//! * The contents of the input data are not altered by the sorting operation.
|
|
2941
|
+
//! * Pointers to contiguous memory must be used; iterators are not currently
|
|
2942
|
+
//! supported.
|
|
2943
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
2944
|
+
//! any of the provided ranges:
|
|
2945
|
+
//!
|
|
2946
|
+
//! * ``[d_keys_in, d_keys_in + num_items)``
|
|
2947
|
+
//! * ``[d_keys_out, d_keys_out + num_items)``
|
|
2948
|
+
//!
|
|
2949
|
+
//! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see
|
|
2950
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
2951
|
+
//! * @devicestorage
|
|
2952
|
+
//!
|
|
2953
|
+
//! Snippet
|
|
2954
|
+
//! --------------------------------------------------
|
|
2955
|
+
//!
|
|
2956
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
2957
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
2958
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
2959
|
+
//! tuple of references to relevant members of the key.
|
|
2960
|
+
//!
|
|
2961
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2962
|
+
//! :language: c++
|
|
2963
|
+
//! :dedent:
|
|
2964
|
+
//! :start-after: example-begin custom-type
|
|
2965
|
+
//! :end-before: example-end custom-type
|
|
2966
|
+
//!
|
|
2967
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
2968
|
+
//! using ``cub::DeviceRadixSort::SortKeysDescending``:
|
|
2969
|
+
//!
|
|
2970
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
2971
|
+
//! :language: c++
|
|
2972
|
+
//! :dedent:
|
|
2973
|
+
//! :start-after: example-begin keys-descending
|
|
2974
|
+
//! :end-before: example-end keys-descending
|
|
2975
|
+
//!
|
|
2976
|
+
//! @endrst
|
|
2977
|
+
//!
|
|
2978
|
+
//! @tparam KeyT
|
|
2979
|
+
//! **[inferred]** KeyT type
|
|
2980
|
+
//!
|
|
2981
|
+
//! @tparam NumItemsT
|
|
2982
|
+
//! **[inferred]** Type of num_items
|
|
2983
|
+
//!
|
|
2984
|
+
//! @tparam DecomposerT
|
|
2985
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
2986
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
2987
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
2988
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
2989
|
+
//! The call operator must not modify members of the key.
|
|
2990
|
+
//!
|
|
2991
|
+
//! @param[in] d_temp_storage
|
|
2992
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
2993
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
2994
|
+
//! is done.
|
|
2995
|
+
//!
|
|
2996
|
+
//! @param[in,out] temp_storage_bytes
|
|
2997
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
2998
|
+
//!
|
|
2999
|
+
//! @param[in] d_keys_in
|
|
3000
|
+
//! Pointer to the input sequence of key data to sort
|
|
3001
|
+
//!
|
|
3002
|
+
//! @param[out] d_keys_out
|
|
3003
|
+
//! Pointer to the sorted output sequence of key data
|
|
3004
|
+
//!
|
|
3005
|
+
//! @param[in] num_items
|
|
3006
|
+
//! Number of items to sort
|
|
3007
|
+
//!
|
|
3008
|
+
//! @param decomposer
|
|
3009
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
3010
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
3011
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
3012
|
+
//! modify members of the key.
|
|
3013
|
+
//!
|
|
3014
|
+
//! @param[in] stream
|
|
3015
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
3016
|
+
//! Default is stream<sub>0</sub>.
|
|
3017
|
+
template <typename KeyT, typename NumItemsT, typename DecomposerT>
|
|
3018
|
+
CUB_RUNTIME_FUNCTION static //
|
|
3019
|
+
::cuda::std::enable_if_t< //
|
|
3020
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
3021
|
+
cudaError_t>
|
|
3022
|
+
SortKeysDescending(
|
|
3023
|
+
void* d_temp_storage,
|
|
3024
|
+
size_t& temp_storage_bytes,
|
|
3025
|
+
const KeyT* d_keys_in,
|
|
3026
|
+
KeyT* d_keys_out,
|
|
3027
|
+
NumItemsT num_items,
|
|
3028
|
+
DecomposerT decomposer,
|
|
3029
|
+
cudaStream_t stream = 0)
|
|
3030
|
+
{
|
|
3031
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
3032
|
+
|
|
3033
|
+
// unsigned integer type for global offsets
|
|
3034
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
3035
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
3036
|
+
|
|
3037
|
+
static_assert(decomposer_check_t::value,
|
|
3038
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
3039
|
+
"arithmetic types");
|
|
3040
|
+
|
|
3041
|
+
// We cast away const-ness, but will *not* write to these arrays.
|
|
3042
|
+
// ``DispatchRadixSort::Dispatch`` will allocate temporary storage and
|
|
3043
|
+
// create a new double-buffer internally when the ``is_overwrite_ok`` flag
|
|
3044
|
+
// is not set.
|
|
3045
|
+
constexpr bool is_overwrite_okay = false;
|
|
3046
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
3047
|
+
DoubleBuffer<NullType> d_values;
|
|
3048
|
+
|
|
3049
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
|
|
3050
|
+
decomposer_check_t{},
|
|
3051
|
+
d_temp_storage,
|
|
3052
|
+
temp_storage_bytes,
|
|
3053
|
+
is_overwrite_okay,
|
|
3054
|
+
d_keys,
|
|
3055
|
+
d_values,
|
|
3056
|
+
static_cast<offset_t>(num_items),
|
|
3057
|
+
decomposer,
|
|
3058
|
+
stream);
|
|
3059
|
+
}
|
|
3060
|
+
|
|
3061
|
+
//! @rst
|
|
3062
|
+
//! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
|
|
3063
|
+
//!
|
|
3064
|
+
//! - The sorting operation is given a pair of key buffers managed by a
|
|
3065
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
3066
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
3067
|
+
//! - The contents of both buffers may be altered by the sorting operation.
|
|
3068
|
+
//! - In-place operations are not supported. There must be no overlap between
|
|
3069
|
+
//! any of the provided ranges:
|
|
3070
|
+
//!
|
|
3071
|
+
//! - ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
3072
|
+
//! - ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
3073
|
+
//!
|
|
3074
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
3075
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
3076
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
3077
|
+
//! number of key bits specified and the targeted device architecture).
|
|
3078
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
3079
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
3080
|
+
//! yield a corresponding performance improvement.
|
|
3081
|
+
//! - @devicestorageP
|
|
3082
|
+
//! - @devicestorage
|
|
3083
|
+
//!
|
|
3084
|
+
//! Snippet
|
|
3085
|
+
//! --------------------------------------------------
|
|
3086
|
+
//!
|
|
3087
|
+
//! The code snippet below illustrates the sorting of a device vector of ``int`` keys.
|
|
3088
|
+
//! @endrst
|
|
3089
|
+
//!
|
|
3090
|
+
//! @code{.cpp}
|
|
3091
|
+
//! #include <cub/cub.cuh>
|
|
3092
|
+
//! // or equivalently <cub/device/device_radix_sort.cuh>
|
|
3093
|
+
//!
|
|
3094
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
3095
|
+
//! // for sorting data
|
|
3096
|
+
//! int num_items; // e.g., 7
|
|
3097
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
3098
|
+
//! int *d_key_alt_buf; // e.g., [ ... ]
|
|
3099
|
+
//! ...
|
|
3100
|
+
//!
|
|
3101
|
+
//! // Create a DoubleBuffer to wrap the pair of device pointers
|
|
3102
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
3103
|
+
//!
|
|
3104
|
+
//! // Determine temporary device storage requirements
|
|
3105
|
+
//! void *d_temp_storage = nullptr;
|
|
3106
|
+
//! size_t temp_storage_bytes = 0;
|
|
3107
|
+
//! cub::DeviceRadixSort::SortKeysDescending(
|
|
3108
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, num_items);
|
|
3109
|
+
//!
|
|
3110
|
+
//! // Allocate temporary storage
|
|
3111
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
3112
|
+
//!
|
|
3113
|
+
//! // Run sorting operation
|
|
3114
|
+
//! cub::DeviceRadixSort::SortKeysDescending(
|
|
3115
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, num_items);
|
|
3116
|
+
//!
|
|
3117
|
+
//! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0]
|
|
3118
|
+
//! @endcode
|
|
3119
|
+
//!
|
|
3120
|
+
//! @tparam KeyT
|
|
3121
|
+
//! **[inferred]** KeyT type
|
|
3122
|
+
//!
|
|
3123
|
+
//! @tparam NumItemsT
|
|
3124
|
+
//! **[inferred]** Type of num_items
|
|
3125
|
+
//!
|
|
3126
|
+
//! @param[in] d_temp_storage
|
|
3127
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
3128
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
3129
|
+
//! is done.
|
|
3130
|
+
//!
|
|
3131
|
+
//! @param[in,out] temp_storage_bytes
|
|
3132
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
3133
|
+
//!
|
|
3134
|
+
//! @param[in,out] d_keys
|
|
3135
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
3136
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
3137
|
+
//! point to the sorted output keys
|
|
3138
|
+
//!
|
|
3139
|
+
//! @param[in] num_items
|
|
3140
|
+
//! Number of items to sort
|
|
3141
|
+
//!
|
|
3142
|
+
//! @param[in] begin_bit
|
|
3143
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for
|
|
3144
|
+
//! key comparison
|
|
3145
|
+
//!
|
|
3146
|
+
//! @param[in] end_bit
|
|
3147
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
3148
|
+
//! comparison (e.g., ``sizeof(unsigned int) * 8``)
|
|
3149
|
+
//!
|
|
3150
|
+
//! @param[in] stream
|
|
3151
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
3152
|
+
//! Default is stream<sub>0</sub>.
|
|
3153
|
+
template <typename KeyT, typename NumItemsT>
|
|
3154
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
|
|
3155
|
+
void* d_temp_storage,
|
|
3156
|
+
size_t& temp_storage_bytes,
|
|
3157
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
3158
|
+
NumItemsT num_items,
|
|
3159
|
+
int begin_bit = 0,
|
|
3160
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
3161
|
+
cudaStream_t stream = 0)
|
|
3162
|
+
{
|
|
3163
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
3164
|
+
|
|
3165
|
+
// Unsigned integer type for global offsets.
|
|
3166
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
3167
|
+
|
|
3168
|
+
constexpr bool is_overwrite_okay = true;
|
|
3169
|
+
|
|
3170
|
+
// Null value type
|
|
3171
|
+
DoubleBuffer<NullType> d_values;
|
|
3172
|
+
|
|
3173
|
+
return DispatchRadixSort<SortOrder::Descending, KeyT, NullType, OffsetT>::Dispatch(
|
|
3174
|
+
d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, is_overwrite_okay, stream);
|
|
3175
|
+
}
|
|
3176
|
+
|
|
3177
|
+
//! @rst
|
|
3178
|
+
//! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
|
|
3179
|
+
//!
|
|
3180
|
+
//! * The sorting operation is given a pair of key buffers managed by a
|
|
3181
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
3182
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
3183
|
+
//! * The contents of both buffers may be altered by the sorting operation.
|
|
3184
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
3185
|
+
//! any of the provided ranges:
|
|
3186
|
+
//!
|
|
3187
|
+
//! * ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
3188
|
+
//! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
3189
|
+
//!
|
|
3190
|
+
//! * Upon completion, the sorting operation will update the "current"
|
|
3191
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
3192
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
3193
|
+
//! number of key bits specified and the targeted device architecture).
|
|
3194
|
+
//! * @devicestorageP
|
|
3195
|
+
//! * @devicestorage
|
|
3196
|
+
//!
|
|
3197
|
+
//! Snippet
|
|
3198
|
+
//! --------------------------------------------------
|
|
3199
|
+
//!
|
|
3200
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
3201
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
3202
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
3203
|
+
//! tuple of references to relevant members of the key.
|
|
3204
|
+
//!
|
|
3205
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
3206
|
+
//! :language: c++
|
|
3207
|
+
//! :dedent:
|
|
3208
|
+
//! :start-after: example-begin custom-type
|
|
3209
|
+
//! :end-before: example-end custom-type
|
|
3210
|
+
//!
|
|
3211
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
3212
|
+
//! using ``cub::DeviceRadixSort::SortKeysDescending``:
|
|
3213
|
+
//!
|
|
3214
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
3215
|
+
//! :language: c++
|
|
3216
|
+
//! :dedent:
|
|
3217
|
+
//! :start-after: example-begin keys-descending-db
|
|
3218
|
+
//! :end-before: example-end keys-descending-db
|
|
3219
|
+
//!
|
|
3220
|
+
//! @endrst
|
|
3221
|
+
//!
|
|
3222
|
+
//! @tparam KeyT
|
|
3223
|
+
//! **[inferred]** KeyT type
|
|
3224
|
+
//!
|
|
3225
|
+
//! @tparam NumItemsT
|
|
3226
|
+
//! **[inferred]** Type of num_items
|
|
3227
|
+
//!
|
|
3228
|
+
//! @tparam DecomposerT
|
|
3229
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
3230
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
3231
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
3232
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
3233
|
+
//! The call operator must not modify members of the key.
|
|
3234
|
+
//!
|
|
3235
|
+
//! @param[in] d_temp_storage
|
|
3236
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
3237
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
3238
|
+
//! is done.
|
|
3239
|
+
//!
|
|
3240
|
+
//! @param[in,out] temp_storage_bytes
|
|
3241
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
3242
|
+
//!
|
|
3243
|
+
//! @param[in,out] d_keys
|
|
3244
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
3245
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
3246
|
+
//! point to the sorted output keys
|
|
3247
|
+
//!
|
|
3248
|
+
//! @param[in] num_items
|
|
3249
|
+
//! Number of items to sort
|
|
3250
|
+
//!
|
|
3251
|
+
//! @param decomposer
|
|
3252
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
3253
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
3254
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
3255
|
+
//! modify members of the key.
|
|
3256
|
+
//!
|
|
3257
|
+
//! @param[in] stream
|
|
3258
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
3259
|
+
//! Default is stream<sub>0</sub>.
|
|
3260
|
+
template <typename KeyT, typename NumItemsT, typename DecomposerT>
|
|
3261
|
+
CUB_RUNTIME_FUNCTION static //
|
|
3262
|
+
::cuda::std::enable_if_t< //
|
|
3263
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
3264
|
+
cudaError_t>
|
|
3265
|
+
SortKeysDescending(
|
|
3266
|
+
void* d_temp_storage,
|
|
3267
|
+
size_t& temp_storage_bytes,
|
|
3268
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
3269
|
+
NumItemsT num_items,
|
|
3270
|
+
DecomposerT decomposer,
|
|
3271
|
+
cudaStream_t stream = 0)
|
|
3272
|
+
{
|
|
3273
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
3274
|
+
|
|
3275
|
+
// unsigned integer type for global offsets
|
|
3276
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
3277
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
3278
|
+
|
|
3279
|
+
static_assert(decomposer_check_t::value,
|
|
3280
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
3281
|
+
"arithmetic types");
|
|
3282
|
+
|
|
3283
|
+
constexpr bool is_overwrite_okay = true;
|
|
3284
|
+
DoubleBuffer<NullType> d_values;
|
|
3285
|
+
|
|
3286
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
|
|
3287
|
+
decomposer_check_t{},
|
|
3288
|
+
d_temp_storage,
|
|
3289
|
+
temp_storage_bytes,
|
|
3290
|
+
is_overwrite_okay,
|
|
3291
|
+
d_keys,
|
|
3292
|
+
d_values,
|
|
3293
|
+
static_cast<offset_t>(num_items),
|
|
3294
|
+
decomposer,
|
|
3295
|
+
stream);
|
|
3296
|
+
}
|
|
3297
|
+
|
|
3298
|
+
//! @rst
|
|
3299
|
+
//! Sorts keys into descending order using :math:`\approx N` auxiliary storage.
|
|
3300
|
+
//!
|
|
3301
|
+
//! * The sorting operation is given a pair of key buffers managed by a
|
|
3302
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
3303
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
3304
|
+
//! * The contents of both buffers may be altered by the sorting operation.
|
|
3305
|
+
//! * In-place operations are not supported. There must be no overlap between
|
|
3306
|
+
//! any of the provided ranges:
|
|
3307
|
+
//!
|
|
3308
|
+
//! * ``[d_keys.Current(), d_keys.Current() + num_items)``
|
|
3309
|
+
//! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)``
|
|
3310
|
+
//!
|
|
3311
|
+
//! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify
|
|
3312
|
+
//! differentiating key bits. This can reduce overall sorting overhead and
|
|
3313
|
+
//! yield a corresponding performance improvement.
|
|
3314
|
+
//! * Upon completion, the sorting operation will update the "current"
|
|
3315
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
3316
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
3317
|
+
//! number of key bits specified and the targeted device architecture).
|
|
3318
|
+
//! * @devicestorageP
|
|
3319
|
+
//! * @devicestorage
|
|
3320
|
+
//!
|
|
3321
|
+
//! Snippet
|
|
3322
|
+
//! --------------------------------------------------
|
|
3323
|
+
//!
|
|
3324
|
+
//! Let's consider a user-defined ``custom_t`` type below. To sort an array of
|
|
3325
|
+
//! ``custom_t`` objects, we have to tell CUB about relevant members of the
|
|
3326
|
+
//! ``custom_t`` type. We do this by providing a decomposer that returns a
|
|
3327
|
+
//! tuple of references to relevant members of the key.
|
|
3328
|
+
//!
|
|
3329
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
3330
|
+
//! :language: c++
|
|
3331
|
+
//! :dedent:
|
|
3332
|
+
//! :start-after: example-begin custom-type
|
|
3333
|
+
//! :end-before: example-end custom-type
|
|
3334
|
+
//!
|
|
3335
|
+
//! The following snippet shows how to sort an array of ``custom_t`` objects
|
|
3336
|
+
//! using ``cub::DeviceRadixSort::SortKeysDescending``:
|
|
3337
|
+
//!
|
|
3338
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_radix_sort_custom.cu
|
|
3339
|
+
//! :language: c++
|
|
3340
|
+
//! :dedent:
|
|
3341
|
+
//! :start-after: example-begin keys-descending-bits-db
|
|
3342
|
+
//! :end-before: example-end keys-descending-bits-db
|
|
3343
|
+
//!
|
|
3344
|
+
//! @endrst
|
|
3345
|
+
//!
|
|
3346
|
+
//! @tparam KeyT
|
|
3347
|
+
//! **[inferred]** KeyT type
|
|
3348
|
+
//!
|
|
3349
|
+
//! @tparam NumItemsT
|
|
3350
|
+
//! **[inferred]** Type of num_items
|
|
3351
|
+
//!
|
|
3352
|
+
//! @tparam DecomposerT
|
|
3353
|
+
//! **[inferred]** Type of a callable object responsible for decomposing a
|
|
3354
|
+
//! ``KeyT`` into a tuple of references to its constituent arithmetic types:
|
|
3355
|
+
//! ``::cuda::std::tuple<ArithmeticTs&...> operator()(KeyT &key)``.
|
|
3356
|
+
//! The leftmost element of the tuple is considered the most significant.
|
|
3357
|
+
//! The call operator must not modify members of the key.
|
|
3358
|
+
//!
|
|
3359
|
+
//! @param[in] d_temp_storage
|
|
3360
|
+
//! Device-accessible allocation of temporary storage. When ``nullptr``, the
|
|
3361
|
+
//! required allocation size is written to ``temp_storage_bytes`` and no work
|
|
3362
|
+
//! is done.
|
|
3363
|
+
//!
|
|
3364
|
+
//! @param[in,out] temp_storage_bytes
|
|
3365
|
+
//! Reference to size in bytes of ``d_temp_storage`` allocation
|
|
3366
|
+
//!
|
|
3367
|
+
//! @param[in,out] d_keys
|
|
3368
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
3369
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
3370
|
+
//! point to the sorted output keys
|
|
3371
|
+
//!
|
|
3372
|
+
//! @param[in] num_items
|
|
3373
|
+
//! Number of items to sort
|
|
3374
|
+
//!
|
|
3375
|
+
//! @param decomposer
|
|
3376
|
+
//! Callable object responsible for decomposing a ``KeyT`` into a tuple of
|
|
3377
|
+
//! references to its constituent arithmetic types. The leftmost element of
|
|
3378
|
+
//! the tuple is considered the most significant. The call operator must not
|
|
3379
|
+
//! modify members of the key.
|
|
3380
|
+
//!
|
|
3381
|
+
//! @param[in] begin_bit
|
|
3382
|
+
//! The least-significant bit index (inclusive) needed for
|
|
3383
|
+
//! key comparison
|
|
3384
|
+
//!
|
|
3385
|
+
//! @param[in] end_bit
|
|
3386
|
+
//! The most-significant bit index (exclusive) needed for key
|
|
3387
|
+
//! comparison (e.g., ``(sizeof(float) + sizeof(long long int)) * 8``)
|
|
3388
|
+
//!
|
|
3389
|
+
//! @param[in] stream
|
|
3390
|
+
//! **[optional]** CUDA stream to launch kernels within.
|
|
3391
|
+
//! Default is stream<sub>0</sub>.
|
|
3392
|
+
template <typename KeyT, typename NumItemsT, typename DecomposerT>
|
|
3393
|
+
CUB_RUNTIME_FUNCTION static //
|
|
3394
|
+
::cuda::std::enable_if_t< //
|
|
3395
|
+
!::cuda::std::is_convertible_v<DecomposerT, int>, //
|
|
3396
|
+
cudaError_t>
|
|
3397
|
+
SortKeysDescending(
|
|
3398
|
+
void* d_temp_storage,
|
|
3399
|
+
size_t& temp_storage_bytes,
|
|
3400
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
3401
|
+
NumItemsT num_items,
|
|
3402
|
+
DecomposerT decomposer,
|
|
3403
|
+
int begin_bit,
|
|
3404
|
+
int end_bit,
|
|
3405
|
+
cudaStream_t stream = 0)
|
|
3406
|
+
{
|
|
3407
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
3408
|
+
|
|
3409
|
+
// unsigned integer type for global offsets
|
|
3410
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
3411
|
+
using decomposer_check_t = detail::radix::decomposer_check_t<KeyT, DecomposerT>;
|
|
3412
|
+
|
|
3413
|
+
static_assert(decomposer_check_t::value,
|
|
3414
|
+
"DecomposerT must be a callable object returning a tuple of references to "
|
|
3415
|
+
"arithmetic types");
|
|
3416
|
+
|
|
3417
|
+
constexpr bool is_overwrite_okay = true;
|
|
3418
|
+
DoubleBuffer<NullType> d_values;
|
|
3419
|
+
|
|
3420
|
+
return DeviceRadixSort::custom_radix_sort<SortOrder::Descending>(
|
|
3421
|
+
decomposer_check_t{},
|
|
3422
|
+
d_temp_storage,
|
|
3423
|
+
temp_storage_bytes,
|
|
3424
|
+
is_overwrite_okay,
|
|
3425
|
+
d_keys,
|
|
3426
|
+
d_values,
|
|
3427
|
+
static_cast<offset_t>(num_items),
|
|
3428
|
+
decomposer,
|
|
3429
|
+
begin_bit,
|
|
3430
|
+
end_bit,
|
|
3431
|
+
stream);
|
|
3432
|
+
}
|
|
3433
|
+
|
|
3434
|
+
//! @} end member group
|
|
3435
|
+
};
|
|
3436
|
+
|
|
3437
|
+
CUB_NAMESPACE_END
|