cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
- cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
- cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
- cuda/cccl/headers/include/cub/config.cuh +2 -26
- cuda/cccl/headers/include/cub/cub.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
- cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
- cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
- cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
- cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
- cuda/cccl/headers/include/cub/util_device.cuh +18 -59
- cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
- cuda/cccl/headers/include/cub/util_math.cuh +2 -28
- cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
- cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
- cuda/cccl/headers/include/cub/util_type.cuh +5 -32
- cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
- cuda/cccl/headers/include/cub/version.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
- cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
- cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
- cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
- cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
- cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
- cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
- cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
- cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
- cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
- cuda/cccl/headers/include/cuda/barrier +42 -16
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/memory_resource +6 -1
- cuda/cccl/headers/include/cuda/numeric +2 -0
- cuda/cccl/headers/include/cuda/pipeline +3 -2
- cuda/cccl/headers/include/cuda/ptx +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
- cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
- cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
- cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
- cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
- cuda/cccl/headers/include/cuda/std/__new_ +1 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
- cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
- cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
- cuda/cccl/headers/include/cuda/std/__random_ +2 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
- cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
- cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
- cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
- cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
- cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
- cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
- cuda/cccl/headers/include/cuda/std/array +1 -1
- cuda/cccl/headers/include/cuda/std/atomic +1 -1
- cuda/cccl/headers/include/cuda/std/bitset +2 -10
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
- cuda/cccl/headers/include/cuda/std/functional +1 -1
- cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
- cuda/cccl/headers/include/cuda/std/iterator +1 -1
- cuda/cccl/headers/include/cuda/std/numbers +0 -2
- cuda/cccl/headers/include/cuda/std/ratio +2 -2
- cuda/cccl/headers/include/cuda/std/span +2 -2
- cuda/cccl/headers/include/cuda/std/string_view +24 -42
- cuda/cccl/headers/include/cuda/std/tuple +18 -1
- cuda/cccl/headers/include/cuda/std/type_traits +0 -1
- cuda/cccl/headers/include/cuda/std/variant +8 -1
- cuda/cccl/headers/include/nv/target +2 -6
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
- cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
- cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
- cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
- cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
- cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
- cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
- cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
- cuda/cccl/headers/include/thrust/functional.h +0 -2
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/new.h +0 -2
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
- cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
- cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
- cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
- cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
- cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
- cuda/cccl/headers/include/thrust/transform.h +14 -3
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
- cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
- cuda/cccl/headers/include/thrust/zip_function.h +2 -28
- cuda/compute/__init__.py +4 -0
- cuda/compute/_bindings.pyi +26 -3
- cuda/compute/_bindings_impl.pyx +143 -1
- cuda/compute/algorithms/__init__.py +9 -5
- cuda/compute/algorithms/_sort/__init__.py +23 -0
- cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
- cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
- cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
- cuda/compute/algorithms/_sort/_sort_common.py +52 -0
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda_cccl-0.3.4.dist-info/METADATA +78 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
- cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
- cuda_cccl-0.3.2.dist-info/METADATA +0 -42
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,30 +1,6 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
*
|
|
5
|
-
* Redistribution and use in source and binary forms, with or without
|
|
6
|
-
* modification, are permitted provided that the following conditions are met:
|
|
7
|
-
* * Redistributions of source code must retain the above copyright
|
|
8
|
-
* notice, this list of conditions and the following disclaimer.
|
|
9
|
-
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
-
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
-
* documentation and/or other materials provided with the distribution.
|
|
12
|
-
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
-
* names of its contributors may be used to endorse or promote products
|
|
14
|
-
* derived from this software without specific prior written permission.
|
|
15
|
-
*
|
|
16
|
-
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
-
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
-
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
-
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
-
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
-
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
-
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
-
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
-
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
-
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
-
*
|
|
27
|
-
******************************************************************************/
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
2
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
// SPDX-License-Identifier: BSD-3
|
|
28
4
|
|
|
29
5
|
/**
|
|
30
6
|
* @file
|
|
@@ -212,46 +188,46 @@ CUB_NAMESPACE_BEGIN
|
|
|
212
188
|
//! @tparam KeyT
|
|
213
189
|
//! KeyT type
|
|
214
190
|
//!
|
|
215
|
-
//! @tparam
|
|
191
|
+
//! @tparam BlockDimX
|
|
216
192
|
//! The thread block length in threads along the X dimension
|
|
217
193
|
//!
|
|
218
|
-
//! @tparam
|
|
194
|
+
//! @tparam ItemsPerThread
|
|
219
195
|
//! The number of items per thread
|
|
220
196
|
//!
|
|
221
197
|
//! @tparam ValueT
|
|
222
198
|
//! **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort)
|
|
223
199
|
//!
|
|
224
|
-
//! @tparam
|
|
200
|
+
//! @tparam RadixBits
|
|
225
201
|
//! **[optional]** The number of radix bits per digit place (default: 4 bits)
|
|
226
202
|
//!
|
|
227
|
-
//! @tparam
|
|
203
|
+
//! @tparam MemoizeOuterScan
|
|
228
204
|
//! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory
|
|
229
205
|
//! reads at the expense of higher register pressure (default: true for architectures SM35 and
|
|
230
206
|
//! newer, false otherwise).
|
|
231
207
|
//!
|
|
232
|
-
//! @tparam
|
|
208
|
+
//! @tparam InnerScanAlgorithm
|
|
233
209
|
//! **[optional]** The cub::BlockScanAlgorithm algorithm to use
|
|
234
210
|
//! (default: cub::BLOCK_SCAN_WARP_SCANS)
|
|
235
211
|
//!
|
|
236
|
-
//! @tparam
|
|
212
|
+
//! @tparam SMemConfig
|
|
237
213
|
//! **[optional]*8 Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`)
|
|
238
214
|
//!
|
|
239
|
-
//! @tparam
|
|
215
|
+
//! @tparam BlockDimY
|
|
240
216
|
//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
|
|
241
217
|
//!
|
|
242
|
-
//! @tparam
|
|
218
|
+
//! @tparam BlockDimZ
|
|
243
219
|
//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
|
|
244
220
|
//!
|
|
245
221
|
template <typename KeyT,
|
|
246
|
-
int
|
|
247
|
-
int
|
|
248
|
-
typename ValueT
|
|
249
|
-
int
|
|
250
|
-
bool
|
|
251
|
-
BlockScanAlgorithm
|
|
252
|
-
cudaSharedMemConfig
|
|
253
|
-
int
|
|
254
|
-
int
|
|
222
|
+
int BlockDimX,
|
|
223
|
+
int ItemsPerThread,
|
|
224
|
+
typename ValueT = NullType,
|
|
225
|
+
int RadixBits = 4,
|
|
226
|
+
bool MemoizeOuterScan = true,
|
|
227
|
+
BlockScanAlgorithm InnerScanAlgorithm = BLOCK_SCAN_WARP_SCANS,
|
|
228
|
+
cudaSharedMemConfig SMemConfig = cudaSharedMemBankSizeFourByte,
|
|
229
|
+
int BlockDimY = 1,
|
|
230
|
+
int BlockDimZ = 1>
|
|
255
231
|
class BlockRadixSort
|
|
256
232
|
{
|
|
257
233
|
private:
|
|
@@ -262,7 +238,7 @@ private:
|
|
|
262
238
|
enum
|
|
263
239
|
{
|
|
264
240
|
// The thread block size in threads
|
|
265
|
-
BLOCK_THREADS =
|
|
241
|
+
BLOCK_THREADS = BlockDimX * BlockDimY * BlockDimZ,
|
|
266
242
|
|
|
267
243
|
// Whether or not there are values to be trucked along with keys
|
|
268
244
|
KEYS_ONLY = ::cuda::std::is_same_v<ValueT, NullType>,
|
|
@@ -275,34 +251,20 @@ private:
|
|
|
275
251
|
|
|
276
252
|
/// Ascending BlockRadixRank utility type
|
|
277
253
|
using AscendingBlockRadixRank =
|
|
278
|
-
BlockRadixRank<
|
|
279
|
-
RADIX_BITS,
|
|
280
|
-
false,
|
|
281
|
-
MEMOIZE_OUTER_SCAN,
|
|
282
|
-
INNER_SCAN_ALGORITHM,
|
|
283
|
-
SMEM_CONFIG,
|
|
284
|
-
BLOCK_DIM_Y,
|
|
285
|
-
BLOCK_DIM_Z>;
|
|
254
|
+
BlockRadixRank<BlockDimX, RadixBits, false, MemoizeOuterScan, InnerScanAlgorithm, SMemConfig, BlockDimY, BlockDimZ>;
|
|
286
255
|
|
|
287
256
|
/// Descending BlockRadixRank utility type
|
|
288
257
|
using DescendingBlockRadixRank =
|
|
289
|
-
BlockRadixRank<
|
|
290
|
-
RADIX_BITS,
|
|
291
|
-
true,
|
|
292
|
-
MEMOIZE_OUTER_SCAN,
|
|
293
|
-
INNER_SCAN_ALGORITHM,
|
|
294
|
-
SMEM_CONFIG,
|
|
295
|
-
BLOCK_DIM_Y,
|
|
296
|
-
BLOCK_DIM_Z>;
|
|
258
|
+
BlockRadixRank<BlockDimX, RadixBits, true, MemoizeOuterScan, InnerScanAlgorithm, SMemConfig, BlockDimY, BlockDimZ>;
|
|
297
259
|
|
|
298
260
|
/// Digit extractor type
|
|
299
261
|
using fundamental_digit_extractor_t = BFEDigitExtractor<KeyT>;
|
|
300
262
|
|
|
301
263
|
/// BlockExchange utility type for keys
|
|
302
|
-
using BlockExchangeKeys = BlockExchange<KeyT,
|
|
264
|
+
using BlockExchangeKeys = BlockExchange<KeyT, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
|
|
303
265
|
|
|
304
266
|
/// BlockExchange utility type for values
|
|
305
|
-
using BlockExchangeValues = BlockExchange<ValueT,
|
|
267
|
+
using BlockExchangeValues = BlockExchange<ValueT, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
|
|
306
268
|
|
|
307
269
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
308
270
|
/// Shared memory storage layout type
|
|
@@ -339,8 +301,8 @@ private:
|
|
|
339
301
|
/// Rank keys (specialized for ascending sort)
|
|
340
302
|
template <class DigitExtractorT>
|
|
341
303
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
342
|
-
RankKeys(bit_ordered_type (&unsigned_keys)[
|
|
343
|
-
int (&ranks)[
|
|
304
|
+
RankKeys(bit_ordered_type (&unsigned_keys)[ItemsPerThread],
|
|
305
|
+
int (&ranks)[ItemsPerThread],
|
|
344
306
|
DigitExtractorT digit_extractor,
|
|
345
307
|
::cuda::std::false_type /*is_descending*/)
|
|
346
308
|
{
|
|
@@ -350,8 +312,8 @@ private:
|
|
|
350
312
|
/// Rank keys (specialized for descending sort)
|
|
351
313
|
template <class DigitExtractorT>
|
|
352
314
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
353
|
-
RankKeys(bit_ordered_type (&unsigned_keys)[
|
|
354
|
-
int (&ranks)[
|
|
315
|
+
RankKeys(bit_ordered_type (&unsigned_keys)[ItemsPerThread],
|
|
316
|
+
int (&ranks)[ItemsPerThread],
|
|
355
317
|
DigitExtractorT digit_extractor,
|
|
356
318
|
::cuda::std::true_type /*is_descending*/)
|
|
357
319
|
{
|
|
@@ -360,8 +322,8 @@ private:
|
|
|
360
322
|
|
|
361
323
|
/// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
|
|
362
324
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
|
|
363
|
-
ValueT (&values)[
|
|
364
|
-
int (&ranks)[
|
|
325
|
+
ValueT (&values)[ItemsPerThread],
|
|
326
|
+
int (&ranks)[ItemsPerThread],
|
|
365
327
|
::cuda::std::false_type /*is_keys_only*/,
|
|
366
328
|
::cuda::std::true_type /*is_blocked*/)
|
|
367
329
|
{
|
|
@@ -373,8 +335,8 @@ private:
|
|
|
373
335
|
|
|
374
336
|
/// ExchangeValues (specialized for key-value sort, to-striped arrangement)
|
|
375
337
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
|
|
376
|
-
ValueT (&values)[
|
|
377
|
-
int (&ranks)[
|
|
338
|
+
ValueT (&values)[ItemsPerThread],
|
|
339
|
+
int (&ranks)[ItemsPerThread],
|
|
378
340
|
::cuda::std::false_type /*is_keys_only*/,
|
|
379
341
|
::cuda::std::false_type /*is_blocked*/)
|
|
380
342
|
{
|
|
@@ -387,8 +349,8 @@ private:
|
|
|
387
349
|
/// ExchangeValues (specialized for keys-only sort)
|
|
388
350
|
template <bool IS_BLOCKED>
|
|
389
351
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ExchangeValues(
|
|
390
|
-
ValueT (& /*values*/)[
|
|
391
|
-
int (& /*ranks*/)[
|
|
352
|
+
ValueT (& /*values*/)[ItemsPerThread],
|
|
353
|
+
int (& /*ranks*/)[ItemsPerThread],
|
|
392
354
|
::cuda::std::true_type /*is_keys_only*/,
|
|
393
355
|
::cuda::std::bool_constant<IS_BLOCKED> /*is_blocked*/)
|
|
394
356
|
{}
|
|
@@ -416,18 +378,18 @@ private:
|
|
|
416
378
|
*/
|
|
417
379
|
template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
|
|
418
380
|
_CCCL_DEVICE _CCCL_FORCEINLINE void SortBlocked(
|
|
419
|
-
KeyT (&keys)[
|
|
420
|
-
ValueT (&values)[
|
|
381
|
+
KeyT (&keys)[ItemsPerThread],
|
|
382
|
+
ValueT (&values)[ItemsPerThread],
|
|
421
383
|
int begin_bit,
|
|
422
384
|
int end_bit,
|
|
423
385
|
::cuda::std::bool_constant<DESCENDING> is_descending,
|
|
424
386
|
::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
|
|
425
387
|
DecomposerT decomposer = {})
|
|
426
388
|
{
|
|
427
|
-
bit_ordered_type(&unsigned_keys)[
|
|
389
|
+
bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
|
|
428
390
|
|
|
429
391
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
430
|
-
for (int KEY = 0; KEY <
|
|
392
|
+
for (int KEY = 0; KEY < ItemsPerThread; KEY++)
|
|
431
393
|
{
|
|
432
394
|
unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
|
|
433
395
|
}
|
|
@@ -435,14 +397,14 @@ private:
|
|
|
435
397
|
// Radix sorting passes
|
|
436
398
|
while (true)
|
|
437
399
|
{
|
|
438
|
-
int pass_bits = ::cuda::std::min(
|
|
400
|
+
int pass_bits = ::cuda::std::min(RadixBits, end_bit - begin_bit);
|
|
439
401
|
auto digit_extractor =
|
|
440
402
|
traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
|
|
441
403
|
|
|
442
404
|
// Rank the blocked keys
|
|
443
|
-
int ranks[
|
|
405
|
+
int ranks[ItemsPerThread];
|
|
444
406
|
RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
|
|
445
|
-
begin_bit +=
|
|
407
|
+
begin_bit += RadixBits;
|
|
446
408
|
|
|
447
409
|
__syncthreads();
|
|
448
410
|
|
|
@@ -463,7 +425,7 @@ private:
|
|
|
463
425
|
|
|
464
426
|
// Untwiddle bits if necessary
|
|
465
427
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
466
|
-
for (int KEY = 0; KEY <
|
|
428
|
+
for (int KEY = 0; KEY < ItemsPerThread; KEY++)
|
|
467
429
|
{
|
|
468
430
|
unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
|
|
469
431
|
}
|
|
@@ -495,18 +457,18 @@ public:
|
|
|
495
457
|
*/
|
|
496
458
|
template <bool DESCENDING, bool KEYS_ONLY, class DecomposerT = detail::identity_decomposer_t>
|
|
497
459
|
_CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
|
|
498
|
-
KeyT (&keys)[
|
|
499
|
-
ValueT (&values)[
|
|
460
|
+
KeyT (&keys)[ItemsPerThread],
|
|
461
|
+
ValueT (&values)[ItemsPerThread],
|
|
500
462
|
int begin_bit,
|
|
501
463
|
int end_bit,
|
|
502
464
|
::cuda::std::bool_constant<DESCENDING> is_descending,
|
|
503
465
|
::cuda::std::bool_constant<KEYS_ONLY> is_keys_only,
|
|
504
466
|
DecomposerT decomposer = {})
|
|
505
467
|
{
|
|
506
|
-
bit_ordered_type(&unsigned_keys)[
|
|
468
|
+
bit_ordered_type(&unsigned_keys)[ItemsPerThread] = reinterpret_cast<bit_ordered_type(&)[ItemsPerThread]>(keys);
|
|
507
469
|
|
|
508
470
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
509
|
-
for (int KEY = 0; KEY <
|
|
471
|
+
for (int KEY = 0; KEY < ItemsPerThread; KEY++)
|
|
510
472
|
{
|
|
511
473
|
unsigned_keys[KEY] = bit_ordered_conversion::to_bit_ordered(decomposer, unsigned_keys[KEY]);
|
|
512
474
|
}
|
|
@@ -514,14 +476,14 @@ public:
|
|
|
514
476
|
// Radix sorting passes
|
|
515
477
|
while (true)
|
|
516
478
|
{
|
|
517
|
-
int pass_bits = ::cuda::std::min(
|
|
479
|
+
int pass_bits = ::cuda::std::min(RadixBits, end_bit - begin_bit);
|
|
518
480
|
auto digit_extractor =
|
|
519
481
|
traits::template digit_extractor<fundamental_digit_extractor_t>(begin_bit, pass_bits, decomposer);
|
|
520
482
|
|
|
521
483
|
// Rank the blocked keys
|
|
522
|
-
int ranks[
|
|
484
|
+
int ranks[ItemsPerThread];
|
|
523
485
|
RankKeys(unsigned_keys, ranks, digit_extractor, is_descending);
|
|
524
|
-
begin_bit +=
|
|
486
|
+
begin_bit += RadixBits;
|
|
525
487
|
|
|
526
488
|
__syncthreads();
|
|
527
489
|
|
|
@@ -549,7 +511,7 @@ public:
|
|
|
549
511
|
|
|
550
512
|
// Untwiddle bits if necessary
|
|
551
513
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
552
|
-
for (int KEY = 0; KEY <
|
|
514
|
+
for (int KEY = 0; KEY < ItemsPerThread; KEY++)
|
|
553
515
|
{
|
|
554
516
|
unsigned_keys[KEY] = bit_ordered_conversion::from_bit_ordered(decomposer, unsigned_keys[KEY]);
|
|
555
517
|
}
|
|
@@ -567,7 +529,7 @@ public:
|
|
|
567
529
|
//! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
568
530
|
_CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort()
|
|
569
531
|
: temp_storage(PrivateStorage())
|
|
570
|
-
, linear_tid(RowMajorTid(
|
|
532
|
+
, linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ))
|
|
571
533
|
{}
|
|
572
534
|
|
|
573
535
|
/**
|
|
@@ -578,7 +540,7 @@ public:
|
|
|
578
540
|
*/
|
|
579
541
|
_CCCL_DEVICE _CCCL_FORCEINLINE BlockRadixSort(TempStorage& temp_storage)
|
|
580
542
|
: temp_storage(temp_storage.Alias())
|
|
581
|
-
, linear_tid(RowMajorTid(
|
|
543
|
+
, linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ))
|
|
582
544
|
{}
|
|
583
545
|
|
|
584
546
|
//! @} end member group
|
|
@@ -633,9 +595,9 @@ public:
|
|
|
633
595
|
//! @param[in] end_bit
|
|
634
596
|
//! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
|
|
635
597
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
636
|
-
Sort(KeyT (&keys)[
|
|
598
|
+
Sort(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
|
|
637
599
|
{
|
|
638
|
-
NullType values[
|
|
600
|
+
NullType values[ItemsPerThread];
|
|
639
601
|
|
|
640
602
|
SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>);
|
|
641
603
|
}
|
|
@@ -700,9 +662,9 @@ public:
|
|
|
700
662
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
701
663
|
::cuda::std::enable_if_t< //
|
|
702
664
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
703
|
-
Sort(KeyT (&keys)[
|
|
665
|
+
Sort(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
|
|
704
666
|
{
|
|
705
|
-
NullType values[
|
|
667
|
+
NullType values[ItemsPerThread];
|
|
706
668
|
|
|
707
669
|
SortBlocked(
|
|
708
670
|
keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
|
|
@@ -760,7 +722,7 @@ public:
|
|
|
760
722
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
761
723
|
::cuda::std::enable_if_t< //
|
|
762
724
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
763
|
-
Sort(KeyT (&keys)[
|
|
725
|
+
Sort(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
|
|
764
726
|
{
|
|
765
727
|
Sort(keys, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer));
|
|
766
728
|
}
|
|
@@ -824,11 +786,8 @@ public:
|
|
|
824
786
|
//!
|
|
825
787
|
//! @param[in] end_bit
|
|
826
788
|
//! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
|
|
827
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
828
|
-
|
|
829
|
-
ValueT (&values)[ITEMS_PER_THREAD],
|
|
830
|
-
int begin_bit = 0,
|
|
831
|
-
int end_bit = sizeof(KeyT) * 8)
|
|
789
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Sort(
|
|
790
|
+
KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
|
|
832
791
|
{
|
|
833
792
|
SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>);
|
|
834
793
|
}
|
|
@@ -901,11 +860,8 @@ public:
|
|
|
901
860
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
902
861
|
::cuda::std::enable_if_t< //
|
|
903
862
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
904
|
-
Sort(
|
|
905
|
-
|
|
906
|
-
DecomposerT decomposer,
|
|
907
|
-
int begin_bit,
|
|
908
|
-
int end_bit)
|
|
863
|
+
Sort(
|
|
864
|
+
KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
|
|
909
865
|
{
|
|
910
866
|
SortBlocked(
|
|
911
867
|
keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
|
|
@@ -971,7 +927,7 @@ public:
|
|
|
971
927
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
972
928
|
::cuda::std::enable_if_t< //
|
|
973
929
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
974
|
-
Sort(KeyT (&keys)[
|
|
930
|
+
Sort(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
|
|
975
931
|
{
|
|
976
932
|
Sort(keys, values, decomposer, 0, detail::radix::traits_t<KeyT>::default_end_bit(decomposer));
|
|
977
933
|
}
|
|
@@ -1025,9 +981,9 @@ public:
|
|
|
1025
981
|
//! @param[in] end_bit
|
|
1026
982
|
//! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
|
|
1027
983
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1028
|
-
SortDescending(KeyT (&keys)[
|
|
984
|
+
SortDescending(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
|
|
1029
985
|
{
|
|
1030
|
-
NullType values[
|
|
986
|
+
NullType values[ItemsPerThread];
|
|
1031
987
|
|
|
1032
988
|
SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>);
|
|
1033
989
|
}
|
|
@@ -1092,9 +1048,9 @@ public:
|
|
|
1092
1048
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1093
1049
|
::cuda::std::enable_if_t< //
|
|
1094
1050
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1095
|
-
SortDescending(KeyT (&keys)[
|
|
1051
|
+
SortDescending(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
|
|
1096
1052
|
{
|
|
1097
|
-
NullType values[
|
|
1053
|
+
NullType values[ItemsPerThread];
|
|
1098
1054
|
|
|
1099
1055
|
SortBlocked(
|
|
1100
1056
|
keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
|
|
@@ -1152,9 +1108,9 @@ public:
|
|
|
1152
1108
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1153
1109
|
::cuda::std::enable_if_t< //
|
|
1154
1110
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1155
|
-
SortDescending(KeyT (&keys)[
|
|
1111
|
+
SortDescending(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
|
|
1156
1112
|
{
|
|
1157
|
-
NullType values[
|
|
1113
|
+
NullType values[ItemsPerThread];
|
|
1158
1114
|
|
|
1159
1115
|
SortBlocked(
|
|
1160
1116
|
keys,
|
|
@@ -1224,10 +1180,7 @@ public:
|
|
|
1224
1180
|
//! @param[in] end_bit
|
|
1225
1181
|
//! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
|
|
1226
1182
|
_CCCL_DEVICE _CCCL_FORCEINLINE void SortDescending(
|
|
1227
|
-
KeyT (&keys)[
|
|
1228
|
-
ValueT (&values)[ITEMS_PER_THREAD],
|
|
1229
|
-
int begin_bit = 0,
|
|
1230
|
-
int end_bit = sizeof(KeyT) * 8)
|
|
1183
|
+
KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
|
|
1231
1184
|
{
|
|
1232
1185
|
SortBlocked(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>);
|
|
1233
1186
|
}
|
|
@@ -1300,11 +1253,8 @@ public:
|
|
|
1300
1253
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1301
1254
|
::cuda::std::enable_if_t< //
|
|
1302
1255
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1303
|
-
SortDescending(
|
|
1304
|
-
|
|
1305
|
-
DecomposerT decomposer,
|
|
1306
|
-
int begin_bit,
|
|
1307
|
-
int end_bit)
|
|
1256
|
+
SortDescending(
|
|
1257
|
+
KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
|
|
1308
1258
|
{
|
|
1309
1259
|
SortBlocked(
|
|
1310
1260
|
keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
|
|
@@ -1370,7 +1320,7 @@ public:
|
|
|
1370
1320
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1371
1321
|
::cuda::std::enable_if_t< //
|
|
1372
1322
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1373
|
-
SortDescending(KeyT (&keys)[
|
|
1323
|
+
SortDescending(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
|
|
1374
1324
|
{
|
|
1375
1325
|
SortBlocked(
|
|
1376
1326
|
keys,
|
|
@@ -1435,9 +1385,9 @@ public:
|
|
|
1435
1385
|
//! @param[in] end_bit
|
|
1436
1386
|
//! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
|
|
1437
1387
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1438
|
-
SortBlockedToStriped(KeyT (&keys)[
|
|
1388
|
+
SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
|
|
1439
1389
|
{
|
|
1440
|
-
NullType values[
|
|
1390
|
+
NullType values[ItemsPerThread];
|
|
1441
1391
|
|
|
1442
1392
|
SortBlockedToStriped(
|
|
1443
1393
|
keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>);
|
|
@@ -1504,9 +1454,9 @@ public:
|
|
|
1504
1454
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1505
1455
|
::cuda::std::enable_if_t< //
|
|
1506
1456
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1507
|
-
SortBlockedToStriped(KeyT (&keys)[
|
|
1457
|
+
SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
|
|
1508
1458
|
{
|
|
1509
|
-
NullType values[
|
|
1459
|
+
NullType values[ItemsPerThread];
|
|
1510
1460
|
|
|
1511
1461
|
SortBlockedToStriped(
|
|
1512
1462
|
keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
|
|
@@ -1565,9 +1515,9 @@ public:
|
|
|
1565
1515
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1566
1516
|
::cuda::std::enable_if_t< //
|
|
1567
1517
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1568
|
-
SortBlockedToStriped(KeyT (&keys)[
|
|
1518
|
+
SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
|
|
1569
1519
|
{
|
|
1570
|
-
NullType values[
|
|
1520
|
+
NullType values[ItemsPerThread];
|
|
1571
1521
|
|
|
1572
1522
|
SortBlockedToStriped(
|
|
1573
1523
|
keys,
|
|
@@ -1637,10 +1587,7 @@ public:
|
|
|
1637
1587
|
//! @param[in] end_bit
|
|
1638
1588
|
//! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
|
|
1639
1589
|
_CCCL_DEVICE _CCCL_FORCEINLINE void SortBlockedToStriped(
|
|
1640
|
-
KeyT (&keys)[
|
|
1641
|
-
ValueT (&values)[ITEMS_PER_THREAD],
|
|
1642
|
-
int begin_bit = 0,
|
|
1643
|
-
int end_bit = sizeof(KeyT) * 8)
|
|
1590
|
+
KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
|
|
1644
1591
|
{
|
|
1645
1592
|
SortBlockedToStriped(
|
|
1646
1593
|
keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>);
|
|
@@ -1710,11 +1657,8 @@ public:
|
|
|
1710
1657
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1711
1658
|
::cuda::std::enable_if_t< //
|
|
1712
1659
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1713
|
-
SortBlockedToStriped(
|
|
1714
|
-
|
|
1715
|
-
DecomposerT decomposer,
|
|
1716
|
-
int begin_bit,
|
|
1717
|
-
int end_bit)
|
|
1660
|
+
SortBlockedToStriped(
|
|
1661
|
+
KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
|
|
1718
1662
|
{
|
|
1719
1663
|
SortBlockedToStriped(
|
|
1720
1664
|
keys, values, begin_bit, end_bit, ::cuda::std::false_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
|
|
@@ -1776,7 +1720,7 @@ public:
|
|
|
1776
1720
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1777
1721
|
::cuda::std::enable_if_t< //
|
|
1778
1722
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1779
|
-
SortBlockedToStriped(KeyT (&keys)[
|
|
1723
|
+
SortBlockedToStriped(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
|
|
1780
1724
|
{
|
|
1781
1725
|
SortBlockedToStriped(
|
|
1782
1726
|
keys,
|
|
@@ -1837,9 +1781,9 @@ public:
|
|
|
1837
1781
|
//! @param[in] end_bit
|
|
1838
1782
|
//! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
|
|
1839
1783
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1840
|
-
SortDescendingBlockedToStriped(KeyT (&keys)[
|
|
1784
|
+
SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
|
|
1841
1785
|
{
|
|
1842
|
-
NullType values[
|
|
1786
|
+
NullType values[ItemsPerThread];
|
|
1843
1787
|
|
|
1844
1788
|
SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>);
|
|
1845
1789
|
}
|
|
@@ -1905,9 +1849,9 @@ public:
|
|
|
1905
1849
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1906
1850
|
::cuda::std::enable_if_t< //
|
|
1907
1851
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1908
|
-
SortDescendingBlockedToStriped(KeyT (&keys)[
|
|
1852
|
+
SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
|
|
1909
1853
|
{
|
|
1910
|
-
NullType values[
|
|
1854
|
+
NullType values[ItemsPerThread];
|
|
1911
1855
|
|
|
1912
1856
|
SortBlockedToStriped(
|
|
1913
1857
|
keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
|
|
@@ -1966,9 +1910,9 @@ public:
|
|
|
1966
1910
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
1967
1911
|
::cuda::std::enable_if_t< //
|
|
1968
1912
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
1969
|
-
SortDescendingBlockedToStriped(KeyT (&keys)[
|
|
1913
|
+
SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], DecomposerT decomposer)
|
|
1970
1914
|
{
|
|
1971
|
-
NullType values[
|
|
1915
|
+
NullType values[ItemsPerThread];
|
|
1972
1916
|
|
|
1973
1917
|
SortBlockedToStriped(
|
|
1974
1918
|
keys,
|
|
@@ -2038,10 +1982,7 @@ public:
|
|
|
2038
1982
|
//! @param[in] end_bit
|
|
2039
1983
|
//! **[optional]** The past-the-end (most-significant) bit index needed for key comparison
|
|
2040
1984
|
_CCCL_DEVICE _CCCL_FORCEINLINE void SortDescendingBlockedToStriped(
|
|
2041
|
-
KeyT (&keys)[
|
|
2042
|
-
ValueT (&values)[ITEMS_PER_THREAD],
|
|
2043
|
-
int begin_bit = 0,
|
|
2044
|
-
int end_bit = sizeof(KeyT) * 8)
|
|
1985
|
+
KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], int begin_bit = 0, int end_bit = sizeof(KeyT) * 8)
|
|
2045
1986
|
{
|
|
2046
1987
|
SortBlockedToStriped(keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>);
|
|
2047
1988
|
}
|
|
@@ -2111,11 +2052,7 @@ public:
|
|
|
2111
2052
|
::cuda::std::enable_if_t< //
|
|
2112
2053
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
2113
2054
|
SortDescendingBlockedToStriped(
|
|
2114
|
-
KeyT (&keys)[
|
|
2115
|
-
ValueT (&values)[ITEMS_PER_THREAD],
|
|
2116
|
-
DecomposerT decomposer,
|
|
2117
|
-
int begin_bit,
|
|
2118
|
-
int end_bit)
|
|
2055
|
+
KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer, int begin_bit, int end_bit)
|
|
2119
2056
|
{
|
|
2120
2057
|
SortBlockedToStriped(
|
|
2121
2058
|
keys, values, begin_bit, end_bit, ::cuda::std::true_type(), detail::bool_constant_v<KEYS_ONLY>, decomposer);
|
|
@@ -2177,8 +2114,7 @@ public:
|
|
|
2177
2114
|
_CCCL_DEVICE _CCCL_FORCEINLINE //
|
|
2178
2115
|
::cuda::std::enable_if_t< //
|
|
2179
2116
|
!::cuda::std::is_convertible_v<DecomposerT, int>>
|
|
2180
|
-
SortDescendingBlockedToStriped(
|
|
2181
|
-
KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&values)[ITEMS_PER_THREAD], DecomposerT decomposer)
|
|
2117
|
+
SortDescendingBlockedToStriped(KeyT (&keys)[ItemsPerThread], ValueT (&values)[ItemsPerThread], DecomposerT decomposer)
|
|
2182
2118
|
{
|
|
2183
2119
|
SortBlockedToStriped(
|
|
2184
2120
|
keys,
|
|
@@ -1,30 +1,6 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
*
|
|
5
|
-
* Redistribution and use in source and binary forms, with or without
|
|
6
|
-
* modification, are permitted provided that the following conditions are met:
|
|
7
|
-
* * Redistributions of source code must retain the above copyright
|
|
8
|
-
* notice, this list of conditions and the following disclaimer.
|
|
9
|
-
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
-
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
-
* documentation and/or other materials provided with the distribution.
|
|
12
|
-
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
-
* names of its contributors may be used to endorse or promote products
|
|
14
|
-
* derived from this software without specific prior written permission.
|
|
15
|
-
*
|
|
16
|
-
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
-
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
-
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
-
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
-
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
-
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
-
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
-
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
-
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
-
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
-
*
|
|
27
|
-
******************************************************************************/
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
2
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
// SPDX-License-Identifier: BSD-3
|
|
28
4
|
|
|
29
5
|
/**
|
|
30
6
|
* @file
|
|
@@ -65,10 +41,10 @@ CUB_NAMESPACE_BEGIN
|
|
|
65
41
|
//! @tparam T
|
|
66
42
|
//! The data type to be exchanged.
|
|
67
43
|
//!
|
|
68
|
-
//! @tparam
|
|
44
|
+
//! @tparam BlockThreads
|
|
69
45
|
//! The thread block size in threads.
|
|
70
46
|
//!
|
|
71
|
-
template <typename T, int
|
|
47
|
+
template <typename T, int BlockThreads>
|
|
72
48
|
struct BlockRakingLayout
|
|
73
49
|
{
|
|
74
50
|
//---------------------------------------------------------------------
|
|
@@ -78,15 +54,15 @@ struct BlockRakingLayout
|
|
|
78
54
|
enum
|
|
79
55
|
{
|
|
80
56
|
/// The total number of elements that need to be cooperatively reduced
|
|
81
|
-
SHARED_ELEMENTS =
|
|
57
|
+
SHARED_ELEMENTS = BlockThreads,
|
|
82
58
|
|
|
83
59
|
/// Maximum number of warp-synchronous raking threads
|
|
84
|
-
MAX_RAKING_THREADS = ::cuda::std::min(
|
|
60
|
+
MAX_RAKING_THREADS = ::cuda::std::min(BlockThreads, detail::warp_threads),
|
|
85
61
|
|
|
86
62
|
/// Number of raking elements per warp-synchronous raking thread (rounded up)
|
|
87
63
|
SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
|
|
88
64
|
|
|
89
|
-
/// Never use a raking thread that will have no valid data (e.g., when
|
|
65
|
+
/// Never use a raking thread that will have no valid data (e.g., when BlockThreads is 62 and SEGMENT_LENGTH is 2,
|
|
90
66
|
/// we should only use 31 raking threads)
|
|
91
67
|
RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
|
|
92
68
|
|