cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
- cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
- cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
- cuda/cccl/headers/include/cub/config.cuh +2 -26
- cuda/cccl/headers/include/cub/cub.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
- cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
- cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
- cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
- cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
- cuda/cccl/headers/include/cub/util_device.cuh +18 -59
- cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
- cuda/cccl/headers/include/cub/util_math.cuh +2 -28
- cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
- cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
- cuda/cccl/headers/include/cub/util_type.cuh +5 -32
- cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
- cuda/cccl/headers/include/cub/version.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
- cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
- cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
- cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
- cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
- cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
- cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
- cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
- cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
- cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
- cuda/cccl/headers/include/cuda/barrier +42 -16
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/memory_resource +6 -1
- cuda/cccl/headers/include/cuda/numeric +2 -0
- cuda/cccl/headers/include/cuda/pipeline +3 -2
- cuda/cccl/headers/include/cuda/ptx +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
- cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
- cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
- cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
- cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
- cuda/cccl/headers/include/cuda/std/__new_ +1 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
- cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
- cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
- cuda/cccl/headers/include/cuda/std/__random_ +2 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
- cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
- cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
- cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
- cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
- cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
- cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
- cuda/cccl/headers/include/cuda/std/array +1 -1
- cuda/cccl/headers/include/cuda/std/atomic +1 -1
- cuda/cccl/headers/include/cuda/std/bitset +2 -10
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
- cuda/cccl/headers/include/cuda/std/functional +1 -1
- cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
- cuda/cccl/headers/include/cuda/std/iterator +1 -1
- cuda/cccl/headers/include/cuda/std/numbers +0 -2
- cuda/cccl/headers/include/cuda/std/ratio +2 -2
- cuda/cccl/headers/include/cuda/std/span +2 -2
- cuda/cccl/headers/include/cuda/std/string_view +24 -42
- cuda/cccl/headers/include/cuda/std/tuple +18 -1
- cuda/cccl/headers/include/cuda/std/type_traits +0 -1
- cuda/cccl/headers/include/cuda/std/variant +8 -1
- cuda/cccl/headers/include/nv/target +2 -6
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
- cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
- cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
- cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
- cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
- cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
- cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
- cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
- cuda/cccl/headers/include/thrust/functional.h +0 -2
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/new.h +0 -2
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
- cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
- cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
- cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
- cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
- cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
- cuda/cccl/headers/include/thrust/transform.h +14 -3
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
- cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
- cuda/cccl/headers/include/thrust/zip_function.h +2 -28
- cuda/compute/__init__.py +4 -0
- cuda/compute/_bindings.pyi +26 -3
- cuda/compute/_bindings_impl.pyx +143 -1
- cuda/compute/algorithms/__init__.py +9 -5
- cuda/compute/algorithms/_sort/__init__.py +23 -0
- cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
- cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
- cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
- cuda/compute/algorithms/_sort/_sort_common.py +52 -0
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda_cccl-0.3.4.dist-info/METADATA +78 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
- cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
- cuda_cccl-0.3.2.dist-info/METADATA +0 -42
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,30 +1,6 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
*
|
|
5
|
-
* Redistribution and use in source and binary forms, with or without
|
|
6
|
-
* modification, are permitted provided that the following conditions are met:
|
|
7
|
-
* * Redistributions of source code must retain the above copyright
|
|
8
|
-
* notice, this list of conditions and the following disclaimer.
|
|
9
|
-
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
-
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
-
* documentation and/or other materials provided with the distribution.
|
|
12
|
-
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
-
* names of its contributors may be used to endorse or promote products
|
|
14
|
-
* derived from this software without specific prior written permission.
|
|
15
|
-
*
|
|
16
|
-
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
-
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
-
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
-
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
-
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
-
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
-
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
-
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
-
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
-
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
-
*
|
|
27
|
-
******************************************************************************/
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
2
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
// SPDX-License-Identifier: BSD-3
|
|
28
4
|
|
|
29
5
|
//! @file
|
|
30
6
|
//! The cub::BlockExchange class provides :ref:`collective <collective-primitives>` methods for
|
|
@@ -47,6 +23,7 @@
|
|
|
47
23
|
#include <cub/util_type.cuh>
|
|
48
24
|
#include <cub/warp/warp_exchange.cuh>
|
|
49
25
|
|
|
26
|
+
#include <cuda/__cmath/pow2.h>
|
|
50
27
|
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
51
28
|
#include <cuda/std/__algorithm/min.h>
|
|
52
29
|
#include <cuda/std/__type_traits/integral_constant.h>
|
|
@@ -123,48 +100,43 @@ CUB_NAMESPACE_BEGIN
|
|
|
123
100
|
//! @tparam T
|
|
124
101
|
//! The data type to be exchanged
|
|
125
102
|
//!
|
|
126
|
-
//! @tparam
|
|
103
|
+
//! @tparam BlockDimX
|
|
127
104
|
//! The thread block length in threads along the X dimension
|
|
128
105
|
//!
|
|
129
|
-
//! @tparam
|
|
106
|
+
//! @tparam ItemsPerThread
|
|
130
107
|
//! The number of items partitioned onto each thread.
|
|
131
108
|
//!
|
|
132
|
-
//! @tparam
|
|
109
|
+
//! @tparam WarpTimeSlicing
|
|
133
110
|
//! **[optional]** When `true`, only use enough shared memory for a single warp's worth of
|
|
134
111
|
//! tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint
|
|
135
112
|
//! at the expense of decreased parallelism. (Default: false)
|
|
136
113
|
//!
|
|
137
|
-
//! @tparam
|
|
114
|
+
//! @tparam BlockDimY
|
|
138
115
|
//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
|
|
139
116
|
//!
|
|
140
|
-
//! @tparam
|
|
117
|
+
//! @tparam BlockDimZ
|
|
141
118
|
//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
|
|
142
119
|
//!
|
|
143
|
-
template <typename T,
|
|
144
|
-
int BLOCK_DIM_X,
|
|
145
|
-
int ITEMS_PER_THREAD,
|
|
146
|
-
bool WARP_TIME_SLICING = false,
|
|
147
|
-
int BLOCK_DIM_Y = 1,
|
|
148
|
-
int BLOCK_DIM_Z = 1>
|
|
120
|
+
template <typename T, int BlockDimX, int ItemsPerThread, bool WarpTimeSlicing = false, int BlockDimY = 1, int BlockDimZ = 1>
|
|
149
121
|
class BlockExchange
|
|
150
122
|
{
|
|
151
|
-
static constexpr int BLOCK_THREADS =
|
|
123
|
+
static constexpr int BLOCK_THREADS = BlockDimX * BlockDimY * BlockDimZ; ///< The thread block size in threads
|
|
152
124
|
static constexpr int WARP_THREADS = detail::warp_threads;
|
|
153
125
|
static constexpr int WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS; // TODO(bgruber): use ceil_div in
|
|
154
126
|
// C++14
|
|
155
127
|
static constexpr int LOG_SMEM_BANKS = detail::log2_smem_banks;
|
|
156
128
|
|
|
157
|
-
static constexpr int TILE_ITEMS = BLOCK_THREADS *
|
|
158
|
-
static constexpr int TIME_SLICES =
|
|
129
|
+
static constexpr int TILE_ITEMS = BLOCK_THREADS * ItemsPerThread;
|
|
130
|
+
static constexpr int TIME_SLICES = WarpTimeSlicing ? WARPS : 1;
|
|
159
131
|
static constexpr int TIME_SLICED_THREADS =
|
|
160
|
-
|
|
161
|
-
static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS *
|
|
132
|
+
WarpTimeSlicing ? ::cuda::std::min(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS;
|
|
133
|
+
static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ItemsPerThread;
|
|
162
134
|
static constexpr int WARP_TIME_SLICED_THREADS = ::cuda::std::min(BLOCK_THREADS, WARP_THREADS);
|
|
163
|
-
static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS *
|
|
135
|
+
static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ItemsPerThread;
|
|
164
136
|
|
|
165
137
|
// Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise
|
|
166
138
|
// we can typically use 128b loads)
|
|
167
|
-
static constexpr bool INSERT_PADDING =
|
|
139
|
+
static constexpr bool INSERT_PADDING = ItemsPerThread > 4 && ::cuda::is_power_of_two(ItemsPerThread);
|
|
168
140
|
static constexpr int PADDING_ITEMS = INSERT_PADDING ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0;
|
|
169
141
|
|
|
170
142
|
/// Shared memory storage layout type
|
|
@@ -181,7 +153,7 @@ private:
|
|
|
181
153
|
_TempStorage& temp_storage;
|
|
182
154
|
|
|
183
155
|
// TODO(bgruber): can we use signed int here? Only these variables are unsigned:
|
|
184
|
-
unsigned int linear_tid = RowMajorTid(
|
|
156
|
+
unsigned int linear_tid = RowMajorTid(BlockDimX, BlockDimY, BlockDimZ);
|
|
185
157
|
unsigned int lane_id = ::cuda::ptx::get_sreg_laneid();
|
|
186
158
|
unsigned int warp_id = WARPS == 1 ? 0 : linear_tid / WARP_THREADS;
|
|
187
159
|
unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS;
|
|
@@ -203,14 +175,14 @@ private:
|
|
|
203
175
|
//! Items to exchange, converting between **blocked** and **striped** arrangements.
|
|
204
176
|
template <typename OutputT>
|
|
205
177
|
_CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(
|
|
206
|
-
const T (&input_items)[
|
|
207
|
-
OutputT (&output_items)[
|
|
178
|
+
const T (&input_items)[ItemsPerThread],
|
|
179
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
208
180
|
::cuda::std::false_type /*time_slicing*/)
|
|
209
181
|
{
|
|
210
182
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
211
|
-
for (int i = 0; i <
|
|
183
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
212
184
|
{
|
|
213
|
-
int item_offset = linear_tid *
|
|
185
|
+
int item_offset = linear_tid * ItemsPerThread + i;
|
|
214
186
|
if constexpr (INSERT_PADDING)
|
|
215
187
|
{
|
|
216
188
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -221,7 +193,7 @@ private:
|
|
|
221
193
|
__syncthreads();
|
|
222
194
|
|
|
223
195
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
224
|
-
for (int i = 0; i <
|
|
196
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
225
197
|
{
|
|
226
198
|
int item_offset = i * BLOCK_THREADS + linear_tid;
|
|
227
199
|
if constexpr (INSERT_PADDING)
|
|
@@ -242,11 +214,11 @@ private:
|
|
|
242
214
|
//! Items to exchange, converting between **blocked** and **striped** arrangements.
|
|
243
215
|
template <typename OutputT>
|
|
244
216
|
_CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(
|
|
245
|
-
const T (&input_items)[
|
|
246
|
-
OutputT (&output_items)[
|
|
217
|
+
const T (&input_items)[ItemsPerThread],
|
|
218
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
247
219
|
::cuda::std::true_type /*time_slicing*/)
|
|
248
220
|
{
|
|
249
|
-
T temp_items[
|
|
221
|
+
T temp_items[ItemsPerThread];
|
|
250
222
|
|
|
251
223
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
252
224
|
for (int slice = 0; slice < TIME_SLICES; slice++)
|
|
@@ -259,9 +231,9 @@ private:
|
|
|
259
231
|
if (warp_id == slice)
|
|
260
232
|
{
|
|
261
233
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
262
|
-
for (int i = 0; i <
|
|
234
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
263
235
|
{
|
|
264
|
-
int item_offset = lane_id *
|
|
236
|
+
int item_offset = lane_id * ItemsPerThread + i;
|
|
265
237
|
if constexpr (INSERT_PADDING)
|
|
266
238
|
{
|
|
267
239
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -273,7 +245,7 @@ private:
|
|
|
273
245
|
__syncthreads();
|
|
274
246
|
|
|
275
247
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
276
|
-
for (int i = 0; i <
|
|
248
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
277
249
|
{
|
|
278
250
|
// Read a strip of items
|
|
279
251
|
const int strip_offset = i * BLOCK_THREADS;
|
|
@@ -296,7 +268,7 @@ private:
|
|
|
296
268
|
|
|
297
269
|
// Copy
|
|
298
270
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
299
|
-
for (int i = 0; i <
|
|
271
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
300
272
|
{
|
|
301
273
|
output_items[i] = temp_items[i];
|
|
302
274
|
}
|
|
@@ -312,14 +284,14 @@ private:
|
|
|
312
284
|
//! Items to exchange, converting between **blocked** and **striped** arrangements.
|
|
313
285
|
template <typename OutputT>
|
|
314
286
|
_CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(
|
|
315
|
-
const T (&input_items)[
|
|
316
|
-
OutputT (&output_items)[
|
|
287
|
+
const T (&input_items)[ItemsPerThread],
|
|
288
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
317
289
|
::cuda::std::false_type /*time_slicing*/)
|
|
318
290
|
{
|
|
319
291
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
320
|
-
for (int i = 0; i <
|
|
292
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
321
293
|
{
|
|
322
|
-
int item_offset = warp_offset + i + (lane_id *
|
|
294
|
+
int item_offset = warp_offset + i + (lane_id * ItemsPerThread);
|
|
323
295
|
if constexpr (INSERT_PADDING)
|
|
324
296
|
{
|
|
325
297
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -330,7 +302,7 @@ private:
|
|
|
330
302
|
__syncwarp(0xffffffff);
|
|
331
303
|
|
|
332
304
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
333
|
-
for (int i = 0; i <
|
|
305
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
334
306
|
{
|
|
335
307
|
int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id;
|
|
336
308
|
if constexpr (INSERT_PADDING)
|
|
@@ -351,16 +323,16 @@ private:
|
|
|
351
323
|
//! Items to exchange, converting between **blocked** and **striped** arrangements.
|
|
352
324
|
template <typename OutputT>
|
|
353
325
|
_CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(
|
|
354
|
-
const T (&input_items)[
|
|
355
|
-
OutputT (&output_items)[
|
|
326
|
+
const T (&input_items)[ItemsPerThread],
|
|
327
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
356
328
|
::cuda::std::true_type /*time_slicing*/)
|
|
357
329
|
{
|
|
358
330
|
if (warp_id == 0)
|
|
359
331
|
{
|
|
360
332
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
361
|
-
for (int i = 0; i <
|
|
333
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
362
334
|
{
|
|
363
|
-
int item_offset = i + lane_id *
|
|
335
|
+
int item_offset = i + lane_id * ItemsPerThread;
|
|
364
336
|
if constexpr (INSERT_PADDING)
|
|
365
337
|
{
|
|
366
338
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -371,7 +343,7 @@ private:
|
|
|
371
343
|
__syncwarp(0xffffffff);
|
|
372
344
|
|
|
373
345
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
374
|
-
for (int i = 0; i <
|
|
346
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
375
347
|
{
|
|
376
348
|
int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id;
|
|
377
349
|
if constexpr (INSERT_PADDING)
|
|
@@ -390,9 +362,9 @@ private:
|
|
|
390
362
|
if (warp_id == slice)
|
|
391
363
|
{
|
|
392
364
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
393
|
-
for (int i = 0; i <
|
|
365
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
394
366
|
{
|
|
395
|
-
int item_offset = i + lane_id *
|
|
367
|
+
int item_offset = i + lane_id * ItemsPerThread;
|
|
396
368
|
if constexpr (INSERT_PADDING)
|
|
397
369
|
{
|
|
398
370
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -403,7 +375,7 @@ private:
|
|
|
403
375
|
__syncwarp(0xffffffff);
|
|
404
376
|
|
|
405
377
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
406
|
-
for (int i = 0; i <
|
|
378
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
407
379
|
{
|
|
408
380
|
int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id;
|
|
409
381
|
if constexpr (INSERT_PADDING)
|
|
@@ -426,12 +398,12 @@ private:
|
|
|
426
398
|
//! Items to exchange, converting between **blocked** and **striped** arrangements.
|
|
427
399
|
template <typename OutputT>
|
|
428
400
|
_CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(
|
|
429
|
-
const T (&input_items)[
|
|
430
|
-
OutputT (&output_items)[
|
|
401
|
+
const T (&input_items)[ItemsPerThread],
|
|
402
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
431
403
|
::cuda::std::false_type /*time_slicing*/)
|
|
432
404
|
{
|
|
433
405
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
434
|
-
for (int i = 0; i <
|
|
406
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
435
407
|
{
|
|
436
408
|
int item_offset = i * BLOCK_THREADS + linear_tid;
|
|
437
409
|
if constexpr (INSERT_PADDING)
|
|
@@ -445,9 +417,9 @@ private:
|
|
|
445
417
|
|
|
446
418
|
// No timeslicing
|
|
447
419
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
448
|
-
for (int i = 0; i <
|
|
420
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
449
421
|
{
|
|
450
|
-
int item_offset = linear_tid *
|
|
422
|
+
int item_offset = linear_tid * ItemsPerThread + i;
|
|
451
423
|
if constexpr (INSERT_PADDING)
|
|
452
424
|
{
|
|
453
425
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -466,12 +438,12 @@ private:
|
|
|
466
438
|
//! Items to exchange, converting between **blocked** and **striped** arrangements.
|
|
467
439
|
template <typename OutputT>
|
|
468
440
|
_CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(
|
|
469
|
-
const T (&input_items)[
|
|
470
|
-
OutputT (&output_items)[
|
|
441
|
+
const T (&input_items)[ItemsPerThread],
|
|
442
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
471
443
|
::cuda::std::true_type /*time_slicing*/)
|
|
472
444
|
{
|
|
473
445
|
// Warp time-slicing
|
|
474
|
-
T temp_items[
|
|
446
|
+
T temp_items[ItemsPerThread];
|
|
475
447
|
|
|
476
448
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
477
449
|
for (int slice = 0; slice < TIME_SLICES; slice++)
|
|
@@ -482,7 +454,7 @@ private:
|
|
|
482
454
|
__syncthreads();
|
|
483
455
|
|
|
484
456
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
485
|
-
for (int i = 0; i <
|
|
457
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
486
458
|
{
|
|
487
459
|
// Write a strip of items
|
|
488
460
|
const int strip_offset = i * BLOCK_THREADS;
|
|
@@ -507,9 +479,9 @@ private:
|
|
|
507
479
|
if (warp_id == slice)
|
|
508
480
|
{
|
|
509
481
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
510
|
-
for (int i = 0; i <
|
|
482
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
511
483
|
{
|
|
512
|
-
int item_offset = lane_id *
|
|
484
|
+
int item_offset = lane_id * ItemsPerThread + i;
|
|
513
485
|
if constexpr (INSERT_PADDING)
|
|
514
486
|
{
|
|
515
487
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -521,7 +493,7 @@ private:
|
|
|
521
493
|
|
|
522
494
|
// Copy
|
|
523
495
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
524
|
-
for (int i = 0; i <
|
|
496
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
525
497
|
{
|
|
526
498
|
output_items[i] = temp_items[i];
|
|
527
499
|
}
|
|
@@ -537,12 +509,12 @@ private:
|
|
|
537
509
|
//! Items to exchange, converting between **blocked** and **striped** arrangements.
|
|
538
510
|
template <typename OutputT>
|
|
539
511
|
_CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(
|
|
540
|
-
const T (&input_items)[
|
|
541
|
-
OutputT (&output_items)[
|
|
512
|
+
const T (&input_items)[ItemsPerThread],
|
|
513
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
542
514
|
::cuda::std::false_type /*time_slicing*/)
|
|
543
515
|
{
|
|
544
516
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
545
|
-
for (int i = 0; i <
|
|
517
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
546
518
|
{
|
|
547
519
|
int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id;
|
|
548
520
|
if constexpr (INSERT_PADDING)
|
|
@@ -555,9 +527,9 @@ private:
|
|
|
555
527
|
__syncwarp(0xffffffff);
|
|
556
528
|
|
|
557
529
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
558
|
-
for (int i = 0; i <
|
|
530
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
559
531
|
{
|
|
560
|
-
int item_offset = warp_offset + i + (lane_id *
|
|
532
|
+
int item_offset = warp_offset + i + (lane_id * ItemsPerThread);
|
|
561
533
|
if constexpr (INSERT_PADDING)
|
|
562
534
|
{
|
|
563
535
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -576,8 +548,8 @@ private:
|
|
|
576
548
|
//! Items to exchange, converting between **blocked** and **striped** arrangements.
|
|
577
549
|
template <typename OutputT>
|
|
578
550
|
_CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(
|
|
579
|
-
const T (&input_items)[
|
|
580
|
-
OutputT (&output_items)[
|
|
551
|
+
const T (&input_items)[ItemsPerThread],
|
|
552
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
581
553
|
::cuda::std::true_type /*time_slicing*/)
|
|
582
554
|
{
|
|
583
555
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
@@ -588,7 +560,7 @@ private:
|
|
|
588
560
|
if (warp_id == slice)
|
|
589
561
|
{
|
|
590
562
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
591
|
-
for (int i = 0; i <
|
|
563
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
592
564
|
{
|
|
593
565
|
int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id;
|
|
594
566
|
if constexpr (INSERT_PADDING)
|
|
@@ -601,9 +573,9 @@ private:
|
|
|
601
573
|
__syncwarp(0xffffffff);
|
|
602
574
|
|
|
603
575
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
604
|
-
for (int i = 0; i <
|
|
576
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
605
577
|
{
|
|
606
|
-
int item_offset = i + lane_id *
|
|
578
|
+
int item_offset = i + lane_id * ItemsPerThread;
|
|
607
579
|
if constexpr (INSERT_PADDING)
|
|
608
580
|
{
|
|
609
581
|
item_offset += item_offset >> LOG_SMEM_BANKS;
|
|
@@ -626,13 +598,13 @@ private:
|
|
|
626
598
|
//! Corresponding scatter ranks
|
|
627
599
|
template <typename OutputT, typename OffsetT>
|
|
628
600
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(
|
|
629
|
-
const T (&input_items)[
|
|
630
|
-
OutputT (&output_items)[
|
|
631
|
-
OffsetT (&ranks)[
|
|
601
|
+
const T (&input_items)[ItemsPerThread],
|
|
602
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
603
|
+
OffsetT (&ranks)[ItemsPerThread],
|
|
632
604
|
::cuda::std::false_type /*time_slicing*/)
|
|
633
605
|
{
|
|
634
606
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
635
|
-
for (int i = 0; i <
|
|
607
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
636
608
|
{
|
|
637
609
|
int item_offset = ranks[i];
|
|
638
610
|
if constexpr (INSERT_PADDING)
|
|
@@ -645,9 +617,9 @@ private:
|
|
|
645
617
|
__syncthreads();
|
|
646
618
|
|
|
647
619
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
648
|
-
for (int i = 0; i <
|
|
620
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
649
621
|
{
|
|
650
|
-
int item_offset = linear_tid *
|
|
622
|
+
int item_offset = linear_tid * ItemsPerThread + i;
|
|
651
623
|
if constexpr (INSERT_PADDING)
|
|
652
624
|
{
|
|
653
625
|
item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset;
|
|
@@ -668,12 +640,12 @@ private:
|
|
|
668
640
|
//! Corresponding scatter ranks
|
|
669
641
|
template <typename OutputT, typename OffsetT>
|
|
670
642
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(
|
|
671
|
-
const T (&input_items)[
|
|
672
|
-
OutputT (&output_items)[
|
|
673
|
-
OffsetT ranks[
|
|
643
|
+
const T (&input_items)[ItemsPerThread],
|
|
644
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
645
|
+
OffsetT ranks[ItemsPerThread],
|
|
674
646
|
::cuda::std::true_type /*time_slicing*/)
|
|
675
647
|
{
|
|
676
|
-
T temp_items[
|
|
648
|
+
T temp_items[ItemsPerThread];
|
|
677
649
|
|
|
678
650
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
679
651
|
for (int slice = 0; slice < TIME_SLICES; slice++)
|
|
@@ -683,7 +655,7 @@ private:
|
|
|
683
655
|
const int slice_offset = TIME_SLICED_ITEMS * slice;
|
|
684
656
|
|
|
685
657
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
686
|
-
for (int i = 0; i <
|
|
658
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
687
659
|
{
|
|
688
660
|
int item_offset = ranks[i] - slice_offset;
|
|
689
661
|
if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS)
|
|
@@ -701,9 +673,9 @@ private:
|
|
|
701
673
|
if (warp_id == slice)
|
|
702
674
|
{
|
|
703
675
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
704
|
-
for (int i = 0; i <
|
|
676
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
705
677
|
{
|
|
706
|
-
int item_offset = lane_id *
|
|
678
|
+
int item_offset = lane_id * ItemsPerThread + i;
|
|
707
679
|
if constexpr (INSERT_PADDING)
|
|
708
680
|
{
|
|
709
681
|
item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset;
|
|
@@ -715,7 +687,7 @@ private:
|
|
|
715
687
|
|
|
716
688
|
// Copy
|
|
717
689
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
718
|
-
for (int i = 0; i <
|
|
690
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
719
691
|
{
|
|
720
692
|
output_items[i] = temp_items[i];
|
|
721
693
|
}
|
|
@@ -733,13 +705,13 @@ private:
|
|
|
733
705
|
//! Corresponding scatter ranks
|
|
734
706
|
template <typename OutputT, typename OffsetT>
|
|
735
707
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(
|
|
736
|
-
const T (&input_items)[
|
|
737
|
-
OutputT (&output_items)[
|
|
738
|
-
OffsetT (&ranks)[
|
|
708
|
+
const T (&input_items)[ItemsPerThread],
|
|
709
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
710
|
+
OffsetT (&ranks)[ItemsPerThread],
|
|
739
711
|
::cuda::std::false_type /*time_slicing*/)
|
|
740
712
|
{
|
|
741
713
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
742
|
-
for (int i = 0; i <
|
|
714
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
743
715
|
{
|
|
744
716
|
int item_offset = ranks[i];
|
|
745
717
|
if constexpr (INSERT_PADDING)
|
|
@@ -752,7 +724,7 @@ private:
|
|
|
752
724
|
__syncthreads();
|
|
753
725
|
|
|
754
726
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
755
|
-
for (int i = 0; i <
|
|
727
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
756
728
|
{
|
|
757
729
|
int item_offset = i * BLOCK_THREADS + linear_tid;
|
|
758
730
|
if constexpr (INSERT_PADDING)
|
|
@@ -775,12 +747,12 @@ private:
|
|
|
775
747
|
//! Corresponding scatter ranks
|
|
776
748
|
template <typename OutputT, typename OffsetT>
|
|
777
749
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(
|
|
778
|
-
const T (&input_items)[
|
|
779
|
-
OutputT (&output_items)[
|
|
780
|
-
OffsetT (&ranks)[
|
|
750
|
+
const T (&input_items)[ItemsPerThread],
|
|
751
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
752
|
+
OffsetT (&ranks)[ItemsPerThread],
|
|
781
753
|
::cuda::std::true_type /*time_slicing*/)
|
|
782
754
|
{
|
|
783
|
-
T temp_items[
|
|
755
|
+
T temp_items[ItemsPerThread];
|
|
784
756
|
|
|
785
757
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
786
758
|
for (int slice = 0; slice < TIME_SLICES; slice++)
|
|
@@ -791,7 +763,7 @@ private:
|
|
|
791
763
|
__syncthreads();
|
|
792
764
|
|
|
793
765
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
794
|
-
for (int i = 0; i <
|
|
766
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
795
767
|
{
|
|
796
768
|
int item_offset = ranks[i] - slice_offset;
|
|
797
769
|
if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS)
|
|
@@ -807,7 +779,7 @@ private:
|
|
|
807
779
|
__syncthreads();
|
|
808
780
|
|
|
809
781
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
810
|
-
for (int i = 0; i <
|
|
782
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
811
783
|
{
|
|
812
784
|
// Read a strip of items
|
|
813
785
|
const int strip_offset = i * BLOCK_THREADS;
|
|
@@ -830,7 +802,7 @@ private:
|
|
|
830
802
|
|
|
831
803
|
// Copy
|
|
832
804
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
833
|
-
for (int i = 0; i <
|
|
805
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
834
806
|
{
|
|
835
807
|
output_items[i] = temp_items[i];
|
|
836
808
|
}
|
|
@@ -898,9 +870,9 @@ public:
|
|
|
898
870
|
//! Items from exchange, converting between **striped** and **blocked** arrangements.
|
|
899
871
|
template <typename OutputT>
|
|
900
872
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
901
|
-
StripedToBlocked(const T (&input_items)[
|
|
873
|
+
StripedToBlocked(const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread])
|
|
902
874
|
{
|
|
903
|
-
StripedToBlocked(input_items, output_items, detail::bool_constant_v<
|
|
875
|
+
StripedToBlocked(input_items, output_items, detail::bool_constant_v<WarpTimeSlicing>);
|
|
904
876
|
}
|
|
905
877
|
|
|
906
878
|
//! @rst
|
|
@@ -950,9 +922,9 @@ public:
|
|
|
950
922
|
//! Items from exchange, converting between **striped** and **blocked** arrangements.
|
|
951
923
|
template <typename OutputT>
|
|
952
924
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
953
|
-
BlockedToStriped(const T (&input_items)[
|
|
925
|
+
BlockedToStriped(const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread])
|
|
954
926
|
{
|
|
955
|
-
BlockedToStriped(input_items, output_items, detail::bool_constant_v<
|
|
927
|
+
BlockedToStriped(input_items, output_items, detail::bool_constant_v<WarpTimeSlicing>);
|
|
956
928
|
}
|
|
957
929
|
|
|
958
930
|
//! @rst
|
|
@@ -1002,9 +974,9 @@ public:
|
|
|
1002
974
|
//! Items from exchange, converting between **striped** and **blocked** arrangements.
|
|
1003
975
|
template <typename OutputT>
|
|
1004
976
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1005
|
-
WarpStripedToBlocked(const T (&input_items)[
|
|
977
|
+
WarpStripedToBlocked(const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread])
|
|
1006
978
|
{
|
|
1007
|
-
WarpStripedToBlocked(input_items, output_items, detail::bool_constant_v<
|
|
979
|
+
WarpStripedToBlocked(input_items, output_items, detail::bool_constant_v<WarpTimeSlicing>);
|
|
1008
980
|
}
|
|
1009
981
|
|
|
1010
982
|
//! @rst
|
|
@@ -1057,9 +1029,9 @@ public:
|
|
|
1057
1029
|
//! Items from exchange, converting between **striped** and **blocked** arrangements.
|
|
1058
1030
|
template <typename OutputT>
|
|
1059
1031
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1060
|
-
BlockedToWarpStriped(const T (&input_items)[
|
|
1032
|
+
BlockedToWarpStriped(const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread])
|
|
1061
1033
|
{
|
|
1062
|
-
BlockedToWarpStriped(input_items, output_items, detail::bool_constant_v<
|
|
1034
|
+
BlockedToWarpStriped(input_items, output_items, detail::bool_constant_v<WarpTimeSlicing>);
|
|
1063
1035
|
}
|
|
1064
1036
|
|
|
1065
1037
|
//! @} end member group
|
|
@@ -1085,11 +1057,9 @@ public:
|
|
|
1085
1057
|
//! Corresponding scatter ranks
|
|
1086
1058
|
template <typename OutputT, typename OffsetT>
|
|
1087
1059
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(
|
|
1088
|
-
const T (&input_items)[
|
|
1089
|
-
OutputT (&output_items)[ITEMS_PER_THREAD],
|
|
1090
|
-
OffsetT (&ranks)[ITEMS_PER_THREAD])
|
|
1060
|
+
const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
|
|
1091
1061
|
{
|
|
1092
|
-
ScatterToBlocked(input_items, output_items, ranks, detail::bool_constant_v<
|
|
1062
|
+
ScatterToBlocked(input_items, output_items, ranks, detail::bool_constant_v<WarpTimeSlicing>);
|
|
1093
1063
|
}
|
|
1094
1064
|
|
|
1095
1065
|
//! @rst
|
|
@@ -1112,11 +1082,9 @@ public:
|
|
|
1112
1082
|
//! Corresponding scatter ranks
|
|
1113
1083
|
template <typename OutputT, typename OffsetT>
|
|
1114
1084
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(
|
|
1115
|
-
const T (&input_items)[
|
|
1116
|
-
OutputT (&output_items)[ITEMS_PER_THREAD],
|
|
1117
|
-
OffsetT (&ranks)[ITEMS_PER_THREAD])
|
|
1085
|
+
const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
|
|
1118
1086
|
{
|
|
1119
|
-
ScatterToStriped(input_items, output_items, ranks, detail::bool_constant_v<
|
|
1087
|
+
ScatterToStriped(input_items, output_items, ranks, detail::bool_constant_v<WarpTimeSlicing>);
|
|
1120
1088
|
}
|
|
1121
1089
|
|
|
1122
1090
|
//! @rst
|
|
@@ -1139,12 +1107,10 @@ public:
|
|
|
1139
1107
|
//! Corresponding scatter ranks
|
|
1140
1108
|
template <typename OutputT, typename OffsetT>
|
|
1141
1109
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedGuarded(
|
|
1142
|
-
const T (&input_items)[
|
|
1143
|
-
OutputT (&output_items)[ITEMS_PER_THREAD],
|
|
1144
|
-
OffsetT (&ranks)[ITEMS_PER_THREAD])
|
|
1110
|
+
const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
|
|
1145
1111
|
{
|
|
1146
1112
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
1147
|
-
for (int i = 0; i <
|
|
1113
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
1148
1114
|
{
|
|
1149
1115
|
int item_offset = ranks[i];
|
|
1150
1116
|
if constexpr (INSERT_PADDING)
|
|
@@ -1160,7 +1126,7 @@ public:
|
|
|
1160
1126
|
__syncthreads();
|
|
1161
1127
|
|
|
1162
1128
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
1163
|
-
for (int i = 0; i <
|
|
1129
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
1164
1130
|
{
|
|
1165
1131
|
int item_offset = i * BLOCK_THREADS + linear_tid;
|
|
1166
1132
|
if constexpr (INSERT_PADDING)
|
|
@@ -1197,13 +1163,13 @@ public:
|
|
|
1197
1163
|
//! Corresponding flag denoting item validity
|
|
1198
1164
|
template <typename OutputT, typename OffsetT, typename ValidFlag>
|
|
1199
1165
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged(
|
|
1200
|
-
const T (&input_items)[
|
|
1201
|
-
OutputT (&output_items)[
|
|
1202
|
-
OffsetT (&ranks)[
|
|
1203
|
-
ValidFlag (&is_valid)[
|
|
1166
|
+
const T (&input_items)[ItemsPerThread],
|
|
1167
|
+
OutputT (&output_items)[ItemsPerThread],
|
|
1168
|
+
OffsetT (&ranks)[ItemsPerThread],
|
|
1169
|
+
ValidFlag (&is_valid)[ItemsPerThread])
|
|
1204
1170
|
{
|
|
1205
1171
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
1206
|
-
for (int i = 0; i <
|
|
1172
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
1207
1173
|
{
|
|
1208
1174
|
int item_offset = ranks[i];
|
|
1209
1175
|
if constexpr (INSERT_PADDING)
|
|
@@ -1219,7 +1185,7 @@ public:
|
|
|
1219
1185
|
__syncthreads();
|
|
1220
1186
|
|
|
1221
1187
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
1222
|
-
for (int i = 0; i <
|
|
1188
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
1223
1189
|
{
|
|
1224
1190
|
int item_offset = i * BLOCK_THREADS + linear_tid;
|
|
1225
1191
|
if constexpr (INSERT_PADDING)
|
|
@@ -1236,28 +1202,28 @@ public:
|
|
|
1236
1202
|
|
|
1237
1203
|
/// @param[in-out] items
|
|
1238
1204
|
/// Items to exchange, converting between **striped** and **blocked** arrangements.
|
|
1239
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(T (&items)[
|
|
1205
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(T (&items)[ItemsPerThread])
|
|
1240
1206
|
{
|
|
1241
1207
|
StripedToBlocked(items, items);
|
|
1242
1208
|
}
|
|
1243
1209
|
|
|
1244
1210
|
/// @param[in-out] items
|
|
1245
1211
|
/// Items to exchange, converting between **striped** and **blocked** arrangements.
|
|
1246
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(T (&items)[
|
|
1212
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(T (&items)[ItemsPerThread])
|
|
1247
1213
|
{
|
|
1248
1214
|
BlockedToStriped(items, items);
|
|
1249
1215
|
}
|
|
1250
1216
|
|
|
1251
1217
|
/// @param[in-out] items
|
|
1252
1218
|
/// Items to exchange, converting between **striped** and **blocked** arrangements.
|
|
1253
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(T (&items)[
|
|
1219
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(T (&items)[ItemsPerThread])
|
|
1254
1220
|
{
|
|
1255
1221
|
WarpStripedToBlocked(items, items);
|
|
1256
1222
|
}
|
|
1257
1223
|
|
|
1258
1224
|
/// @param[in-out] items
|
|
1259
1225
|
/// Items to exchange, converting between **striped** and **blocked** arrangements.
|
|
1260
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(T (&items)[
|
|
1226
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(T (&items)[ItemsPerThread])
|
|
1261
1227
|
{
|
|
1262
1228
|
BlockedToWarpStriped(items, items);
|
|
1263
1229
|
}
|
|
@@ -1268,7 +1234,7 @@ public:
|
|
|
1268
1234
|
/// @param[in] ranks
|
|
1269
1235
|
/// Corresponding scatter ranks
|
|
1270
1236
|
template <typename OffsetT>
|
|
1271
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(T (&items)[
|
|
1237
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(T (&items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
|
|
1272
1238
|
{
|
|
1273
1239
|
ScatterToBlocked(items, items, ranks);
|
|
1274
1240
|
}
|
|
@@ -1278,7 +1244,7 @@ public:
|
|
|
1278
1244
|
/// @param[in] ranks
|
|
1279
1245
|
/// Corresponding scatter ranks
|
|
1280
1246
|
template <typename OffsetT>
|
|
1281
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(T (&items)[
|
|
1247
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(T (&items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
|
|
1282
1248
|
{
|
|
1283
1249
|
ScatterToStriped(items, items, ranks);
|
|
1284
1250
|
}
|
|
@@ -1289,7 +1255,7 @@ public:
|
|
|
1289
1255
|
/// Corresponding scatter ranks
|
|
1290
1256
|
template <typename OffsetT>
|
|
1291
1257
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1292
|
-
ScatterToStripedGuarded(T (&items)[
|
|
1258
|
+
ScatterToStripedGuarded(T (&items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
|
|
1293
1259
|
{
|
|
1294
1260
|
ScatterToStripedGuarded(items, items, ranks);
|
|
1295
1261
|
}
|
|
@@ -1302,7 +1268,7 @@ public:
|
|
|
1302
1268
|
/// Corresponding flag denoting item validity
|
|
1303
1269
|
template <typename OffsetT, typename ValidFlag>
|
|
1304
1270
|
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged(
|
|
1305
|
-
T (&items)[
|
|
1271
|
+
T (&items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread], ValidFlag (&is_valid)[ItemsPerThread])
|
|
1306
1272
|
{
|
|
1307
1273
|
ScatterToStripedFlagged(items, items, ranks, is_valid);
|
|
1308
1274
|
}
|