cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
- cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
- cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
- cuda/cccl/headers/include/cub/config.cuh +2 -26
- cuda/cccl/headers/include/cub/cub.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
- cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
- cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
- cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
- cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
- cuda/cccl/headers/include/cub/util_device.cuh +18 -59
- cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
- cuda/cccl/headers/include/cub/util_math.cuh +2 -28
- cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
- cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
- cuda/cccl/headers/include/cub/util_type.cuh +5 -32
- cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
- cuda/cccl/headers/include/cub/version.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
- cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
- cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
- cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
- cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
- cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
- cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
- cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
- cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
- cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
- cuda/cccl/headers/include/cuda/barrier +42 -16
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/memory_resource +6 -1
- cuda/cccl/headers/include/cuda/numeric +2 -0
- cuda/cccl/headers/include/cuda/pipeline +3 -2
- cuda/cccl/headers/include/cuda/ptx +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
- cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
- cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
- cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
- cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
- cuda/cccl/headers/include/cuda/std/__new_ +1 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
- cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
- cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
- cuda/cccl/headers/include/cuda/std/__random_ +2 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
- cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
- cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
- cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
- cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
- cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
- cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
- cuda/cccl/headers/include/cuda/std/array +1 -1
- cuda/cccl/headers/include/cuda/std/atomic +1 -1
- cuda/cccl/headers/include/cuda/std/bitset +2 -10
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
- cuda/cccl/headers/include/cuda/std/functional +1 -1
- cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
- cuda/cccl/headers/include/cuda/std/iterator +1 -1
- cuda/cccl/headers/include/cuda/std/numbers +0 -2
- cuda/cccl/headers/include/cuda/std/ratio +2 -2
- cuda/cccl/headers/include/cuda/std/span +2 -2
- cuda/cccl/headers/include/cuda/std/string_view +24 -42
- cuda/cccl/headers/include/cuda/std/tuple +18 -1
- cuda/cccl/headers/include/cuda/std/type_traits +0 -1
- cuda/cccl/headers/include/cuda/std/variant +8 -1
- cuda/cccl/headers/include/nv/target +2 -6
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
- cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
- cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
- cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
- cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
- cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
- cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
- cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
- cuda/cccl/headers/include/thrust/functional.h +0 -2
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/new.h +0 -2
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
- cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
- cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
- cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
- cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
- cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
- cuda/cccl/headers/include/thrust/transform.h +14 -3
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
- cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
- cuda/cccl/headers/include/thrust/zip_function.h +2 -28
- cuda/compute/__init__.py +4 -0
- cuda/compute/_bindings.pyi +26 -3
- cuda/compute/_bindings_impl.pyx +143 -1
- cuda/compute/algorithms/__init__.py +9 -5
- cuda/compute/algorithms/_sort/__init__.py +23 -0
- cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
- cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
- cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
- cuda/compute/algorithms/_sort/_sort_common.py +52 -0
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda_cccl-0.3.4.dist-info/METADATA +78 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
- cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
- cuda_cccl-0.3.2.dist-info/METADATA +0 -42
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,30 +1,6 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
*
|
|
5
|
-
* Redistribution and use in source and binary forms, with or without
|
|
6
|
-
* modification, are permitted provided that the following conditions are met:
|
|
7
|
-
* * Redistributions of source code must retain the above copyright
|
|
8
|
-
* notice, this list of conditions and the following disclaimer.
|
|
9
|
-
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
-
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
-
* documentation and/or other materials provided with the distribution.
|
|
12
|
-
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
-
* names of its contributors may be used to endorse or promote products
|
|
14
|
-
* derived from this software without specific prior written permission.
|
|
15
|
-
*
|
|
16
|
-
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
-
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
-
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
-
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
-
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
-
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
-
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
-
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
-
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
-
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
-
*
|
|
27
|
-
******************************************************************************/
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
2
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
// SPDX-License-Identifier: BSD-3
|
|
28
4
|
|
|
29
5
|
//! @file
|
|
30
6
|
//! block_load.cuh Operations for reading linear tiles of data into the CUDA thread block.
|
|
@@ -46,6 +22,8 @@
|
|
|
46
22
|
#include <cub/util_ptx.cuh>
|
|
47
23
|
#include <cub/util_type.cuh>
|
|
48
24
|
|
|
25
|
+
#include <cuda/std/__new/device_new.h>
|
|
26
|
+
|
|
49
27
|
CUB_NAMESPACE_BEGIN
|
|
50
28
|
|
|
51
29
|
//! @name Blocked arrangement I/O (direct)
|
|
@@ -61,7 +39,7 @@ CUB_NAMESPACE_BEGIN
|
|
|
61
39
|
//! @tparam T
|
|
62
40
|
//! **[inferred]** The data type to load.
|
|
63
41
|
//!
|
|
64
|
-
//! @tparam
|
|
42
|
+
//! @tparam ItemsPerThread
|
|
65
43
|
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
66
44
|
//!
|
|
67
45
|
//! @tparam RandomAccessIterator
|
|
@@ -76,15 +54,15 @@ CUB_NAMESPACE_BEGIN
|
|
|
76
54
|
//!
|
|
77
55
|
//! @param[out] dst_items
|
|
78
56
|
//! Destination to load data into
|
|
79
|
-
template <typename T, int
|
|
57
|
+
template <typename T, int ItemsPerThread, typename RandomAccessIterator>
|
|
80
58
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
81
|
-
LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[
|
|
59
|
+
LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
82
60
|
{
|
|
83
61
|
// Load directly in thread-blocked order
|
|
84
62
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
85
|
-
for (int i = 0; i <
|
|
63
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
86
64
|
{
|
|
87
|
-
dst_items[i] = block_src_it[linear_tid *
|
|
65
|
+
dst_items[i] = block_src_it[linear_tid * ItemsPerThread + i];
|
|
88
66
|
}
|
|
89
67
|
}
|
|
90
68
|
|
|
@@ -98,7 +76,7 @@ LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_ite
|
|
|
98
76
|
//! @tparam T
|
|
99
77
|
//! **[inferred]** The data type to load.
|
|
100
78
|
//!
|
|
101
|
-
//! @tparam
|
|
79
|
+
//! @tparam ItemsPerThread
|
|
102
80
|
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
103
81
|
//!
|
|
104
82
|
//! @tparam RandomAccessIterator
|
|
@@ -116,14 +94,14 @@ LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_ite
|
|
|
116
94
|
//!
|
|
117
95
|
//! @param[in] block_items_end
|
|
118
96
|
//! First out-of-bounds index when loading from block_src_it
|
|
119
|
-
template <typename T, int
|
|
97
|
+
template <typename T, int ItemsPerThread, typename RandomAccessIterator>
|
|
120
98
|
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
|
|
121
|
-
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[
|
|
99
|
+
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
122
100
|
{
|
|
123
101
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
124
|
-
for (int i = 0; i <
|
|
102
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
125
103
|
{
|
|
126
|
-
const auto src_pos = linear_tid *
|
|
104
|
+
const auto src_pos = linear_tid * ItemsPerThread + i;
|
|
127
105
|
if (src_pos < block_items_end)
|
|
128
106
|
{
|
|
129
107
|
dst_items[i] = block_src_it[src_pos];
|
|
@@ -142,7 +120,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
|
|
|
142
120
|
//! @tparam T
|
|
143
121
|
//! **[inferred]** The data type to load.
|
|
144
122
|
//!
|
|
145
|
-
//! @tparam
|
|
123
|
+
//! @tparam ItemsPerThread
|
|
146
124
|
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
147
125
|
//!
|
|
148
126
|
//! @tparam RandomAccessIterator
|
|
@@ -163,16 +141,16 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
|
|
|
163
141
|
//!
|
|
164
142
|
//! @param[in] oob_default
|
|
165
143
|
//! Default value to assign out-of-bound items
|
|
166
|
-
template <typename T, typename DefaultT, int
|
|
144
|
+
template <typename T, typename DefaultT, int ItemsPerThread, typename RandomAccessIterator>
|
|
167
145
|
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
|
|
168
146
|
int linear_tid,
|
|
169
147
|
RandomAccessIterator block_src_it,
|
|
170
|
-
T (&dst_items)[
|
|
148
|
+
T (&dst_items)[ItemsPerThread],
|
|
171
149
|
int block_items_end,
|
|
172
150
|
DefaultT oob_default)
|
|
173
151
|
{
|
|
174
152
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
175
|
-
for (int i = 0; i <
|
|
153
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
176
154
|
{
|
|
177
155
|
dst_items[i] = oob_default;
|
|
178
156
|
}
|
|
@@ -193,9 +171,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
|
|
|
193
171
|
//!
|
|
194
172
|
//! @param[out] dst_items
|
|
195
173
|
//! Destination to load data into
|
|
196
|
-
template <CacheLoadModifier MODIFIER, typename T, int
|
|
174
|
+
template <CacheLoadModifier MODIFIER, typename T, int ItemsPerThread>
|
|
197
175
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
198
|
-
InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[
|
|
176
|
+
InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ItemsPerThread])
|
|
199
177
|
{
|
|
200
178
|
// Find biggest memory access word that T is a whole multiple of
|
|
201
179
|
using device_word_t = typename UnitWord<T>::DeviceWord;
|
|
@@ -226,7 +204,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
|
|
|
226
204
|
|
|
227
205
|
// Copy to destination
|
|
228
206
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
229
|
-
for (int i = 0; i <
|
|
207
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
230
208
|
{
|
|
231
209
|
dst_items[i] = *(reinterpret_cast<T*>(vec_items) + i);
|
|
232
210
|
}
|
|
@@ -248,7 +226,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
|
|
|
248
226
|
//!
|
|
249
227
|
//! The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
|
|
250
228
|
//!
|
|
251
|
-
//! - ``
|
|
229
|
+
//! - ``ItemsPerThread`` is odd
|
|
252
230
|
//! - The data type ``T`` is not a built-in primitive or CUDA vector type
|
|
253
231
|
//! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
|
|
254
232
|
//!
|
|
@@ -257,7 +235,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
|
|
|
257
235
|
//! @tparam T
|
|
258
236
|
//! **[inferred]** The data type to load.
|
|
259
237
|
//!
|
|
260
|
-
//! @tparam
|
|
238
|
+
//! @tparam ItemsPerThread
|
|
261
239
|
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
262
240
|
//!
|
|
263
241
|
//! @param[in] linear_tid
|
|
@@ -269,9 +247,9 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
|
|
|
269
247
|
//!
|
|
270
248
|
//! @param[out] dst_items
|
|
271
249
|
//! destination to load data into
|
|
272
|
-
template <typename T, int
|
|
250
|
+
template <typename T, int ItemsPerThread>
|
|
273
251
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
274
|
-
LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[
|
|
252
|
+
LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ItemsPerThread])
|
|
275
253
|
{
|
|
276
254
|
InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_src_ptr, dst_items);
|
|
277
255
|
}
|
|
@@ -287,13 +265,13 @@ LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITE
|
|
|
287
265
|
//!
|
|
288
266
|
//! @endrst
|
|
289
267
|
//!
|
|
290
|
-
//! @tparam
|
|
268
|
+
//! @tparam BlockThreads
|
|
291
269
|
//! The thread block size in threads
|
|
292
270
|
//!
|
|
293
271
|
//! @tparam T
|
|
294
272
|
//! **[inferred]** The data type to load.
|
|
295
273
|
//!
|
|
296
|
-
//! @tparam
|
|
274
|
+
//! @tparam ItemsPerThread
|
|
297
275
|
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
298
276
|
//!
|
|
299
277
|
//! @tparam RandomAccessIterator
|
|
@@ -308,27 +286,27 @@ LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITE
|
|
|
308
286
|
//!
|
|
309
287
|
//! @param[out] dst_items
|
|
310
288
|
//! Destination to load data into
|
|
311
|
-
template <int
|
|
289
|
+
template <int BlockThreads, typename T, int ItemsPerThread, typename RandomAccessIterator>
|
|
312
290
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
313
|
-
LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[
|
|
291
|
+
LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
314
292
|
{
|
|
315
293
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
316
|
-
for (int i = 0; i <
|
|
294
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
317
295
|
{
|
|
318
|
-
dst_items[i] = block_src_it[linear_tid + i *
|
|
296
|
+
dst_items[i] = block_src_it[linear_tid + i * BlockThreads];
|
|
319
297
|
}
|
|
320
298
|
}
|
|
321
299
|
|
|
322
300
|
namespace detail
|
|
323
301
|
{
|
|
324
|
-
template <int
|
|
302
|
+
template <int BlockThreads, typename T, int ItemsPerThread, typename RandomAccessIterator, typename TransformOpT>
|
|
325
303
|
_CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
|
|
326
|
-
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[
|
|
304
|
+
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], TransformOpT transform_op)
|
|
327
305
|
{
|
|
328
306
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
329
|
-
for (int i = 0; i <
|
|
307
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
330
308
|
{
|
|
331
|
-
dst_items[i] = transform_op(block_src_it[linear_tid + i *
|
|
309
|
+
dst_items[i] = transform_op(block_src_it[linear_tid + i * BlockThreads]);
|
|
332
310
|
}
|
|
333
311
|
}
|
|
334
312
|
} // namespace detail
|
|
@@ -340,13 +318,13 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
|
|
|
340
318
|
//!
|
|
341
319
|
//! @endrst
|
|
342
320
|
//!
|
|
343
|
-
//! @tparam
|
|
321
|
+
//! @tparam BlockThreads
|
|
344
322
|
//! The thread block size in threads
|
|
345
323
|
//!
|
|
346
324
|
//! @tparam T
|
|
347
325
|
//! **inferred** The data type to load.
|
|
348
326
|
//!
|
|
349
|
-
//! @tparam
|
|
327
|
+
//! @tparam ItemsPerThread
|
|
350
328
|
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
351
329
|
//!
|
|
352
330
|
//! @tparam RandomAccessIterator
|
|
@@ -364,14 +342,14 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
|
|
|
364
342
|
//!
|
|
365
343
|
//! @param[in] block_items_end
|
|
366
344
|
//! Number of valid items to load
|
|
367
|
-
template <int
|
|
345
|
+
template <int BlockThreads, typename T, int ItemsPerThread, typename RandomAccessIterator>
|
|
368
346
|
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
|
|
369
|
-
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[
|
|
347
|
+
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
370
348
|
{
|
|
371
349
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
372
|
-
for (int i = 0; i <
|
|
350
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
373
351
|
{
|
|
374
|
-
const auto src_pos = linear_tid + i *
|
|
352
|
+
const auto src_pos = linear_tid + i * BlockThreads;
|
|
375
353
|
if (src_pos < block_items_end)
|
|
376
354
|
{
|
|
377
355
|
dst_items[i] = block_src_it[src_pos];
|
|
@@ -387,13 +365,13 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
|
|
|
387
365
|
//!
|
|
388
366
|
//! @endrst
|
|
389
367
|
//!
|
|
390
|
-
//! @tparam
|
|
368
|
+
//! @tparam BlockThreads
|
|
391
369
|
//! The thread block size in threads
|
|
392
370
|
//!
|
|
393
371
|
//! @tparam T
|
|
394
372
|
//! **inferred** The data type to load.
|
|
395
373
|
//!
|
|
396
|
-
//! @tparam
|
|
374
|
+
//! @tparam ItemsPerThread
|
|
397
375
|
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
398
376
|
//!
|
|
399
377
|
//! @tparam RandomAccessIterator
|
|
@@ -414,21 +392,21 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
|
|
|
414
392
|
//!
|
|
415
393
|
//! @param[in] oob_default
|
|
416
394
|
//! Default value to assign out-of-bound items
|
|
417
|
-
template <int
|
|
395
|
+
template <int BlockThreads, typename T, typename DefaultT, int ItemsPerThread, typename RandomAccessIterator>
|
|
418
396
|
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
|
|
419
397
|
int linear_tid,
|
|
420
398
|
RandomAccessIterator block_src_it,
|
|
421
|
-
T (&dst_items)[
|
|
399
|
+
T (&dst_items)[ItemsPerThread],
|
|
422
400
|
int block_items_end,
|
|
423
401
|
DefaultT oob_default)
|
|
424
402
|
{
|
|
425
403
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
426
|
-
for (int i = 0; i <
|
|
404
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
427
405
|
{
|
|
428
406
|
dst_items[i] = oob_default;
|
|
429
407
|
}
|
|
430
408
|
|
|
431
|
-
LoadDirectStriped<
|
|
409
|
+
LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end);
|
|
432
410
|
}
|
|
433
411
|
|
|
434
412
|
//! @} end member group
|
|
@@ -450,7 +428,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
|
|
|
450
428
|
//! @tparam T
|
|
451
429
|
//! **inferred** The data type to load.
|
|
452
430
|
//!
|
|
453
|
-
//! @tparam
|
|
431
|
+
//! @tparam ItemsPerThread
|
|
454
432
|
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
455
433
|
//!
|
|
456
434
|
//! @tparam RandomAccessIterator
|
|
@@ -465,17 +443,17 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
|
|
|
465
443
|
//!
|
|
466
444
|
//! @param[out] dst_items
|
|
467
445
|
//! Destination to load data into
|
|
468
|
-
template <typename T, int
|
|
446
|
+
template <typename T, int ItemsPerThread, typename RandomAccessIterator>
|
|
469
447
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
470
|
-
LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[
|
|
448
|
+
LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
471
449
|
{
|
|
472
450
|
const int tid = linear_tid & (detail::warp_threads - 1);
|
|
473
451
|
const int wid = linear_tid >> detail::log2_warp_threads;
|
|
474
|
-
const int warp_offset = wid * detail::warp_threads *
|
|
452
|
+
const int warp_offset = wid * detail::warp_threads * ItemsPerThread;
|
|
475
453
|
|
|
476
454
|
// Load directly in warp-striped order
|
|
477
455
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
478
|
-
for (int i = 0; i <
|
|
456
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
479
457
|
{
|
|
480
458
|
new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * detail::warp_threads)]);
|
|
481
459
|
}
|
|
@@ -496,7 +474,7 @@ LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst
|
|
|
496
474
|
//! @tparam T
|
|
497
475
|
//! **inferred** The data type to load.
|
|
498
476
|
//!
|
|
499
|
-
//! @tparam
|
|
477
|
+
//! @tparam ItemsPerThread
|
|
500
478
|
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
501
479
|
//!
|
|
502
480
|
//! @tparam RandomAccessIterator
|
|
@@ -514,17 +492,17 @@ LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst
|
|
|
514
492
|
//!
|
|
515
493
|
//! @param[in] block_items_end
|
|
516
494
|
//! Number of valid items to load
|
|
517
|
-
template <typename T, int
|
|
495
|
+
template <typename T, int ItemsPerThread, typename RandomAccessIterator>
|
|
518
496
|
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
|
|
519
|
-
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[
|
|
497
|
+
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
520
498
|
{
|
|
521
499
|
const int tid = linear_tid & (detail::warp_threads - 1);
|
|
522
500
|
const int wid = linear_tid >> detail::log2_warp_threads;
|
|
523
|
-
const int warp_offset = wid * detail::warp_threads *
|
|
501
|
+
const int warp_offset = wid * detail::warp_threads * ItemsPerThread;
|
|
524
502
|
|
|
525
503
|
// Load directly in warp-striped order
|
|
526
504
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
527
|
-
for (int i = 0; i <
|
|
505
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
528
506
|
{
|
|
529
507
|
const auto src_pos = warp_offset + tid + (i * detail::warp_threads);
|
|
530
508
|
if (src_pos < block_items_end)
|
|
@@ -550,7 +528,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
|
|
|
550
528
|
//! @tparam T
|
|
551
529
|
//! **inferred** The data type to load.
|
|
552
530
|
//!
|
|
553
|
-
//! @tparam
|
|
531
|
+
//! @tparam ItemsPerThread
|
|
554
532
|
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
555
533
|
//!
|
|
556
534
|
//! @tparam RandomAccessIterator
|
|
@@ -571,17 +549,17 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
|
|
|
571
549
|
//!
|
|
572
550
|
//! @param[in] oob_default
|
|
573
551
|
//! Default value to assign out-of-bound items
|
|
574
|
-
template <typename T, typename DefaultT, int
|
|
552
|
+
template <typename T, typename DefaultT, int ItemsPerThread, typename RandomAccessIterator>
|
|
575
553
|
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
|
|
576
554
|
int linear_tid,
|
|
577
555
|
RandomAccessIterator block_src_it,
|
|
578
|
-
T (&dst_items)[
|
|
556
|
+
T (&dst_items)[ItemsPerThread],
|
|
579
557
|
int block_items_end,
|
|
580
558
|
DefaultT oob_default)
|
|
581
559
|
{
|
|
582
560
|
// Load directly in warp-striped order
|
|
583
561
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
584
|
-
for (int i = 0; i <
|
|
562
|
+
for (int i = 0; i < ItemsPerThread; i++)
|
|
585
563
|
{
|
|
586
564
|
dst_items[i] = oob_default;
|
|
587
565
|
}
|
|
@@ -629,7 +607,7 @@ enum BlockLoadAlgorithm
|
|
|
629
607
|
//!
|
|
630
608
|
//! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read from memory using CUDA's built-in
|
|
631
609
|
//! vectorized loads as a coalescing optimization. For example, ``ld.global.v4.s32`` instructions will be generated
|
|
632
|
-
//! when ``T = int`` and ``
|
|
610
|
+
//! when ``T = int`` and ``ItemsPerThread % 4 == 0``.
|
|
633
611
|
//!
|
|
634
612
|
//! Performance Considerations
|
|
635
613
|
//! ++++++++++++++++++++++++++
|
|
@@ -639,7 +617,7 @@ enum BlockLoadAlgorithm
|
|
|
639
617
|
//! is lower).
|
|
640
618
|
//! - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
|
|
641
619
|
//!
|
|
642
|
-
//! - ``
|
|
620
|
+
//! - ``ItemsPerThread`` is odd
|
|
643
621
|
//! - The ``RandomAccessIterator`` is not a simple pointer type
|
|
644
622
|
//! - The block input offset is not quadword-aligned
|
|
645
623
|
//! - The data type ``T`` is not a built-in primitive or CUDA vector type
|
|
@@ -675,7 +653,7 @@ enum BlockLoadAlgorithm
|
|
|
675
653
|
//! Usage Considerations
|
|
676
654
|
//! ++++++++++++++++++++++++++
|
|
677
655
|
//!
|
|
678
|
-
//! -
|
|
656
|
+
//! - BlockThreads must be a multiple of WARP_THREADS
|
|
679
657
|
//!
|
|
680
658
|
//! Performance Considerations
|
|
681
659
|
//! ++++++++++++++++++++++++++
|
|
@@ -701,7 +679,7 @@ enum BlockLoadAlgorithm
|
|
|
701
679
|
//! Usage Considerations
|
|
702
680
|
//! ++++++++++++++++++++++++++
|
|
703
681
|
//!
|
|
704
|
-
//! -
|
|
682
|
+
//! - BlockThreads must be a multiple of WARP_THREADS
|
|
705
683
|
//!
|
|
706
684
|
//! Performance Considerations
|
|
707
685
|
//! ++++++++++++++++++++++++++
|
|
@@ -791,7 +769,7 @@ enum BlockLoadAlgorithm
|
|
|
791
769
|
//! @tparam BLOCK_DIM_X
|
|
792
770
|
//! The thread block length in threads along the X dimension
|
|
793
771
|
//!
|
|
794
|
-
//! @tparam
|
|
772
|
+
//! @tparam ItemsPerThread
|
|
795
773
|
//! The number of consecutive items partitioned onto each thread.
|
|
796
774
|
//!
|
|
797
775
|
//! @tparam ALGORITHM
|
|
@@ -804,20 +782,20 @@ enum BlockLoadAlgorithm
|
|
|
804
782
|
//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
|
|
805
783
|
//!
|
|
806
784
|
template <typename T,
|
|
807
|
-
int
|
|
808
|
-
int
|
|
809
|
-
BlockLoadAlgorithm
|
|
810
|
-
int
|
|
811
|
-
int
|
|
785
|
+
int BlockDimX,
|
|
786
|
+
int ItemsPerThread,
|
|
787
|
+
BlockLoadAlgorithm Algorithm = BLOCK_LOAD_DIRECT,
|
|
788
|
+
int BlockDimY = 1,
|
|
789
|
+
int BlockDimZ = 1>
|
|
812
790
|
class BlockLoad
|
|
813
791
|
{
|
|
814
|
-
static constexpr int
|
|
792
|
+
static constexpr int BlockThreads = BlockDimX * BlockDimY * BlockDimZ; // total threads in the block
|
|
815
793
|
|
|
816
|
-
template <BlockLoadAlgorithm _POLICY, int
|
|
794
|
+
template <BlockLoadAlgorithm _POLICY, int Dummy>
|
|
817
795
|
struct LoadInternal; // helper to dispatch the load algorithm
|
|
818
796
|
|
|
819
|
-
template <int
|
|
820
|
-
struct LoadInternal<BLOCK_LOAD_DIRECT,
|
|
797
|
+
template <int Dummy>
|
|
798
|
+
struct LoadInternal<BLOCK_LOAD_DIRECT, Dummy>
|
|
821
799
|
{
|
|
822
800
|
using TempStorage = NullType;
|
|
823
801
|
int linear_tid;
|
|
@@ -827,28 +805,28 @@ class BlockLoad
|
|
|
827
805
|
{}
|
|
828
806
|
|
|
829
807
|
template <typename RandomAccessIterator>
|
|
830
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
808
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
831
809
|
{
|
|
832
810
|
LoadDirectBlocked(linear_tid, block_src_it, dst_items);
|
|
833
811
|
}
|
|
834
812
|
|
|
835
813
|
template <typename RandomAccessIterator>
|
|
836
814
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
837
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
815
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
838
816
|
{
|
|
839
817
|
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
|
|
840
818
|
}
|
|
841
819
|
|
|
842
820
|
template <typename RandomAccessIterator, typename DefaultT>
|
|
843
821
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
844
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
822
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
|
|
845
823
|
{
|
|
846
824
|
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
847
825
|
}
|
|
848
826
|
};
|
|
849
827
|
|
|
850
|
-
template <int
|
|
851
|
-
struct LoadInternal<BLOCK_LOAD_STRIPED,
|
|
828
|
+
template <int Dummy>
|
|
829
|
+
struct LoadInternal<BLOCK_LOAD_STRIPED, Dummy>
|
|
852
830
|
{
|
|
853
831
|
using TempStorage = NullType;
|
|
854
832
|
int linear_tid;
|
|
@@ -858,28 +836,28 @@ class BlockLoad
|
|
|
858
836
|
{}
|
|
859
837
|
|
|
860
838
|
template <typename RandomAccessIterator>
|
|
861
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
839
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
862
840
|
{
|
|
863
|
-
LoadDirectStriped<
|
|
841
|
+
LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items);
|
|
864
842
|
}
|
|
865
843
|
|
|
866
844
|
template <typename RandomAccessIterator>
|
|
867
845
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
868
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
846
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
869
847
|
{
|
|
870
|
-
LoadDirectStriped<
|
|
848
|
+
LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end);
|
|
871
849
|
}
|
|
872
850
|
|
|
873
851
|
template <typename RandomAccessIterator, typename DefaultT>
|
|
874
852
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
875
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
853
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
|
|
876
854
|
{
|
|
877
|
-
LoadDirectStriped<
|
|
855
|
+
LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
878
856
|
}
|
|
879
857
|
};
|
|
880
858
|
|
|
881
|
-
template <int
|
|
882
|
-
struct LoadInternal<BLOCK_LOAD_VECTORIZE,
|
|
859
|
+
template <int Dummy>
|
|
860
|
+
struct LoadInternal<BLOCK_LOAD_VECTORIZE, Dummy>
|
|
883
861
|
{
|
|
884
862
|
using TempStorage = NullType;
|
|
885
863
|
int linear_tid;
|
|
@@ -889,7 +867,7 @@ class BlockLoad
|
|
|
889
867
|
{}
|
|
890
868
|
|
|
891
869
|
// attempts vectorization (pointer)
|
|
892
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[
|
|
870
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ItemsPerThread])
|
|
893
871
|
{
|
|
894
872
|
InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, dst_items);
|
|
895
873
|
}
|
|
@@ -898,14 +876,14 @@ class BlockLoad
|
|
|
898
876
|
// Otherwise, when the pointer type is 'T*', the compiler will prefer the overloaded version
|
|
899
877
|
// Load(RandomAccessIterator...) over Load(const T*...), which means it will never perform vectorized loading for
|
|
900
878
|
// pointers to non-const types.
|
|
901
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(T* block_ptr, T (&dst_items)[
|
|
879
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(T* block_ptr, T (&dst_items)[ItemsPerThread])
|
|
902
880
|
{
|
|
903
881
|
InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, dst_items);
|
|
904
882
|
}
|
|
905
883
|
|
|
906
884
|
// any other iterator, no vectorization
|
|
907
885
|
template <typename RandomAccessIterator>
|
|
908
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
886
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
909
887
|
{
|
|
910
888
|
LoadDirectBlocked(linear_tid, block_src_it, dst_items);
|
|
911
889
|
}
|
|
@@ -913,7 +891,7 @@ class BlockLoad
|
|
|
913
891
|
// attempts vectorization (cache modified iterator)
|
|
914
892
|
template <CacheLoadModifier MODIFIER, typename ValueType, typename OffsetT>
|
|
915
893
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
916
|
-
Load(CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT> block_src_it, T (&dst_items)[
|
|
894
|
+
Load(CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT> block_src_it, T (&dst_items)[ItemsPerThread])
|
|
917
895
|
{
|
|
918
896
|
InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_src_it.ptr, dst_items);
|
|
919
897
|
}
|
|
@@ -921,7 +899,7 @@ class BlockLoad
|
|
|
921
899
|
// skips vectorization
|
|
922
900
|
template <typename RandomAccessIterator>
|
|
923
901
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
924
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
902
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
925
903
|
{
|
|
926
904
|
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
|
|
927
905
|
}
|
|
@@ -929,16 +907,16 @@ class BlockLoad
|
|
|
929
907
|
// skips vectorization
|
|
930
908
|
template <typename RandomAccessIterator, typename DefaultT>
|
|
931
909
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
932
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
910
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
|
|
933
911
|
{
|
|
934
912
|
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
935
913
|
}
|
|
936
914
|
};
|
|
937
915
|
|
|
938
|
-
template <int
|
|
939
|
-
struct LoadInternal<BLOCK_LOAD_TRANSPOSE,
|
|
916
|
+
template <int Dummy>
|
|
917
|
+
struct LoadInternal<BLOCK_LOAD_TRANSPOSE, Dummy>
|
|
940
918
|
{
|
|
941
|
-
using BlockExchange = BlockExchange<T,
|
|
919
|
+
using BlockExchange = BlockExchange<T, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
|
|
942
920
|
using _TempStorage = typename BlockExchange::TempStorage;
|
|
943
921
|
using TempStorage = Uninitialized<_TempStorage>;
|
|
944
922
|
|
|
@@ -951,36 +929,36 @@ class BlockLoad
|
|
|
951
929
|
{}
|
|
952
930
|
|
|
953
931
|
template <typename RandomAccessIterator>
|
|
954
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
932
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
955
933
|
{
|
|
956
|
-
LoadDirectStriped<
|
|
934
|
+
LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items);
|
|
957
935
|
BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
|
|
958
936
|
}
|
|
959
937
|
|
|
960
938
|
template <typename RandomAccessIterator>
|
|
961
939
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
962
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
940
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
963
941
|
{
|
|
964
|
-
LoadDirectStriped<
|
|
942
|
+
LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end);
|
|
965
943
|
BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
|
|
966
944
|
}
|
|
967
945
|
|
|
968
946
|
template <typename RandomAccessIterator, typename DefaultT>
|
|
969
947
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
970
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
948
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
|
|
971
949
|
{
|
|
972
|
-
LoadDirectStriped<
|
|
950
|
+
LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
973
951
|
BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
|
|
974
952
|
}
|
|
975
953
|
};
|
|
976
954
|
|
|
977
|
-
template <int
|
|
978
|
-
struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE,
|
|
955
|
+
template <int Dummy>
|
|
956
|
+
struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, Dummy>
|
|
979
957
|
{
|
|
980
958
|
static constexpr int WARP_THREADS = detail::warp_threads;
|
|
981
|
-
static_assert(
|
|
959
|
+
static_assert(BlockThreads % WARP_THREADS == 0, "BlockThreads must be a multiple of WARP_THREADS");
|
|
982
960
|
|
|
983
|
-
using BlockExchange = BlockExchange<T,
|
|
961
|
+
using BlockExchange = BlockExchange<T, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
|
|
984
962
|
using _TempStorage = typename BlockExchange::TempStorage;
|
|
985
963
|
using TempStorage = Uninitialized<_TempStorage>;
|
|
986
964
|
|
|
@@ -993,7 +971,7 @@ class BlockLoad
|
|
|
993
971
|
{}
|
|
994
972
|
|
|
995
973
|
template <typename RandomAccessIterator>
|
|
996
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
974
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
997
975
|
{
|
|
998
976
|
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items);
|
|
999
977
|
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
@@ -1001,7 +979,7 @@ class BlockLoad
|
|
|
1001
979
|
|
|
1002
980
|
template <typename RandomAccessIterator>
|
|
1003
981
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1004
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
982
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
1005
983
|
{
|
|
1006
984
|
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
|
|
1007
985
|
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
@@ -1009,20 +987,20 @@ class BlockLoad
|
|
|
1009
987
|
|
|
1010
988
|
template <typename RandomAccessIterator, typename DefaultT>
|
|
1011
989
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1012
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
990
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
|
|
1013
991
|
{
|
|
1014
992
|
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
1015
993
|
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
1016
994
|
}
|
|
1017
995
|
};
|
|
1018
996
|
|
|
1019
|
-
template <int
|
|
1020
|
-
struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
|
|
997
|
+
template <int Dummy>
|
|
998
|
+
struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, Dummy>
|
|
1021
999
|
{
|
|
1022
1000
|
static constexpr int WARP_THREADS = detail::warp_threads;
|
|
1023
|
-
static_assert(
|
|
1001
|
+
static_assert(BlockThreads % WARP_THREADS == 0, "BlockThreads must be a multiple of WARP_THREADS");
|
|
1024
1002
|
|
|
1025
|
-
using BlockExchange = BlockExchange<T,
|
|
1003
|
+
using BlockExchange = BlockExchange<T, BlockDimX, ItemsPerThread, true, BlockDimY, BlockDimZ>;
|
|
1026
1004
|
using _TempStorage = typename BlockExchange::TempStorage;
|
|
1027
1005
|
using TempStorage = Uninitialized<_TempStorage>;
|
|
1028
1006
|
|
|
@@ -1035,7 +1013,7 @@ class BlockLoad
|
|
|
1035
1013
|
{}
|
|
1036
1014
|
|
|
1037
1015
|
template <typename RandomAccessIterator>
|
|
1038
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
1016
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
1039
1017
|
{
|
|
1040
1018
|
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items);
|
|
1041
1019
|
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
@@ -1043,7 +1021,7 @@ class BlockLoad
|
|
|
1043
1021
|
|
|
1044
1022
|
template <typename RandomAccessIterator>
|
|
1045
1023
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1046
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
1024
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
1047
1025
|
{
|
|
1048
1026
|
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
|
|
1049
1027
|
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
@@ -1051,14 +1029,14 @@ class BlockLoad
|
|
|
1051
1029
|
|
|
1052
1030
|
template <typename RandomAccessIterator, typename DefaultT>
|
|
1053
1031
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1054
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
1032
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
|
|
1055
1033
|
{
|
|
1056
1034
|
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
1057
1035
|
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
1058
1036
|
}
|
|
1059
1037
|
};
|
|
1060
1038
|
|
|
1061
|
-
using InternalLoad = LoadInternal<
|
|
1039
|
+
using InternalLoad = LoadInternal<Algorithm, 0>; // load implementation to use
|
|
1062
1040
|
using _TempStorage = typename InternalLoad::TempStorage;
|
|
1063
1041
|
|
|
1064
1042
|
// Internal storage allocator
|
|
@@ -1081,14 +1059,14 @@ public:
|
|
|
1081
1059
|
/// @brief Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
1082
1060
|
_CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad()
|
|
1083
1061
|
: temp_storage(PrivateStorage())
|
|
1084
|
-
, linear_tid(RowMajorTid(
|
|
1062
|
+
, linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ))
|
|
1085
1063
|
{}
|
|
1086
1064
|
|
|
1087
1065
|
/// @brief Collective constructor using the specified memory allocation as temporary storage.
|
|
1088
1066
|
/// @param[in] temp_storage Reference to memory allocation having layout type TempStorage
|
|
1089
1067
|
_CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage)
|
|
1090
1068
|
: temp_storage(temp_storage.Alias())
|
|
1091
|
-
, linear_tid(RowMajorTid(
|
|
1069
|
+
, linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ))
|
|
1092
1070
|
{}
|
|
1093
1071
|
|
|
1094
1072
|
//! @} end member group
|
|
@@ -1137,7 +1115,7 @@ public:
|
|
|
1137
1115
|
//! @param[out] dst_items
|
|
1138
1116
|
//! Destination to load data into
|
|
1139
1117
|
template <typename RandomAccessIterator>
|
|
1140
|
-
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
1118
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
|
|
1141
1119
|
{
|
|
1142
1120
|
InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items);
|
|
1143
1121
|
}
|
|
@@ -1191,7 +1169,7 @@ public:
|
|
|
1191
1169
|
//! Number of valid items to load
|
|
1192
1170
|
template <typename RandomAccessIterator>
|
|
1193
1171
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1194
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
1172
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
|
|
1195
1173
|
{
|
|
1196
1174
|
InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end);
|
|
1197
1175
|
}
|
|
@@ -1247,7 +1225,7 @@ public:
|
|
|
1247
1225
|
//! Default value to assign out-of-bound items
|
|
1248
1226
|
template <typename RandomAccessIterator, typename DefaultT>
|
|
1249
1227
|
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1250
|
-
Load(RandomAccessIterator block_src_it, T (&dst_items)[
|
|
1228
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
|
|
1251
1229
|
{
|
|
1252
1230
|
InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end, oob_default);
|
|
1253
1231
|
}
|