cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
- cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
- cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
- cuda/cccl/headers/include/cub/config.cuh +2 -26
- cuda/cccl/headers/include/cub/cub.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
- cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
- cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
- cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
- cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
- cuda/cccl/headers/include/cub/util_device.cuh +18 -59
- cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
- cuda/cccl/headers/include/cub/util_math.cuh +2 -28
- cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
- cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
- cuda/cccl/headers/include/cub/util_type.cuh +5 -32
- cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
- cuda/cccl/headers/include/cub/version.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
- cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
- cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
- cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
- cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
- cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
- cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
- cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
- cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
- cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
- cuda/cccl/headers/include/cuda/barrier +42 -16
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/memory_resource +6 -1
- cuda/cccl/headers/include/cuda/numeric +2 -0
- cuda/cccl/headers/include/cuda/pipeline +3 -2
- cuda/cccl/headers/include/cuda/ptx +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
- cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
- cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
- cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
- cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
- cuda/cccl/headers/include/cuda/std/__new_ +1 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
- cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
- cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
- cuda/cccl/headers/include/cuda/std/__random_ +2 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
- cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
- cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
- cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
- cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
- cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
- cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
- cuda/cccl/headers/include/cuda/std/array +1 -1
- cuda/cccl/headers/include/cuda/std/atomic +1 -1
- cuda/cccl/headers/include/cuda/std/bitset +2 -10
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
- cuda/cccl/headers/include/cuda/std/functional +1 -1
- cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
- cuda/cccl/headers/include/cuda/std/iterator +1 -1
- cuda/cccl/headers/include/cuda/std/numbers +0 -2
- cuda/cccl/headers/include/cuda/std/ratio +2 -2
- cuda/cccl/headers/include/cuda/std/span +2 -2
- cuda/cccl/headers/include/cuda/std/string_view +24 -42
- cuda/cccl/headers/include/cuda/std/tuple +18 -1
- cuda/cccl/headers/include/cuda/std/type_traits +0 -1
- cuda/cccl/headers/include/cuda/std/variant +8 -1
- cuda/cccl/headers/include/nv/target +2 -6
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
- cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
- cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
- cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
- cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
- cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
- cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
- cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
- cuda/cccl/headers/include/thrust/functional.h +0 -2
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/new.h +0 -2
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
- cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
- cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
- cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
- cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
- cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
- cuda/cccl/headers/include/thrust/transform.h +14 -3
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
- cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
- cuda/cccl/headers/include/thrust/zip_function.h +2 -28
- cuda/compute/__init__.py +4 -0
- cuda/compute/_bindings.pyi +26 -3
- cuda/compute/_bindings_impl.pyx +143 -1
- cuda/compute/algorithms/__init__.py +9 -5
- cuda/compute/algorithms/_sort/__init__.py +23 -0
- cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
- cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
- cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
- cuda/compute/algorithms/_sort/_sort_common.py +52 -0
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda_cccl-0.3.4.dist-info/METADATA +78 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
- cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
- cuda_cccl-0.3.2.dist-info/METADATA +0 -42
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -44,6 +44,10 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD
|
|
|
44
44
|
|
|
45
45
|
#if _CCCL_HAS_CUDA_COMPILER()
|
|
46
46
|
|
|
47
|
+
extern "C" _CCCL_DEVICE void __atomic_cas_128b_unsupported_before_SM_90();
|
|
48
|
+
extern "C" _CCCL_DEVICE void __atomic_exchange_128b_unsupported_before_SM_90();
|
|
49
|
+
extern "C" _CCCL_DEVICE void __atomic_ldst_128b_unsupported_before_SM_70();
|
|
50
|
+
|
|
47
51
|
static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_block_tag)
|
|
48
52
|
{ asm volatile("membar.cta;" ::: "memory"); }
|
|
49
53
|
static inline _CCCL_DEVICE void __cuda_atomic_membar(__thread_scope_device_tag)
|
|
@@ -695,131 +699,222 @@ static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
|
695
699
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
696
700
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
|
|
697
701
|
{
|
|
702
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
703
|
+
NV_DISPATCH_TARGET(
|
|
704
|
+
NV_PROVIDES_SM_70, (),
|
|
705
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
706
|
+
)
|
|
698
707
|
asm volatile(R"YYY(
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
708
|
+
{
|
|
709
|
+
.reg .b128 _d;
|
|
710
|
+
ld.acquire.cta.b128 _d,[%2];
|
|
711
|
+
mov.b128 {%0, %1}, _d;
|
|
712
|
+
}
|
|
713
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
703
714
|
}
|
|
704
715
|
template <class _Type>
|
|
705
716
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
706
717
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
|
|
707
718
|
{
|
|
719
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
720
|
+
NV_DISPATCH_TARGET(
|
|
721
|
+
NV_PROVIDES_SM_70, (),
|
|
722
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
723
|
+
)
|
|
708
724
|
asm volatile(R"YYY(
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
725
|
+
{
|
|
726
|
+
.reg .b128 _d;
|
|
727
|
+
ld.acquire.cluster.b128 _d,[%2];
|
|
728
|
+
mov.b128 {%0, %1}, _d;
|
|
729
|
+
}
|
|
730
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
713
731
|
}
|
|
714
732
|
template <class _Type>
|
|
715
733
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
716
734
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
|
|
717
735
|
{
|
|
736
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
737
|
+
NV_DISPATCH_TARGET(
|
|
738
|
+
NV_PROVIDES_SM_70, (),
|
|
739
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
740
|
+
)
|
|
718
741
|
asm volatile(R"YYY(
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
742
|
+
{
|
|
743
|
+
.reg .b128 _d;
|
|
744
|
+
ld.acquire.gpu.b128 _d,[%2];
|
|
745
|
+
mov.b128 {%0, %1}, _d;
|
|
746
|
+
}
|
|
747
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
723
748
|
}
|
|
724
749
|
template <class _Type>
|
|
725
750
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
726
751
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
|
|
727
752
|
{
|
|
753
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
754
|
+
NV_DISPATCH_TARGET(
|
|
755
|
+
NV_PROVIDES_SM_70, (),
|
|
756
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
757
|
+
)
|
|
728
758
|
asm volatile(R"YYY(
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
759
|
+
{
|
|
760
|
+
.reg .b128 _d;
|
|
761
|
+
ld.acquire.sys.b128 _d,[%2];
|
|
762
|
+
mov.b128 {%0, %1}, _d;
|
|
763
|
+
}
|
|
764
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
733
765
|
}
|
|
734
766
|
template <class _Type>
|
|
735
767
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
736
768
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
|
|
737
769
|
{
|
|
770
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
771
|
+
NV_DISPATCH_TARGET(
|
|
772
|
+
NV_PROVIDES_SM_70, (),
|
|
773
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
774
|
+
)
|
|
738
775
|
asm volatile(R"YYY(
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
776
|
+
{
|
|
777
|
+
.reg .b128 _d;
|
|
778
|
+
ld.relaxed.cta.b128 _d,[%2];
|
|
779
|
+
mov.b128 {%0, %1}, _d;
|
|
780
|
+
}
|
|
781
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
743
782
|
}
|
|
744
783
|
template <class _Type>
|
|
745
784
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
746
785
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
|
|
747
786
|
{
|
|
787
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
788
|
+
NV_DISPATCH_TARGET(
|
|
789
|
+
NV_PROVIDES_SM_70, (),
|
|
790
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
791
|
+
)
|
|
748
792
|
asm volatile(R"YYY(
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
793
|
+
{
|
|
794
|
+
.reg .b128 _d;
|
|
795
|
+
ld.relaxed.cluster.b128 _d,[%2];
|
|
796
|
+
mov.b128 {%0, %1}, _d;
|
|
797
|
+
}
|
|
798
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
753
799
|
}
|
|
754
800
|
template <class _Type>
|
|
755
801
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
756
802
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
|
|
757
803
|
{
|
|
804
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
805
|
+
NV_DISPATCH_TARGET(
|
|
806
|
+
NV_PROVIDES_SM_70, (),
|
|
807
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
808
|
+
)
|
|
758
809
|
asm volatile(R"YYY(
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
810
|
+
{
|
|
811
|
+
.reg .b128 _d;
|
|
812
|
+
ld.relaxed.gpu.b128 _d,[%2];
|
|
813
|
+
mov.b128 {%0, %1}, _d;
|
|
814
|
+
}
|
|
815
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
763
816
|
}
|
|
764
817
|
template <class _Type>
|
|
765
818
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
766
819
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
|
|
767
820
|
{
|
|
821
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
822
|
+
NV_DISPATCH_TARGET(
|
|
823
|
+
NV_PROVIDES_SM_70, (),
|
|
824
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
825
|
+
)
|
|
768
826
|
asm volatile(R"YYY(
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
827
|
+
{
|
|
828
|
+
.reg .b128 _d;
|
|
829
|
+
ld.relaxed.sys.b128 _d,[%2];
|
|
830
|
+
mov.b128 {%0, %1}, _d;
|
|
831
|
+
}
|
|
832
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
773
833
|
}
|
|
774
834
|
template <class _Type>
|
|
775
835
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
776
836
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_enable)
|
|
777
837
|
{
|
|
838
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
839
|
+
NV_DISPATCH_TARGET(
|
|
840
|
+
NV_PROVIDES_SM_70, (),
|
|
841
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
842
|
+
)
|
|
778
843
|
asm volatile(R"YYY(
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
844
|
+
{
|
|
845
|
+
.reg .b128 _d;
|
|
846
|
+
ld.mmio.relaxed.sys.b128 _d,[%2];
|
|
847
|
+
mov.b128 {%0, %1}, _d;
|
|
848
|
+
}
|
|
849
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
783
850
|
}
|
|
784
851
|
template <class _Type>
|
|
785
852
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
786
853
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
|
|
787
854
|
{
|
|
855
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
856
|
+
NV_DISPATCH_TARGET(
|
|
857
|
+
NV_PROVIDES_SM_70, (),
|
|
858
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
859
|
+
)
|
|
788
860
|
asm volatile(R"YYY(
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
861
|
+
{
|
|
862
|
+
.reg .b128 _d;
|
|
863
|
+
ld.volatile.b128 _d,[%2];
|
|
864
|
+
mov.b128 {%0, %1}, _d;
|
|
865
|
+
}
|
|
866
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
793
867
|
}
|
|
794
868
|
template <class _Type>
|
|
795
869
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
796
870
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
|
|
797
871
|
{
|
|
872
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
873
|
+
NV_DISPATCH_TARGET(
|
|
874
|
+
NV_PROVIDES_SM_70, (),
|
|
875
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
876
|
+
)
|
|
798
877
|
asm volatile(R"YYY(
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
878
|
+
{
|
|
879
|
+
.reg .b128 _d;
|
|
880
|
+
ld.volatile.b128 _d,[%2];
|
|
881
|
+
mov.b128 {%0, %1}, _d;
|
|
882
|
+
}
|
|
883
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
803
884
|
}
|
|
804
885
|
template <class _Type>
|
|
805
886
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
806
887
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
|
|
807
888
|
{
|
|
889
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
890
|
+
NV_DISPATCH_TARGET(
|
|
891
|
+
NV_PROVIDES_SM_70, (),
|
|
892
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
893
|
+
)
|
|
808
894
|
asm volatile(R"YYY(
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
895
|
+
{
|
|
896
|
+
.reg .b128 _d;
|
|
897
|
+
ld.volatile.b128 _d,[%2];
|
|
898
|
+
mov.b128 {%0, %1}, _d;
|
|
899
|
+
}
|
|
900
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
813
901
|
}
|
|
814
902
|
template <class _Type>
|
|
815
903
|
static inline _CCCL_DEVICE void __cuda_atomic_load(
|
|
816
904
|
const _Type* __ptr, _Type& __dst, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
|
|
817
905
|
{
|
|
906
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
907
|
+
NV_DISPATCH_TARGET(
|
|
908
|
+
NV_PROVIDES_SM_70, (),
|
|
909
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
910
|
+
)
|
|
818
911
|
asm volatile(R"YYY(
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
912
|
+
{
|
|
913
|
+
.reg .b128 _d;
|
|
914
|
+
ld.volatile.b128 _d,[%2];
|
|
915
|
+
mov.b128 {%0, %1}, _d;
|
|
916
|
+
}
|
|
917
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr) : "memory");
|
|
823
918
|
}
|
|
824
919
|
|
|
825
920
|
template <typename _Type, typename _Tag, typename _Sco, typename _Mmio>
|
|
@@ -1037,131 +1132,222 @@ template <class _Type>
|
|
|
1037
1132
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1038
1133
|
_Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
|
|
1039
1134
|
{
|
|
1135
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1136
|
+
NV_DISPATCH_TARGET(
|
|
1137
|
+
NV_PROVIDES_SM_70, (),
|
|
1138
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1139
|
+
)
|
|
1040
1140
|
asm volatile(R"YYY(
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1141
|
+
{
|
|
1142
|
+
.reg .b128 _v;
|
|
1143
|
+
mov.b128 _v, {%1, %2};
|
|
1144
|
+
st.release.cta.b128 [%0],_v;
|
|
1145
|
+
}
|
|
1146
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1045
1147
|
}
|
|
1046
1148
|
template <class _Type>
|
|
1047
1149
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1048
1150
|
_Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
|
|
1049
1151
|
{
|
|
1152
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1153
|
+
NV_DISPATCH_TARGET(
|
|
1154
|
+
NV_PROVIDES_SM_70, (),
|
|
1155
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1156
|
+
)
|
|
1050
1157
|
asm volatile(R"YYY(
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1158
|
+
{
|
|
1159
|
+
.reg .b128 _v;
|
|
1160
|
+
mov.b128 _v, {%1, %2};
|
|
1161
|
+
st.release.cluster.b128 [%0],_v;
|
|
1162
|
+
}
|
|
1163
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1055
1164
|
}
|
|
1056
1165
|
template <class _Type>
|
|
1057
1166
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1058
1167
|
_Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
|
|
1059
1168
|
{
|
|
1169
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1170
|
+
NV_DISPATCH_TARGET(
|
|
1171
|
+
NV_PROVIDES_SM_70, (),
|
|
1172
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1173
|
+
)
|
|
1060
1174
|
asm volatile(R"YYY(
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1175
|
+
{
|
|
1176
|
+
.reg .b128 _v;
|
|
1177
|
+
mov.b128 _v, {%1, %2};
|
|
1178
|
+
st.release.gpu.b128 [%0],_v;
|
|
1179
|
+
}
|
|
1180
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1065
1181
|
}
|
|
1066
1182
|
template <class _Type>
|
|
1067
1183
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1068
1184
|
_Type* __ptr, _Type& __val, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
|
|
1069
1185
|
{
|
|
1186
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1187
|
+
NV_DISPATCH_TARGET(
|
|
1188
|
+
NV_PROVIDES_SM_70, (),
|
|
1189
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1190
|
+
)
|
|
1070
1191
|
asm volatile(R"YYY(
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1192
|
+
{
|
|
1193
|
+
.reg .b128 _v;
|
|
1194
|
+
mov.b128 _v, {%1, %2};
|
|
1195
|
+
st.release.sys.b128 [%0],_v;
|
|
1196
|
+
}
|
|
1197
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1075
1198
|
}
|
|
1076
1199
|
template <class _Type>
|
|
1077
1200
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1078
1201
|
_Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
|
|
1079
1202
|
{
|
|
1203
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1204
|
+
NV_DISPATCH_TARGET(
|
|
1205
|
+
NV_PROVIDES_SM_70, (),
|
|
1206
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1207
|
+
)
|
|
1080
1208
|
asm volatile(R"YYY(
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1209
|
+
{
|
|
1210
|
+
.reg .b128 _v;
|
|
1211
|
+
mov.b128 _v, {%1, %2};
|
|
1212
|
+
st.relaxed.cta.b128 [%0],_v;
|
|
1213
|
+
}
|
|
1214
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1085
1215
|
}
|
|
1086
1216
|
template <class _Type>
|
|
1087
1217
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1088
1218
|
_Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
|
|
1089
1219
|
{
|
|
1220
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1221
|
+
NV_DISPATCH_TARGET(
|
|
1222
|
+
NV_PROVIDES_SM_70, (),
|
|
1223
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1224
|
+
)
|
|
1090
1225
|
asm volatile(R"YYY(
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1226
|
+
{
|
|
1227
|
+
.reg .b128 _v;
|
|
1228
|
+
mov.b128 _v, {%1, %2};
|
|
1229
|
+
st.relaxed.cluster.b128 [%0],_v;
|
|
1230
|
+
}
|
|
1231
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1095
1232
|
}
|
|
1096
1233
|
template <class _Type>
|
|
1097
1234
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1098
1235
|
_Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
|
|
1099
1236
|
{
|
|
1237
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1238
|
+
NV_DISPATCH_TARGET(
|
|
1239
|
+
NV_PROVIDES_SM_70, (),
|
|
1240
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1241
|
+
)
|
|
1100
1242
|
asm volatile(R"YYY(
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1243
|
+
{
|
|
1244
|
+
.reg .b128 _v;
|
|
1245
|
+
mov.b128 _v, {%1, %2};
|
|
1246
|
+
st.relaxed.gpu.b128 [%0],_v;
|
|
1247
|
+
}
|
|
1248
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1105
1249
|
}
|
|
1106
1250
|
template <class _Type>
|
|
1107
1251
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1108
1252
|
_Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
|
|
1109
1253
|
{
|
|
1254
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1255
|
+
NV_DISPATCH_TARGET(
|
|
1256
|
+
NV_PROVIDES_SM_70, (),
|
|
1257
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1258
|
+
)
|
|
1110
1259
|
asm volatile(R"YYY(
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1260
|
+
{
|
|
1261
|
+
.reg .b128 _v;
|
|
1262
|
+
mov.b128 _v, {%1, %2};
|
|
1263
|
+
st.relaxed.sys.b128 [%0],_v;
|
|
1264
|
+
}
|
|
1265
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1115
1266
|
}
|
|
1116
1267
|
template <class _Type>
|
|
1117
1268
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1118
1269
|
_Type* __ptr, _Type& __val, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_enable)
|
|
1119
1270
|
{
|
|
1271
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1272
|
+
NV_DISPATCH_TARGET(
|
|
1273
|
+
NV_PROVIDES_SM_70, (),
|
|
1274
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1275
|
+
)
|
|
1120
1276
|
asm volatile(R"YYY(
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1277
|
+
{
|
|
1278
|
+
.reg .b128 _v;
|
|
1279
|
+
mov.b128 _v, {%1, %2};
|
|
1280
|
+
st.mmio.relaxed.sys.b128 [%0],_v;
|
|
1281
|
+
}
|
|
1282
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1125
1283
|
}
|
|
1126
1284
|
template <class _Type>
|
|
1127
1285
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1128
1286
|
_Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag, __atomic_cuda_mmio_disable)
|
|
1129
1287
|
{
|
|
1288
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1289
|
+
NV_DISPATCH_TARGET(
|
|
1290
|
+
NV_PROVIDES_SM_70, (),
|
|
1291
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1292
|
+
)
|
|
1130
1293
|
asm volatile(R"YYY(
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1294
|
+
{
|
|
1295
|
+
.reg .b128 _v;
|
|
1296
|
+
mov.b128 _v, {%1, %2};
|
|
1297
|
+
st.volatile.b128 [%0],_v;
|
|
1298
|
+
}
|
|
1299
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1135
1300
|
}
|
|
1136
1301
|
template <class _Type>
|
|
1137
1302
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1138
1303
|
_Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag, __atomic_cuda_mmio_disable)
|
|
1139
1304
|
{
|
|
1305
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1306
|
+
NV_DISPATCH_TARGET(
|
|
1307
|
+
NV_PROVIDES_SM_70, (),
|
|
1308
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1309
|
+
)
|
|
1140
1310
|
asm volatile(R"YYY(
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1311
|
+
{
|
|
1312
|
+
.reg .b128 _v;
|
|
1313
|
+
mov.b128 _v, {%1, %2};
|
|
1314
|
+
st.volatile.b128 [%0],_v;
|
|
1315
|
+
}
|
|
1316
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1145
1317
|
}
|
|
1146
1318
|
template <class _Type>
|
|
1147
1319
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1148
1320
|
_Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag, __atomic_cuda_mmio_disable)
|
|
1149
1321
|
{
|
|
1322
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1323
|
+
NV_DISPATCH_TARGET(
|
|
1324
|
+
NV_PROVIDES_SM_70, (),
|
|
1325
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1326
|
+
)
|
|
1150
1327
|
asm volatile(R"YYY(
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1328
|
+
{
|
|
1329
|
+
.reg .b128 _v;
|
|
1330
|
+
mov.b128 _v, {%1, %2};
|
|
1331
|
+
st.volatile.b128 [%0],_v;
|
|
1332
|
+
}
|
|
1333
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1155
1334
|
}
|
|
1156
1335
|
template <class _Type>
|
|
1157
1336
|
static inline _CCCL_DEVICE void __cuda_atomic_store(
|
|
1158
1337
|
_Type* __ptr, _Type& __val, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag, __atomic_cuda_mmio_disable)
|
|
1159
1338
|
{
|
|
1339
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b ld/st is not supported until PTX ISA version 840");
|
|
1340
|
+
NV_DISPATCH_TARGET(
|
|
1341
|
+
NV_PROVIDES_SM_70, (),
|
|
1342
|
+
NV_ANY_TARGET, (__atomic_ldst_128b_unsupported_before_SM_70();)
|
|
1343
|
+
)
|
|
1160
1344
|
asm volatile(R"YYY(
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1345
|
+
{
|
|
1346
|
+
.reg .b128 _v;
|
|
1347
|
+
mov.b128 _v, {%1, %2};
|
|
1348
|
+
st.volatile.b128 [%0],_v;
|
|
1349
|
+
}
|
|
1350
|
+
)YYY" :: "l"(__ptr), "l"(__val.__x),"l"(__val.__y) : "memory");
|
|
1165
1351
|
}
|
|
1166
1352
|
|
|
1167
1353
|
template <typename _Type, typename _Tag, typename _Sco, typename _Mmio>
|
|
@@ -1391,242 +1577,382 @@ template <class _Type>
|
|
|
1391
1577
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1392
1578
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
1393
1579
|
{
|
|
1580
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1581
|
+
NV_DISPATCH_TARGET(
|
|
1582
|
+
NV_PROVIDES_SM_90, (),
|
|
1583
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1584
|
+
)
|
|
1394
1585
|
asm volatile(R"YYY(
|
|
1395
|
-
|
|
1396
|
-
.reg .b128
|
|
1397
|
-
|
|
1398
|
-
mov.b128 {%
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1586
|
+
{
|
|
1587
|
+
.reg .b128 _d;
|
|
1588
|
+
.reg .b128 _v;
|
|
1589
|
+
mov.b128 _d, {%0, %1};
|
|
1590
|
+
mov.b128 _v, {%4, %5};
|
|
1591
|
+
atom.cas.acquire.cta.b128 _d,[%2],_d,_v;
|
|
1592
|
+
mov.b128 {%0, %1}, _d;
|
|
1593
|
+
}
|
|
1594
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1402
1595
|
template <class _Type>
|
|
1403
1596
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1404
1597
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
1405
1598
|
{
|
|
1599
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1600
|
+
NV_DISPATCH_TARGET(
|
|
1601
|
+
NV_PROVIDES_SM_90, (),
|
|
1602
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1603
|
+
)
|
|
1406
1604
|
asm volatile(R"YYY(
|
|
1407
|
-
|
|
1408
|
-
.reg .b128
|
|
1409
|
-
|
|
1410
|
-
mov.b128 {%
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1605
|
+
{
|
|
1606
|
+
.reg .b128 _d;
|
|
1607
|
+
.reg .b128 _v;
|
|
1608
|
+
mov.b128 _d, {%0, %1};
|
|
1609
|
+
mov.b128 _v, {%4, %5};
|
|
1610
|
+
atom.cas.acquire.cluster.b128 _d,[%2],_d,_v;
|
|
1611
|
+
mov.b128 {%0, %1}, _d;
|
|
1612
|
+
}
|
|
1613
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1414
1614
|
template <class _Type>
|
|
1415
1615
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1416
1616
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
1417
1617
|
{
|
|
1618
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1619
|
+
NV_DISPATCH_TARGET(
|
|
1620
|
+
NV_PROVIDES_SM_90, (),
|
|
1621
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1622
|
+
)
|
|
1418
1623
|
asm volatile(R"YYY(
|
|
1419
|
-
|
|
1420
|
-
.reg .b128
|
|
1421
|
-
|
|
1422
|
-
mov.b128 {%
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1624
|
+
{
|
|
1625
|
+
.reg .b128 _d;
|
|
1626
|
+
.reg .b128 _v;
|
|
1627
|
+
mov.b128 _d, {%0, %1};
|
|
1628
|
+
mov.b128 _v, {%4, %5};
|
|
1629
|
+
atom.cas.acquire.gpu.b128 _d,[%2],_d,_v;
|
|
1630
|
+
mov.b128 {%0, %1}, _d;
|
|
1631
|
+
}
|
|
1632
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1426
1633
|
template <class _Type>
|
|
1427
1634
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1428
1635
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
1429
1636
|
{
|
|
1637
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1638
|
+
NV_DISPATCH_TARGET(
|
|
1639
|
+
NV_PROVIDES_SM_90, (),
|
|
1640
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1641
|
+
)
|
|
1430
1642
|
asm volatile(R"YYY(
|
|
1431
|
-
|
|
1432
|
-
.reg .b128
|
|
1433
|
-
|
|
1434
|
-
mov.b128 {%
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1643
|
+
{
|
|
1644
|
+
.reg .b128 _d;
|
|
1645
|
+
.reg .b128 _v;
|
|
1646
|
+
mov.b128 _d, {%0, %1};
|
|
1647
|
+
mov.b128 _v, {%4, %5};
|
|
1648
|
+
atom.cas.acquire.sys.b128 _d,[%2],_d,_v;
|
|
1649
|
+
mov.b128 {%0, %1}, _d;
|
|
1650
|
+
}
|
|
1651
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1438
1652
|
template <class _Type>
|
|
1439
1653
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1440
1654
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
1441
1655
|
{
|
|
1656
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1657
|
+
NV_DISPATCH_TARGET(
|
|
1658
|
+
NV_PROVIDES_SM_90, (),
|
|
1659
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1660
|
+
)
|
|
1442
1661
|
asm volatile(R"YYY(
|
|
1443
|
-
|
|
1444
|
-
.reg .b128
|
|
1445
|
-
|
|
1446
|
-
mov.b128 {%
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1662
|
+
{
|
|
1663
|
+
.reg .b128 _d;
|
|
1664
|
+
.reg .b128 _v;
|
|
1665
|
+
mov.b128 _d, {%0, %1};
|
|
1666
|
+
mov.b128 _v, {%4, %5};
|
|
1667
|
+
atom.cas.relaxed.cta.b128 _d,[%2],_d,_v;
|
|
1668
|
+
mov.b128 {%0, %1}, _d;
|
|
1669
|
+
}
|
|
1670
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1450
1671
|
template <class _Type>
|
|
1451
1672
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1452
1673
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
1453
1674
|
{
|
|
1675
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1676
|
+
NV_DISPATCH_TARGET(
|
|
1677
|
+
NV_PROVIDES_SM_90, (),
|
|
1678
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1679
|
+
)
|
|
1454
1680
|
asm volatile(R"YYY(
|
|
1455
|
-
|
|
1456
|
-
.reg .b128
|
|
1457
|
-
|
|
1458
|
-
mov.b128 {%
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1681
|
+
{
|
|
1682
|
+
.reg .b128 _d;
|
|
1683
|
+
.reg .b128 _v;
|
|
1684
|
+
mov.b128 _d, {%0, %1};
|
|
1685
|
+
mov.b128 _v, {%4, %5};
|
|
1686
|
+
atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v;
|
|
1687
|
+
mov.b128 {%0, %1}, _d;
|
|
1688
|
+
}
|
|
1689
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1462
1690
|
template <class _Type>
|
|
1463
1691
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1464
1692
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
1465
1693
|
{
|
|
1694
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1695
|
+
NV_DISPATCH_TARGET(
|
|
1696
|
+
NV_PROVIDES_SM_90, (),
|
|
1697
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1698
|
+
)
|
|
1466
1699
|
asm volatile(R"YYY(
|
|
1467
|
-
|
|
1468
|
-
.reg .b128
|
|
1469
|
-
|
|
1470
|
-
mov.b128 {%
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1700
|
+
{
|
|
1701
|
+
.reg .b128 _d;
|
|
1702
|
+
.reg .b128 _v;
|
|
1703
|
+
mov.b128 _d, {%0, %1};
|
|
1704
|
+
mov.b128 _v, {%4, %5};
|
|
1705
|
+
atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v;
|
|
1706
|
+
mov.b128 {%0, %1}, _d;
|
|
1707
|
+
}
|
|
1708
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1474
1709
|
template <class _Type>
|
|
1475
1710
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1476
1711
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
1477
1712
|
{
|
|
1713
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1714
|
+
NV_DISPATCH_TARGET(
|
|
1715
|
+
NV_PROVIDES_SM_90, (),
|
|
1716
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1717
|
+
)
|
|
1478
1718
|
asm volatile(R"YYY(
|
|
1479
|
-
|
|
1480
|
-
.reg .b128
|
|
1481
|
-
|
|
1482
|
-
mov.b128 {%
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1719
|
+
{
|
|
1720
|
+
.reg .b128 _d;
|
|
1721
|
+
.reg .b128 _v;
|
|
1722
|
+
mov.b128 _d, {%0, %1};
|
|
1723
|
+
mov.b128 _v, {%4, %5};
|
|
1724
|
+
atom.cas.relaxed.sys.b128 _d,[%2],_d,_v;
|
|
1725
|
+
mov.b128 {%0, %1}, _d;
|
|
1726
|
+
}
|
|
1727
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1486
1728
|
template <class _Type>
|
|
1487
1729
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1488
1730
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
1489
1731
|
{
|
|
1732
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1733
|
+
NV_DISPATCH_TARGET(
|
|
1734
|
+
NV_PROVIDES_SM_90, (),
|
|
1735
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1736
|
+
)
|
|
1490
1737
|
asm volatile(R"YYY(
|
|
1491
|
-
|
|
1492
|
-
.reg .b128
|
|
1493
|
-
|
|
1494
|
-
mov.b128 {%
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1738
|
+
{
|
|
1739
|
+
.reg .b128 _d;
|
|
1740
|
+
.reg .b128 _v;
|
|
1741
|
+
mov.b128 _d, {%0, %1};
|
|
1742
|
+
mov.b128 _v, {%4, %5};
|
|
1743
|
+
atom.cas.release.cta.b128 _d,[%2],_d,_v;
|
|
1744
|
+
mov.b128 {%0, %1}, _d;
|
|
1745
|
+
}
|
|
1746
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1498
1747
|
template <class _Type>
|
|
1499
1748
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1500
1749
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
1501
1750
|
{
|
|
1751
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1752
|
+
NV_DISPATCH_TARGET(
|
|
1753
|
+
NV_PROVIDES_SM_90, (),
|
|
1754
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1755
|
+
)
|
|
1502
1756
|
asm volatile(R"YYY(
|
|
1503
|
-
|
|
1504
|
-
.reg .b128
|
|
1505
|
-
|
|
1506
|
-
mov.b128 {%
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1757
|
+
{
|
|
1758
|
+
.reg .b128 _d;
|
|
1759
|
+
.reg .b128 _v;
|
|
1760
|
+
mov.b128 _d, {%0, %1};
|
|
1761
|
+
mov.b128 _v, {%4, %5};
|
|
1762
|
+
atom.cas.release.cluster.b128 _d,[%2],_d,_v;
|
|
1763
|
+
mov.b128 {%0, %1}, _d;
|
|
1764
|
+
}
|
|
1765
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1510
1766
|
template <class _Type>
|
|
1511
1767
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1512
1768
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
1513
1769
|
{
|
|
1770
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1771
|
+
NV_DISPATCH_TARGET(
|
|
1772
|
+
NV_PROVIDES_SM_90, (),
|
|
1773
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1774
|
+
)
|
|
1514
1775
|
asm volatile(R"YYY(
|
|
1515
|
-
|
|
1516
|
-
.reg .b128
|
|
1517
|
-
|
|
1518
|
-
mov.b128 {%
|
|
1519
|
-
|
|
1520
|
-
|
|
1521
|
-
|
|
1776
|
+
{
|
|
1777
|
+
.reg .b128 _d;
|
|
1778
|
+
.reg .b128 _v;
|
|
1779
|
+
mov.b128 _d, {%0, %1};
|
|
1780
|
+
mov.b128 _v, {%4, %5};
|
|
1781
|
+
atom.cas.release.gpu.b128 _d,[%2],_d,_v;
|
|
1782
|
+
mov.b128 {%0, %1}, _d;
|
|
1783
|
+
}
|
|
1784
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1522
1785
|
template <class _Type>
|
|
1523
1786
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1524
1787
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
1525
1788
|
{
|
|
1789
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1790
|
+
NV_DISPATCH_TARGET(
|
|
1791
|
+
NV_PROVIDES_SM_90, (),
|
|
1792
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1793
|
+
)
|
|
1526
1794
|
asm volatile(R"YYY(
|
|
1527
|
-
|
|
1528
|
-
.reg .b128
|
|
1529
|
-
|
|
1530
|
-
mov.b128 {%
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
|
|
1795
|
+
{
|
|
1796
|
+
.reg .b128 _d;
|
|
1797
|
+
.reg .b128 _v;
|
|
1798
|
+
mov.b128 _d, {%0, %1};
|
|
1799
|
+
mov.b128 _v, {%4, %5};
|
|
1800
|
+
atom.cas.release.sys.b128 _d,[%2],_d,_v;
|
|
1801
|
+
mov.b128 {%0, %1}, _d;
|
|
1802
|
+
}
|
|
1803
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1534
1804
|
template <class _Type>
|
|
1535
1805
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1536
1806
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
1537
1807
|
{
|
|
1808
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1809
|
+
NV_DISPATCH_TARGET(
|
|
1810
|
+
NV_PROVIDES_SM_90, (),
|
|
1811
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1812
|
+
)
|
|
1538
1813
|
asm volatile(R"YYY(
|
|
1539
|
-
|
|
1540
|
-
.reg .b128
|
|
1541
|
-
|
|
1542
|
-
mov.b128 {%
|
|
1543
|
-
|
|
1544
|
-
|
|
1545
|
-
|
|
1814
|
+
{
|
|
1815
|
+
.reg .b128 _d;
|
|
1816
|
+
.reg .b128 _v;
|
|
1817
|
+
mov.b128 _d, {%0, %1};
|
|
1818
|
+
mov.b128 _v, {%4, %5};
|
|
1819
|
+
atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v;
|
|
1820
|
+
mov.b128 {%0, %1}, _d;
|
|
1821
|
+
}
|
|
1822
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1546
1823
|
template <class _Type>
|
|
1547
1824
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1548
1825
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
1549
1826
|
{
|
|
1827
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1828
|
+
NV_DISPATCH_TARGET(
|
|
1829
|
+
NV_PROVIDES_SM_90, (),
|
|
1830
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1831
|
+
)
|
|
1550
1832
|
asm volatile(R"YYY(
|
|
1551
|
-
|
|
1552
|
-
.reg .b128
|
|
1553
|
-
|
|
1554
|
-
mov.b128 {%
|
|
1555
|
-
|
|
1556
|
-
|
|
1557
|
-
|
|
1833
|
+
{
|
|
1834
|
+
.reg .b128 _d;
|
|
1835
|
+
.reg .b128 _v;
|
|
1836
|
+
mov.b128 _d, {%0, %1};
|
|
1837
|
+
mov.b128 _v, {%4, %5};
|
|
1838
|
+
atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v;
|
|
1839
|
+
mov.b128 {%0, %1}, _d;
|
|
1840
|
+
}
|
|
1841
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1558
1842
|
template <class _Type>
|
|
1559
1843
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1560
1844
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
1561
1845
|
{
|
|
1846
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1847
|
+
NV_DISPATCH_TARGET(
|
|
1848
|
+
NV_PROVIDES_SM_90, (),
|
|
1849
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1850
|
+
)
|
|
1562
1851
|
asm volatile(R"YYY(
|
|
1563
|
-
|
|
1564
|
-
.reg .b128
|
|
1565
|
-
|
|
1566
|
-
mov.b128 {%
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1852
|
+
{
|
|
1853
|
+
.reg .b128 _d;
|
|
1854
|
+
.reg .b128 _v;
|
|
1855
|
+
mov.b128 _d, {%0, %1};
|
|
1856
|
+
mov.b128 _v, {%4, %5};
|
|
1857
|
+
atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v;
|
|
1858
|
+
mov.b128 {%0, %1}, _d;
|
|
1859
|
+
}
|
|
1860
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1570
1861
|
template <class _Type>
|
|
1571
1862
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1572
1863
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
1573
1864
|
{
|
|
1865
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1866
|
+
NV_DISPATCH_TARGET(
|
|
1867
|
+
NV_PROVIDES_SM_90, (),
|
|
1868
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1869
|
+
)
|
|
1574
1870
|
asm volatile(R"YYY(
|
|
1575
|
-
|
|
1576
|
-
.reg .b128
|
|
1577
|
-
|
|
1578
|
-
mov.b128 {%
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1871
|
+
{
|
|
1872
|
+
.reg .b128 _d;
|
|
1873
|
+
.reg .b128 _v;
|
|
1874
|
+
mov.b128 _d, {%0, %1};
|
|
1875
|
+
mov.b128 _v, {%4, %5};
|
|
1876
|
+
atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v;
|
|
1877
|
+
mov.b128 {%0, %1}, _d;
|
|
1878
|
+
}
|
|
1879
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1582
1880
|
template <class _Type>
|
|
1583
1881
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1584
1882
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
1585
1883
|
{
|
|
1884
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1885
|
+
NV_DISPATCH_TARGET(
|
|
1886
|
+
NV_PROVIDES_SM_90, (),
|
|
1887
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1888
|
+
)
|
|
1586
1889
|
asm volatile(R"YYY(
|
|
1587
|
-
|
|
1588
|
-
.reg .b128
|
|
1589
|
-
|
|
1590
|
-
mov.b128 {%
|
|
1591
|
-
|
|
1592
|
-
|
|
1593
|
-
|
|
1890
|
+
{
|
|
1891
|
+
.reg .b128 _d;
|
|
1892
|
+
.reg .b128 _v;
|
|
1893
|
+
mov.b128 _d, {%0, %1};
|
|
1894
|
+
mov.b128 _v, {%4, %5};
|
|
1895
|
+
atom.cas.cta.b128 _d,[%2],_d,_v;
|
|
1896
|
+
mov.b128 {%0, %1}, _d;
|
|
1897
|
+
}
|
|
1898
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1594
1899
|
template <class _Type>
|
|
1595
1900
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1596
1901
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
1597
1902
|
{
|
|
1903
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1904
|
+
NV_DISPATCH_TARGET(
|
|
1905
|
+
NV_PROVIDES_SM_90, (),
|
|
1906
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1907
|
+
)
|
|
1598
1908
|
asm volatile(R"YYY(
|
|
1599
|
-
|
|
1600
|
-
.reg .b128
|
|
1601
|
-
|
|
1602
|
-
mov.b128 {%
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1909
|
+
{
|
|
1910
|
+
.reg .b128 _d;
|
|
1911
|
+
.reg .b128 _v;
|
|
1912
|
+
mov.b128 _d, {%0, %1};
|
|
1913
|
+
mov.b128 _v, {%4, %5};
|
|
1914
|
+
atom.cas.cluster.b128 _d,[%2],_d,_v;
|
|
1915
|
+
mov.b128 {%0, %1}, _d;
|
|
1916
|
+
}
|
|
1917
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1606
1918
|
template <class _Type>
|
|
1607
1919
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1608
1920
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
1609
1921
|
{
|
|
1922
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1923
|
+
NV_DISPATCH_TARGET(
|
|
1924
|
+
NV_PROVIDES_SM_90, (),
|
|
1925
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1926
|
+
)
|
|
1610
1927
|
asm volatile(R"YYY(
|
|
1611
|
-
|
|
1612
|
-
.reg .b128
|
|
1613
|
-
|
|
1614
|
-
mov.b128 {%
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1928
|
+
{
|
|
1929
|
+
.reg .b128 _d;
|
|
1930
|
+
.reg .b128 _v;
|
|
1931
|
+
mov.b128 _d, {%0, %1};
|
|
1932
|
+
mov.b128 _v, {%4, %5};
|
|
1933
|
+
atom.cas.gpu.b128 _d,[%2],_d,_v;
|
|
1934
|
+
mov.b128 {%0, %1}, _d;
|
|
1935
|
+
}
|
|
1936
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1618
1937
|
template <class _Type>
|
|
1619
1938
|
static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
|
|
1620
1939
|
_Type* __ptr, _Type& __dst, _Type __cmp, _Type __op, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
1621
1940
|
{
|
|
1941
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b CAS is not supported until PTX ISA version 840");
|
|
1942
|
+
NV_DISPATCH_TARGET(
|
|
1943
|
+
NV_PROVIDES_SM_90, (),
|
|
1944
|
+
NV_ANY_TARGET, (__atomic_cas_128b_unsupported_before_SM_90();)
|
|
1945
|
+
)
|
|
1622
1946
|
asm volatile(R"YYY(
|
|
1623
|
-
|
|
1624
|
-
.reg .b128
|
|
1625
|
-
|
|
1626
|
-
mov.b128 {%
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1947
|
+
{
|
|
1948
|
+
.reg .b128 _d;
|
|
1949
|
+
.reg .b128 _v;
|
|
1950
|
+
mov.b128 _d, {%0, %1};
|
|
1951
|
+
mov.b128 _v, {%4, %5};
|
|
1952
|
+
atom.cas.sys.b128 _d,[%2],_d,_v;
|
|
1953
|
+
mov.b128 {%0, %1}, _d;
|
|
1954
|
+
}
|
|
1955
|
+
)YYY" : "=l"(__dst.__x),"=l"(__dst.__y) : "l"(__ptr), "l"(__cmp.__x),"l"(__cmp.__y), "l"(__op.__x),"l"(__op.__y) : "memory"); return __dst.__x == __cmp.__x && __dst.__y == __cmp.__y; }
|
|
1630
1956
|
|
|
1631
1957
|
template <typename _Type, typename _Tag, typename _Sco>
|
|
1632
1958
|
struct __cuda_atomic_bind_compare_exchange {
|
|
@@ -1858,241 +2184,381 @@ template <class _Type>
|
|
|
1858
2184
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1859
2185
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
1860
2186
|
{
|
|
2187
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2188
|
+
NV_DISPATCH_TARGET(
|
|
2189
|
+
NV_PROVIDES_SM_90, (),
|
|
2190
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2191
|
+
)
|
|
1861
2192
|
asm volatile(R"YYY(
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
2193
|
+
{
|
|
2194
|
+
.reg .b128 _d;
|
|
2195
|
+
.reg .b128 _v;
|
|
2196
|
+
mov.b128 _v, {%3, %4};
|
|
2197
|
+
atom.exch.acquire.cta.b128 _d,[%2],_v;
|
|
2198
|
+
mov.b128 {%0, %1}, _d;
|
|
2199
|
+
}
|
|
2200
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1868
2201
|
}
|
|
1869
2202
|
template <class _Type>
|
|
1870
2203
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1871
2204
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
1872
2205
|
{
|
|
2206
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2207
|
+
NV_DISPATCH_TARGET(
|
|
2208
|
+
NV_PROVIDES_SM_90, (),
|
|
2209
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2210
|
+
)
|
|
1873
2211
|
asm volatile(R"YYY(
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
2212
|
+
{
|
|
2213
|
+
.reg .b128 _d;
|
|
2214
|
+
.reg .b128 _v;
|
|
2215
|
+
mov.b128 _v, {%3, %4};
|
|
2216
|
+
atom.exch.acquire.cluster.b128 _d,[%2],_v;
|
|
2217
|
+
mov.b128 {%0, %1}, _d;
|
|
2218
|
+
}
|
|
2219
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1880
2220
|
}
|
|
1881
2221
|
template <class _Type>
|
|
1882
2222
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1883
2223
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
1884
2224
|
{
|
|
2225
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2226
|
+
NV_DISPATCH_TARGET(
|
|
2227
|
+
NV_PROVIDES_SM_90, (),
|
|
2228
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2229
|
+
)
|
|
1885
2230
|
asm volatile(R"YYY(
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
2231
|
+
{
|
|
2232
|
+
.reg .b128 _d;
|
|
2233
|
+
.reg .b128 _v;
|
|
2234
|
+
mov.b128 _v, {%3, %4};
|
|
2235
|
+
atom.exch.acquire.gpu.b128 _d,[%2],_v;
|
|
2236
|
+
mov.b128 {%0, %1}, _d;
|
|
2237
|
+
}
|
|
2238
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1892
2239
|
}
|
|
1893
2240
|
template <class _Type>
|
|
1894
2241
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1895
2242
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acquire, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
1896
2243
|
{
|
|
2244
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2245
|
+
NV_DISPATCH_TARGET(
|
|
2246
|
+
NV_PROVIDES_SM_90, (),
|
|
2247
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2248
|
+
)
|
|
1897
2249
|
asm volatile(R"YYY(
|
|
1898
|
-
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
2250
|
+
{
|
|
2251
|
+
.reg .b128 _d;
|
|
2252
|
+
.reg .b128 _v;
|
|
2253
|
+
mov.b128 _v, {%3, %4};
|
|
2254
|
+
atom.exch.acquire.sys.b128 _d,[%2],_v;
|
|
2255
|
+
mov.b128 {%0, %1}, _d;
|
|
2256
|
+
}
|
|
2257
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1904
2258
|
}
|
|
1905
2259
|
template <class _Type>
|
|
1906
2260
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1907
2261
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
1908
2262
|
{
|
|
2263
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2264
|
+
NV_DISPATCH_TARGET(
|
|
2265
|
+
NV_PROVIDES_SM_90, (),
|
|
2266
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2267
|
+
)
|
|
1909
2268
|
asm volatile(R"YYY(
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
2269
|
+
{
|
|
2270
|
+
.reg .b128 _d;
|
|
2271
|
+
.reg .b128 _v;
|
|
2272
|
+
mov.b128 _v, {%3, %4};
|
|
2273
|
+
atom.exch.relaxed.cta.b128 _d,[%2],_v;
|
|
2274
|
+
mov.b128 {%0, %1}, _d;
|
|
2275
|
+
}
|
|
2276
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1916
2277
|
}
|
|
1917
2278
|
template <class _Type>
|
|
1918
2279
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1919
2280
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
1920
2281
|
{
|
|
2282
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2283
|
+
NV_DISPATCH_TARGET(
|
|
2284
|
+
NV_PROVIDES_SM_90, (),
|
|
2285
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2286
|
+
)
|
|
1921
2287
|
asm volatile(R"YYY(
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
2288
|
+
{
|
|
2289
|
+
.reg .b128 _d;
|
|
2290
|
+
.reg .b128 _v;
|
|
2291
|
+
mov.b128 _v, {%3, %4};
|
|
2292
|
+
atom.exch.relaxed.cluster.b128 _d,[%2],_v;
|
|
2293
|
+
mov.b128 {%0, %1}, _d;
|
|
2294
|
+
}
|
|
2295
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1928
2296
|
}
|
|
1929
2297
|
template <class _Type>
|
|
1930
2298
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1931
2299
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
1932
2300
|
{
|
|
2301
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2302
|
+
NV_DISPATCH_TARGET(
|
|
2303
|
+
NV_PROVIDES_SM_90, (),
|
|
2304
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2305
|
+
)
|
|
1933
2306
|
asm volatile(R"YYY(
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
2307
|
+
{
|
|
2308
|
+
.reg .b128 _d;
|
|
2309
|
+
.reg .b128 _v;
|
|
2310
|
+
mov.b128 _v, {%3, %4};
|
|
2311
|
+
atom.exch.relaxed.gpu.b128 _d,[%2],_v;
|
|
2312
|
+
mov.b128 {%0, %1}, _d;
|
|
2313
|
+
}
|
|
2314
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1940
2315
|
}
|
|
1941
2316
|
template <class _Type>
|
|
1942
2317
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1943
2318
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_relaxed, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
1944
2319
|
{
|
|
2320
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2321
|
+
NV_DISPATCH_TARGET(
|
|
2322
|
+
NV_PROVIDES_SM_90, (),
|
|
2323
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2324
|
+
)
|
|
1945
2325
|
asm volatile(R"YYY(
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
|
|
2326
|
+
{
|
|
2327
|
+
.reg .b128 _d;
|
|
2328
|
+
.reg .b128 _v;
|
|
2329
|
+
mov.b128 _v, {%3, %4};
|
|
2330
|
+
atom.exch.relaxed.sys.b128 _d,[%2],_v;
|
|
2331
|
+
mov.b128 {%0, %1}, _d;
|
|
2332
|
+
}
|
|
2333
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1952
2334
|
}
|
|
1953
2335
|
template <class _Type>
|
|
1954
2336
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1955
2337
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
1956
2338
|
{
|
|
2339
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2340
|
+
NV_DISPATCH_TARGET(
|
|
2341
|
+
NV_PROVIDES_SM_90, (),
|
|
2342
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2343
|
+
)
|
|
1957
2344
|
asm volatile(R"YYY(
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
2345
|
+
{
|
|
2346
|
+
.reg .b128 _d;
|
|
2347
|
+
.reg .b128 _v;
|
|
2348
|
+
mov.b128 _v, {%3, %4};
|
|
2349
|
+
atom.exch.release.cta.b128 _d,[%2],_v;
|
|
2350
|
+
mov.b128 {%0, %1}, _d;
|
|
2351
|
+
}
|
|
2352
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1964
2353
|
}
|
|
1965
2354
|
template <class _Type>
|
|
1966
2355
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1967
2356
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
1968
2357
|
{
|
|
2358
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2359
|
+
NV_DISPATCH_TARGET(
|
|
2360
|
+
NV_PROVIDES_SM_90, (),
|
|
2361
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2362
|
+
)
|
|
1969
2363
|
asm volatile(R"YYY(
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
|
|
1974
|
-
|
|
1975
|
-
|
|
2364
|
+
{
|
|
2365
|
+
.reg .b128 _d;
|
|
2366
|
+
.reg .b128 _v;
|
|
2367
|
+
mov.b128 _v, {%3, %4};
|
|
2368
|
+
atom.exch.release.cluster.b128 _d,[%2],_v;
|
|
2369
|
+
mov.b128 {%0, %1}, _d;
|
|
2370
|
+
}
|
|
2371
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1976
2372
|
}
|
|
1977
2373
|
template <class _Type>
|
|
1978
2374
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1979
2375
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
1980
2376
|
{
|
|
2377
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2378
|
+
NV_DISPATCH_TARGET(
|
|
2379
|
+
NV_PROVIDES_SM_90, (),
|
|
2380
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2381
|
+
)
|
|
1981
2382
|
asm volatile(R"YYY(
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
|
|
1987
|
-
|
|
2383
|
+
{
|
|
2384
|
+
.reg .b128 _d;
|
|
2385
|
+
.reg .b128 _v;
|
|
2386
|
+
mov.b128 _v, {%3, %4};
|
|
2387
|
+
atom.exch.release.gpu.b128 _d,[%2],_v;
|
|
2388
|
+
mov.b128 {%0, %1}, _d;
|
|
2389
|
+
}
|
|
2390
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
1988
2391
|
}
|
|
1989
2392
|
template <class _Type>
|
|
1990
2393
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
1991
2394
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_release, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
1992
2395
|
{
|
|
2396
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2397
|
+
NV_DISPATCH_TARGET(
|
|
2398
|
+
NV_PROVIDES_SM_90, (),
|
|
2399
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2400
|
+
)
|
|
1993
2401
|
asm volatile(R"YYY(
|
|
1994
|
-
|
|
1995
|
-
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
|
|
2402
|
+
{
|
|
2403
|
+
.reg .b128 _d;
|
|
2404
|
+
.reg .b128 _v;
|
|
2405
|
+
mov.b128 _v, {%3, %4};
|
|
2406
|
+
atom.exch.release.sys.b128 _d,[%2],_v;
|
|
2407
|
+
mov.b128 {%0, %1}, _d;
|
|
2408
|
+
}
|
|
2409
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2000
2410
|
}
|
|
2001
2411
|
template <class _Type>
|
|
2002
2412
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
2003
2413
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
2004
2414
|
{
|
|
2415
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2416
|
+
NV_DISPATCH_TARGET(
|
|
2417
|
+
NV_PROVIDES_SM_90, (),
|
|
2418
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2419
|
+
)
|
|
2005
2420
|
asm volatile(R"YYY(
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2421
|
+
{
|
|
2422
|
+
.reg .b128 _d;
|
|
2423
|
+
.reg .b128 _v;
|
|
2424
|
+
mov.b128 _v, {%3, %4};
|
|
2425
|
+
atom.exch.acq_rel.cta.b128 _d,[%2],_v;
|
|
2426
|
+
mov.b128 {%0, %1}, _d;
|
|
2427
|
+
}
|
|
2428
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2012
2429
|
}
|
|
2013
2430
|
template <class _Type>
|
|
2014
2431
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
2015
2432
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
2016
2433
|
{
|
|
2434
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2435
|
+
NV_DISPATCH_TARGET(
|
|
2436
|
+
NV_PROVIDES_SM_90, (),
|
|
2437
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2438
|
+
)
|
|
2017
2439
|
asm volatile(R"YYY(
|
|
2018
|
-
|
|
2019
|
-
|
|
2020
|
-
|
|
2021
|
-
|
|
2022
|
-
|
|
2023
|
-
|
|
2440
|
+
{
|
|
2441
|
+
.reg .b128 _d;
|
|
2442
|
+
.reg .b128 _v;
|
|
2443
|
+
mov.b128 _v, {%3, %4};
|
|
2444
|
+
atom.exch.acq_rel.cluster.b128 _d,[%2],_v;
|
|
2445
|
+
mov.b128 {%0, %1}, _d;
|
|
2446
|
+
}
|
|
2447
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2024
2448
|
}
|
|
2025
2449
|
template <class _Type>
|
|
2026
2450
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
2027
2451
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
2028
2452
|
{
|
|
2453
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2454
|
+
NV_DISPATCH_TARGET(
|
|
2455
|
+
NV_PROVIDES_SM_90, (),
|
|
2456
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2457
|
+
)
|
|
2029
2458
|
asm volatile(R"YYY(
|
|
2030
|
-
|
|
2031
|
-
|
|
2032
|
-
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2459
|
+
{
|
|
2460
|
+
.reg .b128 _d;
|
|
2461
|
+
.reg .b128 _v;
|
|
2462
|
+
mov.b128 _v, {%3, %4};
|
|
2463
|
+
atom.exch.acq_rel.gpu.b128 _d,[%2],_v;
|
|
2464
|
+
mov.b128 {%0, %1}, _d;
|
|
2465
|
+
}
|
|
2466
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2036
2467
|
}
|
|
2037
2468
|
template <class _Type>
|
|
2038
2469
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
2039
2470
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_acq_rel, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
2040
2471
|
{
|
|
2472
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2473
|
+
NV_DISPATCH_TARGET(
|
|
2474
|
+
NV_PROVIDES_SM_90, (),
|
|
2475
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2476
|
+
)
|
|
2041
2477
|
asm volatile(R"YYY(
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
|
|
2047
|
-
|
|
2478
|
+
{
|
|
2479
|
+
.reg .b128 _d;
|
|
2480
|
+
.reg .b128 _v;
|
|
2481
|
+
mov.b128 _v, {%3, %4};
|
|
2482
|
+
atom.exch.acq_rel.sys.b128 _d,[%2],_v;
|
|
2483
|
+
mov.b128 {%0, %1}, _d;
|
|
2484
|
+
}
|
|
2485
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2048
2486
|
}
|
|
2049
2487
|
template <class _Type>
|
|
2050
2488
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
2051
2489
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_block_tag)
|
|
2052
2490
|
{
|
|
2491
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2492
|
+
NV_DISPATCH_TARGET(
|
|
2493
|
+
NV_PROVIDES_SM_90, (),
|
|
2494
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2495
|
+
)
|
|
2053
2496
|
asm volatile(R"YYY(
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2497
|
+
{
|
|
2498
|
+
.reg .b128 _d;
|
|
2499
|
+
.reg .b128 _v;
|
|
2500
|
+
mov.b128 _v, {%3, %4};
|
|
2501
|
+
atom.exch.cta.b128 _d,[%2],_v;
|
|
2502
|
+
mov.b128 {%0, %1}, _d;
|
|
2503
|
+
}
|
|
2504
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2060
2505
|
}
|
|
2061
2506
|
template <class _Type>
|
|
2062
2507
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
2063
2508
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_cluster_tag)
|
|
2064
2509
|
{
|
|
2510
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2511
|
+
NV_DISPATCH_TARGET(
|
|
2512
|
+
NV_PROVIDES_SM_90, (),
|
|
2513
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2514
|
+
)
|
|
2065
2515
|
asm volatile(R"YYY(
|
|
2066
|
-
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2516
|
+
{
|
|
2517
|
+
.reg .b128 _d;
|
|
2518
|
+
.reg .b128 _v;
|
|
2519
|
+
mov.b128 _v, {%3, %4};
|
|
2520
|
+
atom.exch.cluster.b128 _d,[%2],_v;
|
|
2521
|
+
mov.b128 {%0, %1}, _d;
|
|
2522
|
+
}
|
|
2523
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2072
2524
|
}
|
|
2073
2525
|
template <class _Type>
|
|
2074
2526
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
2075
2527
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_device_tag)
|
|
2076
2528
|
{
|
|
2529
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2530
|
+
NV_DISPATCH_TARGET(
|
|
2531
|
+
NV_PROVIDES_SM_90, (),
|
|
2532
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2533
|
+
)
|
|
2077
2534
|
asm volatile(R"YYY(
|
|
2078
|
-
|
|
2079
|
-
|
|
2080
|
-
|
|
2081
|
-
|
|
2082
|
-
|
|
2083
|
-
|
|
2535
|
+
{
|
|
2536
|
+
.reg .b128 _d;
|
|
2537
|
+
.reg .b128 _v;
|
|
2538
|
+
mov.b128 _v, {%3, %4};
|
|
2539
|
+
atom.exch.gpu.b128 _d,[%2],_v;
|
|
2540
|
+
mov.b128 {%0, %1}, _d;
|
|
2541
|
+
}
|
|
2542
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2084
2543
|
}
|
|
2085
2544
|
template <class _Type>
|
|
2086
2545
|
static inline _CCCL_DEVICE void __cuda_atomic_exchange(
|
|
2087
2546
|
_Type* __ptr, _Type& __old, _Type __new, __atomic_cuda_volatile, __atomic_cuda_operand_b128, __thread_scope_system_tag)
|
|
2088
2547
|
{
|
|
2548
|
+
static_assert(__cccl_ptx_isa >= 840 && (sizeof(_Type) == 16), "128b exchange is not supported until PTX ISA version 840");
|
|
2549
|
+
NV_DISPATCH_TARGET(
|
|
2550
|
+
NV_PROVIDES_SM_90, (),
|
|
2551
|
+
NV_ANY_TARGET, (__atomic_exchange_128b_unsupported_before_SM_90();)
|
|
2552
|
+
)
|
|
2089
2553
|
asm volatile(R"YYY(
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2554
|
+
{
|
|
2555
|
+
.reg .b128 _d;
|
|
2556
|
+
.reg .b128 _v;
|
|
2557
|
+
mov.b128 _v, {%3, %4};
|
|
2558
|
+
atom.exch.sys.b128 _d,[%2],_v;
|
|
2559
|
+
mov.b128 {%0, %1}, _d;
|
|
2560
|
+
}
|
|
2561
|
+
)YYY" : "=l"(__old.__x),"=l"(__old.__y) : "l"(__ptr), "l"(__new.__x),"l"(__new.__y) : "memory");
|
|
2096
2562
|
}
|
|
2097
2563
|
|
|
2098
2564
|
template <typename _Type, typename _Tag, typename _Sco>
|
|
@@ -2572,7 +3038,7 @@ struct __cuda_atomic_bind_fetch_add {
|
|
|
2572
3038
|
}
|
|
2573
3039
|
};
|
|
2574
3040
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_arithmetic<_Type> = 0>
|
|
2575
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3041
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
2576
3042
|
{
|
|
2577
3043
|
constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip;
|
|
2578
3044
|
__op = __op * __skip_v;
|
|
@@ -2588,7 +3054,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type* __ptr, _Up __op,
|
|
|
2588
3054
|
return __dst;
|
|
2589
3055
|
}
|
|
2590
3056
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_arithmetic<_Type> = 0>
|
|
2591
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3057
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_add_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
2592
3058
|
{
|
|
2593
3059
|
constexpr auto __skip_v = __atomic_ptr_skip_t<_Type>::__skip;
|
|
2594
3060
|
__op = __op * __skip_v;
|
|
@@ -2777,7 +3243,7 @@ struct __cuda_atomic_bind_fetch_and {
|
|
|
2777
3243
|
}
|
|
2778
3244
|
};
|
|
2779
3245
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
|
|
2780
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3246
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
2781
3247
|
{
|
|
2782
3248
|
constexpr auto __skip_v = 1;
|
|
2783
3249
|
__op = __op * __skip_v;
|
|
@@ -2793,7 +3259,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type* __ptr, _Up __op,
|
|
|
2793
3259
|
return __dst;
|
|
2794
3260
|
}
|
|
2795
3261
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
|
|
2796
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3262
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_and_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
2797
3263
|
{
|
|
2798
3264
|
constexpr auto __skip_v = 1;
|
|
2799
3265
|
__op = __op * __skip_v;
|
|
@@ -3142,7 +3608,7 @@ struct __cuda_atomic_bind_fetch_max {
|
|
|
3142
3608
|
}
|
|
3143
3609
|
};
|
|
3144
3610
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_minmax<_Type> = 0>
|
|
3145
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3611
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3146
3612
|
{
|
|
3147
3613
|
constexpr auto __skip_v = 1;
|
|
3148
3614
|
__op = __op * __skip_v;
|
|
@@ -3158,7 +3624,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type* __ptr, _Up __op,
|
|
|
3158
3624
|
return __dst;
|
|
3159
3625
|
}
|
|
3160
3626
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_minmax<_Type> = 0>
|
|
3161
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3627
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_max_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3162
3628
|
{
|
|
3163
3629
|
constexpr auto __skip_v = 1;
|
|
3164
3630
|
__op = __op * __skip_v;
|
|
@@ -3507,7 +3973,7 @@ struct __cuda_atomic_bind_fetch_min {
|
|
|
3507
3973
|
}
|
|
3508
3974
|
};
|
|
3509
3975
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_minmax<_Type> = 0>
|
|
3510
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3976
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3511
3977
|
{
|
|
3512
3978
|
constexpr auto __skip_v = 1;
|
|
3513
3979
|
__op = __op * __skip_v;
|
|
@@ -3523,7 +3989,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type* __ptr, _Up __op,
|
|
|
3523
3989
|
return __dst;
|
|
3524
3990
|
}
|
|
3525
3991
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_minmax<_Type> = 0>
|
|
3526
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3992
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_min_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3527
3993
|
{
|
|
3528
3994
|
constexpr auto __skip_v = 1;
|
|
3529
3995
|
__op = __op * __skip_v;
|
|
@@ -3712,7 +4178,7 @@ struct __cuda_atomic_bind_fetch_or {
|
|
|
3712
4178
|
}
|
|
3713
4179
|
};
|
|
3714
4180
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
|
|
3715
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
4181
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3716
4182
|
{
|
|
3717
4183
|
constexpr auto __skip_v = 1;
|
|
3718
4184
|
__op = __op * __skip_v;
|
|
@@ -3728,7 +4194,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type* __ptr, _Up __op,
|
|
|
3728
4194
|
return __dst;
|
|
3729
4195
|
}
|
|
3730
4196
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
|
|
3731
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
4197
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_or_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3732
4198
|
{
|
|
3733
4199
|
constexpr auto __skip_v = 1;
|
|
3734
4200
|
__op = __op * __skip_v;
|
|
@@ -3917,7 +4383,7 @@ struct __cuda_atomic_bind_fetch_xor {
|
|
|
3917
4383
|
}
|
|
3918
4384
|
};
|
|
3919
4385
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
|
|
3920
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
4386
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3921
4387
|
{
|
|
3922
4388
|
constexpr auto __skip_v = 1;
|
|
3923
4389
|
__op = __op * __skip_v;
|
|
@@ -3933,7 +4399,7 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type* __ptr, _Up __op,
|
|
|
3933
4399
|
return __dst;
|
|
3934
4400
|
}
|
|
3935
4401
|
template <class _Type, class _Up, class _Sco, __atomic_enable_if_native_bitwise<_Type> = 0>
|
|
3936
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
4402
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3937
4403
|
{
|
|
3938
4404
|
constexpr auto __skip_v = 1;
|
|
3939
4405
|
__op = __op * __skip_v;
|
|
@@ -3950,12 +4416,12 @@ static inline _CCCL_DEVICE _Type __atomic_fetch_xor_cuda(_Type volatile* __ptr,
|
|
|
3950
4416
|
}
|
|
3951
4417
|
|
|
3952
4418
|
template <class _Type, class _Up, class _Sco>
|
|
3953
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
4419
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type* __ptr, _Up __op, int __memorder, _Sco)
|
|
3954
4420
|
{
|
|
3955
4421
|
return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{});
|
|
3956
4422
|
}
|
|
3957
4423
|
template <class _Type, class _Up, class _Sco>
|
|
3958
|
-
static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
4424
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Type __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Up __op, int __memorder, _Sco)
|
|
3959
4425
|
{
|
|
3960
4426
|
return __atomic_fetch_add_cuda(__ptr, -__op, __memorder, _Sco{});
|
|
3961
4427
|
}
|