cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
- cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
- cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
- cuda/cccl/headers/include/cub/config.cuh +2 -26
- cuda/cccl/headers/include/cub/cub.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
- cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
- cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
- cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
- cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
- cuda/cccl/headers/include/cub/util_device.cuh +18 -59
- cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
- cuda/cccl/headers/include/cub/util_math.cuh +2 -28
- cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
- cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
- cuda/cccl/headers/include/cub/util_type.cuh +5 -32
- cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
- cuda/cccl/headers/include/cub/version.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
- cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
- cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
- cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
- cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
- cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
- cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
- cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
- cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
- cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
- cuda/cccl/headers/include/cuda/barrier +42 -16
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/memory_resource +6 -1
- cuda/cccl/headers/include/cuda/numeric +2 -0
- cuda/cccl/headers/include/cuda/pipeline +3 -2
- cuda/cccl/headers/include/cuda/ptx +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
- cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
- cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
- cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
- cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
- cuda/cccl/headers/include/cuda/std/__new_ +1 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
- cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
- cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
- cuda/cccl/headers/include/cuda/std/__random_ +2 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
- cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
- cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
- cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
- cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
- cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
- cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
- cuda/cccl/headers/include/cuda/std/array +1 -1
- cuda/cccl/headers/include/cuda/std/atomic +1 -1
- cuda/cccl/headers/include/cuda/std/bitset +2 -10
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
- cuda/cccl/headers/include/cuda/std/functional +1 -1
- cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
- cuda/cccl/headers/include/cuda/std/iterator +1 -1
- cuda/cccl/headers/include/cuda/std/numbers +0 -2
- cuda/cccl/headers/include/cuda/std/ratio +2 -2
- cuda/cccl/headers/include/cuda/std/span +2 -2
- cuda/cccl/headers/include/cuda/std/string_view +24 -42
- cuda/cccl/headers/include/cuda/std/tuple +18 -1
- cuda/cccl/headers/include/cuda/std/type_traits +0 -1
- cuda/cccl/headers/include/cuda/std/variant +8 -1
- cuda/cccl/headers/include/nv/target +2 -6
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
- cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
- cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
- cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
- cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
- cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
- cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
- cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
- cuda/cccl/headers/include/thrust/functional.h +0 -2
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/new.h +0 -2
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
- cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
- cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
- cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
- cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
- cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
- cuda/cccl/headers/include/thrust/transform.h +14 -3
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
- cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
- cuda/cccl/headers/include/thrust/zip_function.h +2 -28
- cuda/compute/__init__.py +4 -0
- cuda/compute/_bindings.pyi +26 -3
- cuda/compute/_bindings_impl.pyx +143 -1
- cuda/compute/algorithms/__init__.py +9 -5
- cuda/compute/algorithms/_sort/__init__.py +23 -0
- cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
- cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
- cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
- cuda/compute/algorithms/_sort/_sort_common.py +52 -0
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda_cccl-0.3.4.dist-info/METADATA +78 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
- cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
- cuda_cccl-0.3.2.dist-info/METADATA +0 -42
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -29,7 +29,6 @@
|
|
|
29
29
|
#include <cuda/__barrier/barrier_arrive_tx.h>
|
|
30
30
|
#include <cuda/__barrier/barrier_block_scope.h>
|
|
31
31
|
#include <cuda/__barrier/barrier_expect_tx.h>
|
|
32
|
-
#include <cuda/__barrier/barrier_native_handle.h>
|
|
33
32
|
#include <cuda/__barrier/barrier_thread_scope.h>
|
|
34
33
|
#include <cuda/__memcpy_async/memcpy_async.h>
|
|
35
34
|
#include <cuda/__memcpy_async/memcpy_async_tx.h>
|
|
@@ -69,8 +68,9 @@ _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
|
|
|
69
68
|
// device compilation begins.
|
|
70
69
|
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure
|
|
71
70
|
|
|
71
|
+
//! Deprecated [Since 3.2]
|
|
72
72
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
|
|
73
|
-
inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
|
|
73
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk instead") inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
|
|
74
74
|
void* __dest, const void* __src, ::cuda::std::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
|
|
75
75
|
{
|
|
76
76
|
_CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
|
|
@@ -88,8 +88,10 @@ inline _CCCL_DEVICE void cp_async_bulk_global_to_shared(
|
|
|
88
88
|
::cuda::device::barrier_native_handle(__bar));
|
|
89
89
|
}
|
|
90
90
|
|
|
91
|
+
//! Deprecated [Since 3.2]
|
|
91
92
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
|
|
92
|
-
|
|
93
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk instead") inline _CCCL_DEVICE void
|
|
94
|
+
cp_async_bulk_shared_to_global(void* __dest, const void* __src, ::cuda::std::uint32_t __size)
|
|
93
95
|
{
|
|
94
96
|
_CCCL_ASSERT(__size % 16 == 0, "Size must be multiple of 16.");
|
|
95
97
|
_CCCL_ASSERT(::cuda::device::is_address_from(__dest, ::cuda::device::address_space::global),
|
|
@@ -100,8 +102,10 @@ inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void
|
|
|
100
102
|
::cuda::ptx::cp_async_bulk(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __dest, __src, __size);
|
|
101
103
|
}
|
|
102
104
|
|
|
105
|
+
//! Deprecated [Since 3.2]
|
|
103
106
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
104
|
-
inline _CCCL_DEVICE void
|
|
107
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
108
|
+
cp_async_bulk_tensor_1d_global_to_shared(
|
|
105
109
|
void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
|
|
106
110
|
{
|
|
107
111
|
const ::cuda::std::int32_t __coords[]{__c0};
|
|
@@ -115,8 +119,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared(
|
|
|
115
119
|
::cuda::device::barrier_native_handle(__bar));
|
|
116
120
|
}
|
|
117
121
|
|
|
122
|
+
//! Deprecated [Since 3.2]
|
|
118
123
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
119
|
-
inline _CCCL_DEVICE void
|
|
124
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
125
|
+
cp_async_bulk_tensor_2d_global_to_shared(
|
|
120
126
|
void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar)
|
|
121
127
|
{
|
|
122
128
|
const ::cuda::std::int32_t __coords[]{__c0, __c1};
|
|
@@ -130,8 +136,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared(
|
|
|
130
136
|
::cuda::device::barrier_native_handle(__bar));
|
|
131
137
|
}
|
|
132
138
|
|
|
139
|
+
//! Deprecated [Since 3.2]
|
|
133
140
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
134
|
-
inline _CCCL_DEVICE void
|
|
141
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
142
|
+
cp_async_bulk_tensor_3d_global_to_shared(
|
|
135
143
|
void* __dest,
|
|
136
144
|
const CUtensorMap* __tensor_map,
|
|
137
145
|
int __c0,
|
|
@@ -150,8 +158,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared(
|
|
|
150
158
|
::cuda::device::barrier_native_handle(__bar));
|
|
151
159
|
}
|
|
152
160
|
|
|
161
|
+
//! Deprecated [Since 3.2]
|
|
153
162
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
154
|
-
inline _CCCL_DEVICE void
|
|
163
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
164
|
+
cp_async_bulk_tensor_4d_global_to_shared(
|
|
155
165
|
void* __dest,
|
|
156
166
|
const CUtensorMap* __tensor_map,
|
|
157
167
|
int __c0,
|
|
@@ -171,8 +181,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared(
|
|
|
171
181
|
::cuda::device::barrier_native_handle(__bar));
|
|
172
182
|
}
|
|
173
183
|
|
|
184
|
+
//! Deprecated [Since 3.2]
|
|
174
185
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
175
|
-
inline _CCCL_DEVICE void
|
|
186
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
187
|
+
cp_async_bulk_tensor_5d_global_to_shared(
|
|
176
188
|
void* __dest,
|
|
177
189
|
const CUtensorMap* __tensor_map,
|
|
178
190
|
int __c0,
|
|
@@ -193,8 +205,9 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared(
|
|
|
193
205
|
::cuda::device::barrier_native_handle(__bar));
|
|
194
206
|
}
|
|
195
207
|
|
|
208
|
+
//! Deprecated [Since 3.2]
|
|
196
209
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
197
|
-
inline _CCCL_DEVICE void
|
|
210
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
198
211
|
cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src)
|
|
199
212
|
{
|
|
200
213
|
const ::cuda::std::int32_t __coords[]{__c0};
|
|
@@ -202,8 +215,9 @@ cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __
|
|
|
202
215
|
::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
|
|
203
216
|
}
|
|
204
217
|
|
|
218
|
+
//! Deprecated [Since 3.2]
|
|
205
219
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
206
|
-
inline _CCCL_DEVICE void
|
|
220
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
207
221
|
cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src)
|
|
208
222
|
{
|
|
209
223
|
const ::cuda::std::int32_t __coords[]{__c0, __c1};
|
|
@@ -211,8 +225,10 @@ cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __
|
|
|
211
225
|
::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
|
|
212
226
|
}
|
|
213
227
|
|
|
228
|
+
//! Deprecated [Since 3.2]
|
|
214
229
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
215
|
-
inline _CCCL_DEVICE void
|
|
230
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
231
|
+
cp_async_bulk_tensor_3d_shared_to_global(
|
|
216
232
|
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src)
|
|
217
233
|
{
|
|
218
234
|
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2};
|
|
@@ -220,8 +236,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global(
|
|
|
220
236
|
::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
|
|
221
237
|
}
|
|
222
238
|
|
|
239
|
+
//! Deprecated [Since 3.2]
|
|
223
240
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
224
|
-
inline _CCCL_DEVICE void
|
|
241
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
242
|
+
cp_async_bulk_tensor_4d_shared_to_global(
|
|
225
243
|
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src)
|
|
226
244
|
{
|
|
227
245
|
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3};
|
|
@@ -229,8 +247,10 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global(
|
|
|
229
247
|
::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
|
|
230
248
|
}
|
|
231
249
|
|
|
250
|
+
//! Deprecated [Since 3.2]
|
|
232
251
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
|
|
233
|
-
inline _CCCL_DEVICE void
|
|
252
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_tensor instead") inline _CCCL_DEVICE void
|
|
253
|
+
cp_async_bulk_tensor_5d_shared_to_global(
|
|
234
254
|
const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src)
|
|
235
255
|
{
|
|
236
256
|
const ::cuda::std::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};
|
|
@@ -238,21 +258,27 @@ inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global(
|
|
|
238
258
|
::cuda::ptx::cp_async_bulk_tensor(::cuda::ptx::space_global, ::cuda::ptx::space_shared, __tensor_map, __coords, __src);
|
|
239
259
|
}
|
|
240
260
|
|
|
261
|
+
//! Deprecated [Since 3.2]
|
|
241
262
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
|
|
242
|
-
inline _CCCL_DEVICE void
|
|
263
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::fence_proxy_async instead") inline _CCCL_DEVICE void
|
|
264
|
+
fence_proxy_async_shared_cta()
|
|
243
265
|
{
|
|
244
266
|
::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);
|
|
245
267
|
}
|
|
246
268
|
|
|
269
|
+
//! Deprecated [Since 3.2]
|
|
247
270
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
|
|
248
|
-
inline _CCCL_DEVICE void
|
|
271
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_commit_group instead") inline _CCCL_DEVICE void
|
|
272
|
+
cp_async_bulk_commit_group()
|
|
249
273
|
{
|
|
250
274
|
::cuda::ptx::cp_async_bulk_commit_group();
|
|
251
275
|
}
|
|
252
276
|
|
|
277
|
+
//! Deprecated [Since 3.2]
|
|
253
278
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
|
|
254
279
|
template <int __n_prior>
|
|
255
|
-
inline _CCCL_DEVICE void
|
|
280
|
+
CCCL_DEPRECATED_BECAUSE("Use cuda::ptx::cp_async_bulk_wait_group_read instead") inline _CCCL_DEVICE void
|
|
281
|
+
cp_async_bulk_wait_group_read()
|
|
256
282
|
{
|
|
257
283
|
static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
|
|
258
284
|
::cuda::ptx::cp_async_bulk_wait_group_read(::cuda::ptx::n32_t<__n_prior>{});
|
|
@@ -26,10 +26,15 @@
|
|
|
26
26
|
//!
|
|
27
27
|
//!@endrst
|
|
28
28
|
|
|
29
|
+
#include <cuda/__memory_resource/any_resource.h>
|
|
30
|
+
#include <cuda/__memory_resource/device_memory_pool.h>
|
|
29
31
|
#include <cuda/__memory_resource/get_memory_resource.h>
|
|
30
32
|
#include <cuda/__memory_resource/get_property.h>
|
|
33
|
+
#include <cuda/__memory_resource/legacy_managed_memory_resource.h>
|
|
34
|
+
#include <cuda/__memory_resource/legacy_pinned_memory_resource.h>
|
|
35
|
+
#include <cuda/__memory_resource/managed_memory_pool.h>
|
|
36
|
+
#include <cuda/__memory_resource/pinned_memory_pool.h>
|
|
31
37
|
#include <cuda/__memory_resource/properties.h>
|
|
32
38
|
#include <cuda/__memory_resource/resource.h>
|
|
33
|
-
#include <cuda/__memory_resource/resource_ref.h>
|
|
34
39
|
|
|
35
40
|
#endif //_CCCL_BEGIN_NAMESPACE_CUDA
|
|
@@ -21,9 +21,11 @@
|
|
|
21
21
|
#endif // no system header
|
|
22
22
|
|
|
23
23
|
#include <cuda/__numeric/add_overflow.h>
|
|
24
|
+
#include <cuda/__numeric/div_overflow.h>
|
|
24
25
|
#include <cuda/__numeric/narrow.h>
|
|
25
26
|
#include <cuda/__numeric/overflow_cast.h>
|
|
26
27
|
#include <cuda/__numeric/overflow_result.h>
|
|
28
|
+
#include <cuda/__numeric/sub_overflow.h>
|
|
27
29
|
#include <cuda/std/numeric>
|
|
28
30
|
|
|
29
31
|
#endif // _CUDA_NUMERIC
|
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
#include <cuda/atomic>
|
|
30
30
|
#include <cuda/barrier>
|
|
31
31
|
#include <cuda/std/__algorithm/max.h>
|
|
32
|
+
#include <cuda/std/__bit/popcount.h>
|
|
32
33
|
#include <cuda/std/__chrono/duration.h>
|
|
33
34
|
#include <cuda/std/__chrono/time_point.h>
|
|
34
35
|
#include <cuda/std/cstdint>
|
|
@@ -112,7 +113,7 @@ public:
|
|
|
112
113
|
::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount()));
|
|
113
114
|
const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
|
|
114
115
|
__elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
|
|
115
|
-
__sub_count = ::
|
|
116
|
+
__sub_count = ::cuda::std::popcount(__match_mask);
|
|
116
117
|
, __elected = true;
|
|
117
118
|
__sub_count = 1;)
|
|
118
119
|
bool __released = false;
|
|
@@ -301,7 +302,7 @@ make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count
|
|
|
301
302
|
::__match_any_sync(::__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount));
|
|
302
303
|
const uint32_t __elected_id = ::__ffs(__match_mask) - 1;
|
|
303
304
|
__elected = (::cuda::ptx::get_sreg_laneid() == __elected_id);
|
|
304
|
-
__add_count = ::
|
|
305
|
+
__add_count = ::cuda::std::popcount(__match_mask);
|
|
305
306
|
, __elected = true;
|
|
306
307
|
__add_count = 1;)
|
|
307
308
|
if (__elected)
|
|
@@ -114,6 +114,7 @@
|
|
|
114
114
|
#include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
|
|
115
115
|
#include <cuda/__ptx/instructions/tensormap_replace.h>
|
|
116
116
|
#include <cuda/__ptx/instructions/trap.h>
|
|
117
|
+
#include <cuda/__ptx/pragmas/enable_smem_spilling.h>
|
|
117
118
|
|
|
118
119
|
#include <cuda/std/__cccl/prologue.h>
|
|
119
120
|
|
|
@@ -35,14 +35,12 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD
|
|
|
35
35
|
|
|
36
36
|
namespace __unique_copy_tags
|
|
37
37
|
{
|
|
38
|
-
|
|
39
38
|
struct __reread_from_input_tag
|
|
40
39
|
{};
|
|
41
40
|
struct __reread_from_output_tag
|
|
42
41
|
{};
|
|
43
42
|
struct __read_from_tmp_value_tag
|
|
44
43
|
{};
|
|
45
|
-
|
|
46
44
|
} // namespace __unique_copy_tags
|
|
47
45
|
|
|
48
46
|
_CCCL_EXEC_CHECK_DISABLE
|
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
#include <cuda/std/__atomic/api/common.h>
|
|
25
25
|
#include <cuda/std/__atomic/order.h>
|
|
26
26
|
#include <cuda/std/__atomic/scopes.h>
|
|
27
|
-
#include <cuda/std/__atomic/types
|
|
27
|
+
#include <cuda/std/__atomic/types.h>
|
|
28
28
|
#include <cuda/std/__atomic/wait/notify_wait.h>
|
|
29
29
|
#include <cuda/std/__atomic/wait/polling.h>
|
|
30
30
|
#include <cuda/std/__type_traits/conditional.h>
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
24
|
#include <cuda/std/__atomic/functions/cuda_ptx_generated.h>
|
|
25
|
+
#include <cuda/std/__functional/operations.h>
|
|
25
26
|
#include <cuda/std/__type_traits/conditional.h>
|
|
26
27
|
#include <cuda/std/__type_traits/enable_if.h>
|
|
27
28
|
#include <cuda/std/__type_traits/is_scalar.h>
|
|
@@ -147,6 +148,45 @@ _CCCL_DEVICE _Type __cuda_atomic_fetch_update(_Type* __ptr, const _Fn& __op, _Or
|
|
|
147
148
|
}
|
|
148
149
|
}
|
|
149
150
|
|
|
151
|
+
template <class _Type, template <class> class _Op>
|
|
152
|
+
struct __cuda_atomic_op_bind
|
|
153
|
+
{
|
|
154
|
+
_Type __val;
|
|
155
|
+
|
|
156
|
+
[[nodiscard]] _CCCL_DEVICE _Type operator()(_Type __old) const
|
|
157
|
+
{
|
|
158
|
+
return _Op<_Type>{}(__val, __old);
|
|
159
|
+
}
|
|
160
|
+
};
|
|
161
|
+
|
|
162
|
+
template <class _Type>
|
|
163
|
+
struct __cuda_atomic_op_store
|
|
164
|
+
{
|
|
165
|
+
// Just return first value
|
|
166
|
+
[[nodiscard]] _CCCL_DEVICE _Type operator()(_Type __val, _Type) const
|
|
167
|
+
{
|
|
168
|
+
return __val;
|
|
169
|
+
}
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
template <class _Type>
|
|
173
|
+
struct __cuda_atomic_op_fetch_min
|
|
174
|
+
{
|
|
175
|
+
[[nodiscard]] _CCCL_DEVICE _Type operator()(_Type __op, _Type __old) const
|
|
176
|
+
{
|
|
177
|
+
return __op < __old ? __op : __old;
|
|
178
|
+
}
|
|
179
|
+
};
|
|
180
|
+
|
|
181
|
+
template <class _Type>
|
|
182
|
+
struct __cuda_atomic_op_fetch_max
|
|
183
|
+
{
|
|
184
|
+
[[nodiscard]] _CCCL_DEVICE _Type operator()(_Type __op, _Type __old) const
|
|
185
|
+
{
|
|
186
|
+
return __old < __op ? __op : __old;
|
|
187
|
+
}
|
|
188
|
+
};
|
|
189
|
+
|
|
150
190
|
// Optimized fetch_update CAS loop with op determined after first load reducing waste.
|
|
151
191
|
template <class _Type,
|
|
152
192
|
class _Fn,
|
|
@@ -179,9 +219,7 @@ __cuda_atomic_store(_Type* __ptr, _Type __val, _Order, _Operand, _Sco, __atomic_
|
|
|
179
219
|
// Store requires cas on 8/16b types
|
|
180
220
|
__cuda_atomic_fetch_update(
|
|
181
221
|
__ptr,
|
|
182
|
-
|
|
183
|
-
return __val;
|
|
184
|
-
},
|
|
222
|
+
__cuda_atomic_op_bind<_Type, ::cuda::std::__cuda_atomic_op_store>{__val},
|
|
185
223
|
_Order{},
|
|
186
224
|
__atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
|
|
187
225
|
_Sco{});
|
|
@@ -192,9 +230,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_add(_Type* __ptr, _Type& __d
|
|
|
192
230
|
{
|
|
193
231
|
__dst = __cuda_atomic_fetch_update(
|
|
194
232
|
__ptr,
|
|
195
|
-
|
|
196
|
-
return __old + __op;
|
|
197
|
-
},
|
|
233
|
+
__cuda_atomic_op_bind<_Type, ::cuda::std::plus>{__op},
|
|
198
234
|
_Order{},
|
|
199
235
|
__atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
|
|
200
236
|
_Sco{});
|
|
@@ -205,9 +241,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_and(_Type* __ptr, _Type& __d
|
|
|
205
241
|
{
|
|
206
242
|
__dst = __cuda_atomic_fetch_update(
|
|
207
243
|
__ptr,
|
|
208
|
-
|
|
209
|
-
return __old & __op;
|
|
210
|
-
},
|
|
244
|
+
__cuda_atomic_op_bind<_Type, ::cuda::std::bit_and>{__op},
|
|
211
245
|
_Order{},
|
|
212
246
|
__atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
|
|
213
247
|
_Sco{});
|
|
@@ -218,9 +252,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_xor(_Type* __ptr, _Type& __d
|
|
|
218
252
|
{
|
|
219
253
|
__dst = __cuda_atomic_fetch_update(
|
|
220
254
|
__ptr,
|
|
221
|
-
|
|
222
|
-
return __old ^ __op;
|
|
223
|
-
},
|
|
255
|
+
__cuda_atomic_op_bind<_Type, ::cuda::std::bit_xor>{__op},
|
|
224
256
|
_Order{},
|
|
225
257
|
__atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
|
|
226
258
|
_Sco{});
|
|
@@ -231,9 +263,7 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_or(_Type* __ptr, _Type& __ds
|
|
|
231
263
|
{
|
|
232
264
|
__dst = __cuda_atomic_fetch_update(
|
|
233
265
|
__ptr,
|
|
234
|
-
|
|
235
|
-
return __old | __op;
|
|
236
|
-
},
|
|
266
|
+
__cuda_atomic_op_bind<_Type, ::cuda::std::bit_or>{__op},
|
|
237
267
|
_Order{},
|
|
238
268
|
__atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
|
|
239
269
|
_Sco{});
|
|
@@ -244,21 +274,18 @@ static inline _CCCL_DEVICE void __cuda_atomic_fetch_min(_Type* __ptr, _Type& __d
|
|
|
244
274
|
{
|
|
245
275
|
__dst = __cuda_atomic_fetch_update(
|
|
246
276
|
__ptr,
|
|
247
|
-
|
|
248
|
-
return __op < __old ? __op : __old;
|
|
249
|
-
},
|
|
277
|
+
__cuda_atomic_op_bind<_Type, __cuda_atomic_op_fetch_min>{__op},
|
|
250
278
|
_Order{},
|
|
251
279
|
__atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
|
|
252
280
|
_Sco{});
|
|
253
281
|
}
|
|
282
|
+
|
|
254
283
|
template <class _Type, class _Order, class _Operand, class _Sco, __cuda_atomic_enable_non_native_arithmetic<_Operand> = 0>
|
|
255
284
|
static inline _CCCL_DEVICE void __cuda_atomic_fetch_max(_Type* __ptr, _Type& __dst, _Type __op, _Order, _Operand, _Sco)
|
|
256
285
|
{
|
|
257
286
|
__dst = __cuda_atomic_fetch_update(
|
|
258
287
|
__ptr,
|
|
259
|
-
|
|
260
|
-
return __old < __op ? __op : __old;
|
|
261
|
-
},
|
|
288
|
+
__cuda_atomic_op_bind<_Type, __cuda_atomic_op_fetch_max>{__op},
|
|
262
289
|
_Order{},
|
|
263
290
|
__atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
|
|
264
291
|
_Sco{});
|
|
@@ -269,16 +296,15 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(_Type* __ptr, _Type& __ds
|
|
|
269
296
|
{
|
|
270
297
|
__dst = __cuda_atomic_fetch_update(
|
|
271
298
|
__ptr,
|
|
272
|
-
|
|
273
|
-
return __op;
|
|
274
|
-
},
|
|
299
|
+
__cuda_atomic_op_bind<_Type, ::cuda::std::__cuda_atomic_op_store>{__op},
|
|
275
300
|
_Order{},
|
|
276
301
|
__atomic_cuda_operand_tag<__atomic_cuda_operand::_b, _Operand::__size>{},
|
|
277
302
|
_Sco{});
|
|
278
303
|
}
|
|
279
304
|
|
|
280
305
|
template <typename _Tp, typename _Fn, typename _Sco>
|
|
281
|
-
|
|
306
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp
|
|
307
|
+
__atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __memorder, _Sco)
|
|
282
308
|
{
|
|
283
309
|
_Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{});
|
|
284
310
|
_Tp __desired = __op(__expected);
|
|
@@ -289,7 +315,8 @@ _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp* __ptr, const _Fn& __op, int __m
|
|
|
289
315
|
return __expected;
|
|
290
316
|
}
|
|
291
317
|
template <typename _Tp, typename _Fn, typename _Sco>
|
|
292
|
-
|
|
318
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp
|
|
319
|
+
__atomic_fetch_update_cuda(_Tp volatile* __ptr, const _Fn& __op, int __memorder, _Sco)
|
|
293
320
|
{
|
|
294
321
|
_Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{});
|
|
295
322
|
_Tp __desired = __op(__expected);
|
|
@@ -301,14 +328,14 @@ _CCCL_DEVICE _Tp __atomic_fetch_update_cuda(_Tp volatile* __ptr, const _Fn& __op
|
|
|
301
328
|
}
|
|
302
329
|
|
|
303
330
|
template <typename _Tp, typename _Sco>
|
|
304
|
-
_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp* __ptr, int __memorder, _Sco)
|
|
331
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp* __ptr, int __memorder, _Sco)
|
|
305
332
|
{
|
|
306
333
|
_Tp __ret;
|
|
307
334
|
__atomic_load_cuda(__ptr, __ret, __memorder, _Sco{});
|
|
308
335
|
return __ret;
|
|
309
336
|
}
|
|
310
337
|
template <typename _Tp, typename _Sco>
|
|
311
|
-
_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco)
|
|
338
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco)
|
|
312
339
|
{
|
|
313
340
|
_Tp __ret;
|
|
314
341
|
__atomic_load_cuda(__ptr, __ret, __memorder, _Sco{});
|
|
@@ -327,64 +354,94 @@ _CCCL_DEVICE void __atomic_store_n_cuda(_Tp volatile* __ptr, _Tp __val, int __me
|
|
|
327
354
|
}
|
|
328
355
|
|
|
329
356
|
template <typename _Tp, typename _Sco>
|
|
330
|
-
_CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco)
|
|
357
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp* __ptr, _Tp __val, int __memorder, _Sco)
|
|
331
358
|
{
|
|
332
359
|
_Tp __ret;
|
|
333
360
|
__atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{});
|
|
334
361
|
return __ret;
|
|
335
362
|
}
|
|
336
363
|
template <typename _Tp, typename _Sco>
|
|
337
|
-
|
|
364
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp
|
|
365
|
+
__atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco)
|
|
338
366
|
{
|
|
339
367
|
_Tp __ret;
|
|
340
368
|
__atomic_exchange_cuda(__ptr, __ret, __val, __memorder, _Sco{});
|
|
341
369
|
return __ret;
|
|
342
370
|
}
|
|
343
371
|
|
|
372
|
+
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_arithmetic<_Tp> = 0>
|
|
373
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
374
|
+
{
|
|
375
|
+
return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::plus>{__val}, __memorder, _Sco{});
|
|
376
|
+
}
|
|
377
|
+
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_arithmetic<_Tp> = 0>
|
|
378
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp
|
|
379
|
+
__atomic_fetch_add_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
380
|
+
{
|
|
381
|
+
return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::plus>{__val}, __memorder, _Sco{});
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
|
|
385
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
386
|
+
{
|
|
387
|
+
return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_and>{__val}, __memorder, _Sco{});
|
|
388
|
+
}
|
|
389
|
+
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
|
|
390
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp
|
|
391
|
+
__atomic_fetch_and_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
392
|
+
{
|
|
393
|
+
return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_and>{__val}, __memorder, _Sco{});
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
|
|
397
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
398
|
+
{
|
|
399
|
+
return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_or>{__val}, __memorder, _Sco{});
|
|
400
|
+
}
|
|
401
|
+
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
|
|
402
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_or_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
403
|
+
{
|
|
404
|
+
return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_or>{__val}, __memorder, _Sco{});
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
|
|
408
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
409
|
+
{
|
|
410
|
+
return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_xor>{__val}, __memorder, _Sco{});
|
|
411
|
+
}
|
|
412
|
+
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_bitwise<_Tp> = 0>
|
|
413
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp
|
|
414
|
+
__atomic_fetch_xor_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
415
|
+
{
|
|
416
|
+
return __atomic_fetch_update_cuda(__ptr, __cuda_atomic_op_bind<_Tp, ::cuda::std::bit_xor>{__val}, __memorder, _Sco{});
|
|
417
|
+
}
|
|
418
|
+
|
|
344
419
|
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_minmax<_Tp> = 0>
|
|
345
|
-
_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
420
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
346
421
|
{
|
|
347
422
|
return __atomic_fetch_update_cuda(
|
|
348
|
-
__ptr,
|
|
349
|
-
[__val](_Tp __old) {
|
|
350
|
-
return __val < __old ? __val : __old;
|
|
351
|
-
},
|
|
352
|
-
__memorder,
|
|
353
|
-
_Sco{});
|
|
423
|
+
__ptr, __cuda_atomic_op_bind<_Tp, __cuda_atomic_op_fetch_min>{__val}, __memorder, _Sco{});
|
|
354
424
|
}
|
|
355
425
|
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_minmax<_Tp> = 0>
|
|
356
|
-
|
|
426
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp
|
|
427
|
+
__atomic_fetch_min_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
357
428
|
{
|
|
358
429
|
return __atomic_fetch_update_cuda(
|
|
359
|
-
__ptr,
|
|
360
|
-
[__val](_Tp __old) {
|
|
361
|
-
return __val < __old ? __val : __old;
|
|
362
|
-
},
|
|
363
|
-
__memorder,
|
|
364
|
-
_Sco{});
|
|
430
|
+
__ptr, __cuda_atomic_op_bind<_Tp, __cuda_atomic_op_fetch_min>{__val}, __memorder, _Sco{});
|
|
365
431
|
}
|
|
366
432
|
|
|
367
433
|
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_minmax<_Tp> = 0>
|
|
368
|
-
_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
434
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
369
435
|
{
|
|
370
436
|
return __atomic_fetch_update_cuda(
|
|
371
|
-
__ptr,
|
|
372
|
-
[__val](_Tp __old) {
|
|
373
|
-
return __old < __val ? __val : __old;
|
|
374
|
-
},
|
|
375
|
-
__memorder,
|
|
376
|
-
_Sco{});
|
|
437
|
+
__ptr, __cuda_atomic_op_bind<_Tp, __cuda_atomic_op_fetch_max>{__val}, __memorder, _Sco{});
|
|
377
438
|
}
|
|
378
439
|
template <typename _Tp, typename _Up, typename _Sco, __atomic_enable_if_not_native_minmax<_Tp> = 0>
|
|
379
|
-
|
|
440
|
+
[[nodiscard]] static inline _CCCL_DEVICE _Tp
|
|
441
|
+
__atomic_fetch_max_cuda(volatile _Tp* __ptr, _Up __val, int __memorder, _Sco)
|
|
380
442
|
{
|
|
381
443
|
return __atomic_fetch_update_cuda(
|
|
382
|
-
__ptr,
|
|
383
|
-
[__val](_Tp __old) {
|
|
384
|
-
return __old < __val ? __val : __old;
|
|
385
|
-
},
|
|
386
|
-
__memorder,
|
|
387
|
-
_Sco{});
|
|
444
|
+
__ptr, __cuda_atomic_op_bind<_Tp, __cuda_atomic_op_fetch_max>{__val}, __memorder, _Sco{});
|
|
388
445
|
}
|
|
389
446
|
|
|
390
447
|
_CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int)
|