cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
- cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
- cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
- cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
- cuda/cccl/headers/include/cub/config.cuh +2 -26
- cuda/cccl/headers/include/cub/cub.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
- cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
- cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
- cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
- cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
- cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
- cuda/cccl/headers/include/cub/util_device.cuh +18 -59
- cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
- cuda/cccl/headers/include/cub/util_math.cuh +2 -28
- cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
- cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
- cuda/cccl/headers/include/cub/util_type.cuh +5 -32
- cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
- cuda/cccl/headers/include/cub/version.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
- cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
- cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
- cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
- cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
- cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
- cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
- cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
- cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
- cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
- cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
- cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
- cuda/cccl/headers/include/cuda/barrier +42 -16
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/memory_resource +6 -1
- cuda/cccl/headers/include/cuda/numeric +2 -0
- cuda/cccl/headers/include/cuda/pipeline +3 -2
- cuda/cccl/headers/include/cuda/ptx +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
- cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
- cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
- cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
- cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
- cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
- cuda/cccl/headers/include/cuda/std/__new_ +1 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
- cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
- cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
- cuda/cccl/headers/include/cuda/std/__random_ +2 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
- cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
- cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
- cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
- cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
- cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
- cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
- cuda/cccl/headers/include/cuda/std/array +1 -1
- cuda/cccl/headers/include/cuda/std/atomic +1 -1
- cuda/cccl/headers/include/cuda/std/bitset +2 -10
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
- cuda/cccl/headers/include/cuda/std/functional +1 -1
- cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
- cuda/cccl/headers/include/cuda/std/iterator +1 -1
- cuda/cccl/headers/include/cuda/std/numbers +0 -2
- cuda/cccl/headers/include/cuda/std/ratio +2 -2
- cuda/cccl/headers/include/cuda/std/span +2 -2
- cuda/cccl/headers/include/cuda/std/string_view +24 -42
- cuda/cccl/headers/include/cuda/std/tuple +18 -1
- cuda/cccl/headers/include/cuda/std/type_traits +0 -1
- cuda/cccl/headers/include/cuda/std/variant +8 -1
- cuda/cccl/headers/include/nv/target +2 -6
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
- cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
- cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
- cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
- cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
- cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
- cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
- cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
- cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
- cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
- cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
- cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
- cuda/cccl/headers/include/thrust/functional.h +0 -2
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/mr/new.h +0 -2
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
- cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
- cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
- cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
- cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
- cuda/cccl/headers/include/thrust/random.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
- cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
- cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
- cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
- cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
- cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
- cuda/cccl/headers/include/thrust/transform.h +14 -3
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
- cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
- cuda/cccl/headers/include/thrust/zip_function.h +2 -28
- cuda/compute/__init__.py +4 -0
- cuda/compute/_bindings.pyi +26 -3
- cuda/compute/_bindings_impl.pyx +143 -1
- cuda/compute/algorithms/__init__.py +9 -5
- cuda/compute/algorithms/_sort/__init__.py +23 -0
- cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
- cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
- cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
- cuda/compute/algorithms/_sort/_sort_common.py +52 -0
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda_cccl-0.3.4.dist-info/METADATA +78 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
- cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
- cuda_cccl-0.3.2.dist-info/METADATA +0 -42
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,578 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___MEMORY_RESOURCE_MEMORY_RESOURCE_BASE_H
|
|
12
|
+
#define _CUDA___MEMORY_RESOURCE_MEMORY_RESOURCE_BASE_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#if _CCCL_CUDA_COMPILER(CLANG)
|
|
25
|
+
# include <cuda_runtime.h>
|
|
26
|
+
# include <cuda_runtime_api.h>
|
|
27
|
+
#endif // _CCCL_CUDA_COMPILER(CLANG)
|
|
28
|
+
|
|
29
|
+
#include <cuda/__device/attributes.h>
|
|
30
|
+
#include <cuda/__device/device_ref.h>
|
|
31
|
+
#include <cuda/__memory_resource/any_resource.h>
|
|
32
|
+
#include <cuda/__memory_resource/properties.h>
|
|
33
|
+
#include <cuda/__runtime/api_wrapper.h>
|
|
34
|
+
#include <cuda/__stream/internal_streams.h>
|
|
35
|
+
#include <cuda/__stream/stream.h>
|
|
36
|
+
#include <cuda/__stream/stream_ref.h>
|
|
37
|
+
#include <cuda/std/__concepts/concept_macros.h>
|
|
38
|
+
#include <cuda/std/cstddef>
|
|
39
|
+
|
|
40
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
41
|
+
|
|
42
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
43
|
+
|
|
44
|
+
enum class __pool_attr_settable : bool
|
|
45
|
+
{
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
template <::cudaMemPoolAttr _Attr, typename _Type, __pool_attr_settable _Settable>
|
|
49
|
+
struct __pool_attr_impl
|
|
50
|
+
{
|
|
51
|
+
using type = _Type;
|
|
52
|
+
|
|
53
|
+
[[nodiscard]] _CCCL_HOST_API constexpr operator ::cudaMemPoolAttr() const noexcept
|
|
54
|
+
{
|
|
55
|
+
return _Attr;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
[[nodiscard]] _CCCL_HOST_API type operator()(::cudaMemPool_t __pool) const
|
|
59
|
+
{
|
|
60
|
+
size_t __value = ::cuda::__driver::__mempoolGetAttribute(__pool, static_cast<::CUmemPool_attribute>(_Attr));
|
|
61
|
+
return static_cast<type>(__value);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
static void set(::cudaMemPool_t __pool, type __value)
|
|
65
|
+
{
|
|
66
|
+
size_t __value_copy = __value;
|
|
67
|
+
if constexpr (_Settable == __pool_attr_settable{true})
|
|
68
|
+
{
|
|
69
|
+
::cuda::__driver::__mempoolSetAttribute(__pool, static_cast<::CUmemPool_attribute>(_Attr), &__value_copy);
|
|
70
|
+
}
|
|
71
|
+
else
|
|
72
|
+
{
|
|
73
|
+
::cuda::std::__throw_invalid_argument("This attribute can't be set");
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
template <::cudaMemPoolAttr _Attr>
|
|
79
|
+
struct __pool_attr : __pool_attr_impl<_Attr, size_t, __pool_attr_settable{true}>
|
|
80
|
+
{};
|
|
81
|
+
|
|
82
|
+
template <>
|
|
83
|
+
struct __pool_attr<::cudaMemPoolReuseFollowEventDependencies>
|
|
84
|
+
: __pool_attr_impl<::cudaMemPoolReuseFollowEventDependencies, bool, __pool_attr_settable{true}>
|
|
85
|
+
{};
|
|
86
|
+
|
|
87
|
+
template <>
|
|
88
|
+
struct __pool_attr<::cudaMemPoolReuseAllowOpportunistic>
|
|
89
|
+
: __pool_attr_impl<::cudaMemPoolReuseAllowOpportunistic, bool, __pool_attr_settable{true}>
|
|
90
|
+
{};
|
|
91
|
+
|
|
92
|
+
template <>
|
|
93
|
+
struct __pool_attr<::cudaMemPoolReuseAllowInternalDependencies>
|
|
94
|
+
: __pool_attr_impl<::cudaMemPoolReuseAllowInternalDependencies, bool, __pool_attr_settable{true}>
|
|
95
|
+
{};
|
|
96
|
+
|
|
97
|
+
template <>
|
|
98
|
+
struct __pool_attr<::cudaMemPoolAttrReservedMemCurrent>
|
|
99
|
+
: __pool_attr_impl<::cudaMemPoolAttrReservedMemCurrent, size_t, __pool_attr_settable{false}>
|
|
100
|
+
{};
|
|
101
|
+
|
|
102
|
+
template <>
|
|
103
|
+
struct __pool_attr<::cudaMemPoolAttrUsedMemCurrent>
|
|
104
|
+
: __pool_attr_impl<::cudaMemPoolAttrUsedMemCurrent, size_t, __pool_attr_settable{false}>
|
|
105
|
+
{};
|
|
106
|
+
|
|
107
|
+
inline void __set_attribute_non_zero_only(::cudaMemPool_t __pool, ::CUmemPool_attribute __attr, size_t __value)
|
|
108
|
+
{
|
|
109
|
+
if (__value != 0)
|
|
110
|
+
{
|
|
111
|
+
::cuda::std::__throw_invalid_argument("This attribute can't be set to a non-zero value.");
|
|
112
|
+
}
|
|
113
|
+
::cuda::__driver::__mempoolSetAttribute(__pool, __attr, &__value);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
template <>
|
|
117
|
+
struct __pool_attr<::cudaMemPoolAttrReservedMemHigh>
|
|
118
|
+
: __pool_attr_impl<::cudaMemPoolAttrReservedMemHigh, size_t, __pool_attr_settable{true}>
|
|
119
|
+
{
|
|
120
|
+
static void set(::cudaMemPool_t __pool, type __value)
|
|
121
|
+
{
|
|
122
|
+
::cuda::__set_attribute_non_zero_only(__pool, ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, __value);
|
|
123
|
+
}
|
|
124
|
+
};
|
|
125
|
+
|
|
126
|
+
template <>
|
|
127
|
+
struct __pool_attr<::cudaMemPoolAttrUsedMemHigh>
|
|
128
|
+
: __pool_attr_impl<::cudaMemPoolAttrUsedMemHigh, size_t, __pool_attr_settable{true}>
|
|
129
|
+
{
|
|
130
|
+
static void set(::cudaMemPool_t __pool, type __value)
|
|
131
|
+
{
|
|
132
|
+
::cuda::__set_attribute_non_zero_only(__pool, ::CU_MEMPOOL_ATTR_USED_MEM_HIGH, __value);
|
|
133
|
+
}
|
|
134
|
+
};
|
|
135
|
+
|
|
136
|
+
namespace memory_pool_attributes
|
|
137
|
+
{
|
|
138
|
+
// The threshold at which the pool will release memory.
|
|
139
|
+
using release_threshold_t = __pool_attr<::cudaMemPoolAttrReleaseThreshold>;
|
|
140
|
+
static constexpr release_threshold_t release_threshold{};
|
|
141
|
+
|
|
142
|
+
// Allow the pool to reuse the memory across streams as long as there is a stream ordering dependency between the
|
|
143
|
+
// streams.
|
|
144
|
+
using reuse_follow_event_dependencies_t = __pool_attr<::cudaMemPoolReuseFollowEventDependencies>;
|
|
145
|
+
static constexpr reuse_follow_event_dependencies_t reuse_follow_event_dependencies{};
|
|
146
|
+
|
|
147
|
+
// Allow the pool to reuse already completed frees when there is no dependency between the streams.
|
|
148
|
+
using reuse_allow_opportunistic_t = __pool_attr<::cudaMemPoolReuseAllowOpportunistic>;
|
|
149
|
+
static constexpr reuse_allow_opportunistic_t reuse_allow_opportunistic{};
|
|
150
|
+
|
|
151
|
+
// Allow the pool to insert stream dependencies to reuse the memory across streams.
|
|
152
|
+
using reuse_allow_internal_dependencies_t = __pool_attr<::cudaMemPoolReuseAllowInternalDependencies>;
|
|
153
|
+
static constexpr reuse_allow_internal_dependencies_t reuse_allow_internal_dependencies{};
|
|
154
|
+
|
|
155
|
+
// The current amount of memory reserved in the pool.
|
|
156
|
+
using reserved_mem_current_t = __pool_attr<::cudaMemPoolAttrReservedMemCurrent>;
|
|
157
|
+
static constexpr reserved_mem_current_t reserved_mem_current{};
|
|
158
|
+
|
|
159
|
+
// The high water mark for the reserved memory in the pool.
|
|
160
|
+
using reserved_mem_high_t = __pool_attr<::cudaMemPoolAttrReservedMemHigh>;
|
|
161
|
+
static constexpr reserved_mem_high_t reserved_mem_high{};
|
|
162
|
+
|
|
163
|
+
// The current amount of memory used in the pool.
|
|
164
|
+
using used_mem_current_t = __pool_attr<::cudaMemPoolAttrUsedMemCurrent>;
|
|
165
|
+
static constexpr used_mem_current_t used_mem_current{};
|
|
166
|
+
|
|
167
|
+
// The high water mark for the used memory in the pool.
|
|
168
|
+
using used_mem_high_t = __pool_attr<::cudaMemPoolAttrUsedMemHigh>;
|
|
169
|
+
static constexpr used_mem_high_t used_mem_high{};
|
|
170
|
+
}; // namespace memory_pool_attributes
|
|
171
|
+
|
|
172
|
+
//! @brief Checks whether the current device supports stream-ordered allocations.
|
|
173
|
+
//! @param __device The device for which to query support.
|
|
174
|
+
//! @throws cuda_error if \c cudaDeviceGetAttribute failed.
|
|
175
|
+
//! @returns true if \c cudaDevAttrMemoryPoolsSupported is not zero.
|
|
176
|
+
inline void __verify_device_supports_stream_ordered_allocations(const device_ref __device)
|
|
177
|
+
{
|
|
178
|
+
if (!__device.attribute(::cuda::device_attributes::memory_pools_supported))
|
|
179
|
+
{
|
|
180
|
+
::cuda::__throw_cuda_error(
|
|
181
|
+
::cudaErrorNotSupported, "stream-ordered allocations are not supported on the given device");
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
//! @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present
|
|
186
|
+
//! CUDA driver/runtime version.
|
|
187
|
+
//! @param __device The device to check for support.
|
|
188
|
+
//! @param __handle_type An IPC export handle type to check for support.
|
|
189
|
+
//! @throws cuda_error if the specified `cudaMemAllocationHandleType` is not supported on the specified device.
|
|
190
|
+
inline void __verify_device_supports_export_handle_type(
|
|
191
|
+
const device_ref __device, ::cudaMemAllocationHandleType __handle_type, ::CUmemLocation __location)
|
|
192
|
+
{
|
|
193
|
+
if (__handle_type == ::cudaMemAllocationHandleType::cudaMemHandleTypeNone)
|
|
194
|
+
{
|
|
195
|
+
return;
|
|
196
|
+
}
|
|
197
|
+
if (__location.type != ::CU_MEM_LOCATION_TYPE_DEVICE
|
|
198
|
+
#if _CCCL_CTK_AT_LEAST(12, 6)
|
|
199
|
+
&& __location.type != ::CU_MEM_LOCATION_TYPE_HOST_NUMA
|
|
200
|
+
#endif
|
|
201
|
+
)
|
|
202
|
+
{
|
|
203
|
+
::cuda::__throw_cuda_error(
|
|
204
|
+
::cudaErrorNotSupported, "Requested IPC memory handle type not supported for the given location");
|
|
205
|
+
}
|
|
206
|
+
auto __supported_handles = __device.attribute(::cuda::device_attributes::memory_pool_supported_handle_types);
|
|
207
|
+
if ((static_cast<int>(__handle_type) & __supported_handles) != static_cast<int>(__handle_type))
|
|
208
|
+
{
|
|
209
|
+
::cuda::__throw_cuda_error(
|
|
210
|
+
::cudaErrorNotSupported, "Requested IPC memory handle type not supported on a given device");
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
//! @brief Enable access to this memory pool from the supplied devices
|
|
215
|
+
//!
|
|
216
|
+
//! Device on which this pool resides can be included in the span.
|
|
217
|
+
//!
|
|
218
|
+
//! @param __pool The memory pool to set access for
|
|
219
|
+
//! @param __devices A span of `device_ref`s listing devices to enable access for
|
|
220
|
+
//! @param __flags The access flags to set
|
|
221
|
+
//! @throws cuda_error if ``cudaMemPoolSetAccess`` fails.
|
|
222
|
+
inline void
|
|
223
|
+
__mempool_set_access(::CUmemoryPool __pool, ::cuda::std::span<const device_ref> __devices, ::CUmemAccess_flags __flags)
|
|
224
|
+
{
|
|
225
|
+
::std::vector<::CUmemAccessDesc> __descs;
|
|
226
|
+
__descs.reserve(__devices.size());
|
|
227
|
+
for (size_t __i = 0; __i < __devices.size(); ++__i)
|
|
228
|
+
{
|
|
229
|
+
__descs.push_back({::CUmemLocation{::CU_MEM_LOCATION_TYPE_DEVICE, __devices[__i].get()}, __flags});
|
|
230
|
+
}
|
|
231
|
+
::cuda::__driver::__mempoolSetAccess(__pool, __descs.data(), __descs.size());
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
//! @brief Query if memory from a pool is accessible by the supplied device
|
|
235
|
+
//!
|
|
236
|
+
//! @param __pool The memory pool to query access for
|
|
237
|
+
//! @param __dev The device to query access for
|
|
238
|
+
//! @returns true if the memory pool is accessible from the device
|
|
239
|
+
[[nodiscard]] inline bool __mempool_get_access(::cudaMemPool_t __pool, device_ref __dev)
|
|
240
|
+
{
|
|
241
|
+
::CUmemAccess_flags __result;
|
|
242
|
+
::CUmemLocation __loc;
|
|
243
|
+
__loc.type = ::CU_MEM_LOCATION_TYPE_DEVICE;
|
|
244
|
+
__loc.id = __dev.get();
|
|
245
|
+
__result = ::cuda::__driver::__mempoolGetAccess(__pool, &__loc);
|
|
246
|
+
return __result == ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
//! @brief \c memory_pool_properties is a type that can controls memory pool to control the creation options.
|
|
250
|
+
//! Compared to attributes, properties can not be set after the pool is created.
|
|
251
|
+
struct memory_pool_properties
|
|
252
|
+
{
|
|
253
|
+
size_t initial_pool_size = 0;
|
|
254
|
+
size_t release_threshold = ::cuda::std::numeric_limits<size_t>::max();
|
|
255
|
+
cudaMemAllocationHandleType allocation_handle_type = ::cudaMemAllocationHandleType::cudaMemHandleTypeNone;
|
|
256
|
+
size_t max_pool_size = 0;
|
|
257
|
+
};
|
|
258
|
+
|
|
259
|
+
//! @brief Creates the CUDA memory pool from the passed in arguments.
|
|
260
|
+
//! @throws cuda_error If the creation of the CUDA memory pool failed.
|
|
261
|
+
//! @returns The created CUDA memory pool.
|
|
262
|
+
[[nodiscard]] static cudaMemPool_t __create_cuda_mempool(
|
|
263
|
+
memory_pool_properties __properties, ::CUmemLocation __location, CUmemAllocationType __allocation_type)
|
|
264
|
+
{
|
|
265
|
+
::CUmemPoolProps __pool_properties{};
|
|
266
|
+
__pool_properties.allocType = __allocation_type;
|
|
267
|
+
__pool_properties.handleTypes = ::CUmemAllocationHandleType(__properties.allocation_handle_type);
|
|
268
|
+
__pool_properties.location = __location;
|
|
269
|
+
|
|
270
|
+
#if _CCCL_CTK_AT_LEAST(12, 2)
|
|
271
|
+
if (__properties.max_pool_size != 0)
|
|
272
|
+
{
|
|
273
|
+
# if _CCCL_CTK_AT_LEAST(13, 0)
|
|
274
|
+
if (__allocation_type == ::CU_MEM_ALLOCATION_TYPE_MANAGED)
|
|
275
|
+
{
|
|
276
|
+
::cuda::std::__throw_invalid_argument("Max pool size is not supported for managed memory pools");
|
|
277
|
+
}
|
|
278
|
+
# endif // _CCCL_CTK_AT_LEAST(13, 0)
|
|
279
|
+
if (__properties.initial_pool_size > __properties.max_pool_size)
|
|
280
|
+
{
|
|
281
|
+
::cuda::std::__throw_invalid_argument("Initial pool size must be less than the max pool size");
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
__pool_properties.maxSize = __properties.max_pool_size;
|
|
285
|
+
#else
|
|
286
|
+
if (__properties.max_pool_size != 0)
|
|
287
|
+
{
|
|
288
|
+
::cuda::std::__throw_invalid_argument("Max pool size is not supported on this CUDA version");
|
|
289
|
+
}
|
|
290
|
+
#endif // _CCCL_CTK_AT_LEAST(12, 2)
|
|
291
|
+
|
|
292
|
+
if (__properties.initial_pool_size > __properties.release_threshold)
|
|
293
|
+
{
|
|
294
|
+
::cuda::std::__throw_invalid_argument("Initial pool size must be less than the release threshold");
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
::CUmemoryPool __cuda_pool_handle{};
|
|
298
|
+
::cudaError_t __error = ::cuda::__driver::__mempoolCreateNoThrow(&__cuda_pool_handle, &__pool_properties);
|
|
299
|
+
if (__error != ::cudaSuccess)
|
|
300
|
+
{
|
|
301
|
+
// Mempool creation failed, lets try to figure out why
|
|
302
|
+
::cuda::__verify_device_supports_stream_ordered_allocations(__location.id);
|
|
303
|
+
::cuda::__verify_device_supports_export_handle_type(__location.id, __properties.allocation_handle_type, __location);
|
|
304
|
+
|
|
305
|
+
// Could not find the reason, throw a generic error
|
|
306
|
+
::cuda::__throw_cuda_error(__error, "Failed to create a memory pool");
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
::cuda::__driver::__mempoolSetAttribute(
|
|
310
|
+
__cuda_pool_handle, ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &__properties.release_threshold);
|
|
311
|
+
|
|
312
|
+
// allocate the requested initial size to prime the pool.
|
|
313
|
+
// We need to use a new stream so we do not wait on other work
|
|
314
|
+
if (__properties.initial_pool_size != 0)
|
|
315
|
+
{
|
|
316
|
+
::CUdeviceptr __ptr = ::cuda::__driver::__mallocFromPoolAsync(
|
|
317
|
+
__properties.initial_pool_size, __cuda_pool_handle, __cccl_allocation_stream().get());
|
|
318
|
+
if (::cuda::__driver::__freeAsyncNoThrow(__ptr, __cccl_allocation_stream().get()) != ::cudaSuccess)
|
|
319
|
+
{
|
|
320
|
+
::cuda::__throw_cuda_error(::cudaErrorMemoryAllocation, "Failed to allocate initial pool size");
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
return __cuda_pool_handle;
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
class __memory_resource_base
|
|
327
|
+
{
|
|
328
|
+
protected:
|
|
329
|
+
::cudaMemPool_t __pool_;
|
|
330
|
+
|
|
331
|
+
//! @brief Checks whether the passed in alignment is valid.
|
|
332
|
+
//! @param __alignment the alignment to check.
|
|
333
|
+
//! @returns true if \p __alignment is valid.
|
|
334
|
+
[[nodiscard]] static constexpr bool __is_valid_alignment(const size_t __alignment) noexcept
|
|
335
|
+
{
|
|
336
|
+
return __alignment <= ::cuda::mr::default_cuda_malloc_alignment
|
|
337
|
+
&& (::cuda::mr::default_cuda_malloc_alignment % __alignment == 0);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
public:
|
|
341
|
+
__memory_resource_base(::cuda::std::nullptr_t) = delete;
|
|
342
|
+
|
|
343
|
+
//! @brief Constructs the __memory_resource_base from a \c cudaMemPool_t.
|
|
344
|
+
//! @param __pool The \c cudaMemPool_t used to allocate memory.
|
|
345
|
+
_CCCL_HOST_API explicit __memory_resource_base(::cudaMemPool_t __pool) noexcept
|
|
346
|
+
: __pool_(__pool)
|
|
347
|
+
{}
|
|
348
|
+
|
|
349
|
+
//! @brief Allocate device memory of size at least \p __bytes via cudaMallocFromPoolAsync.
|
|
350
|
+
//! @param __bytes The size in bytes of the allocation.
|
|
351
|
+
//! @param __alignment The requested alignment of the allocation.
|
|
352
|
+
//! @throws std::invalid_argument In case of invalid alignment.
|
|
353
|
+
//! @throws cuda::cuda_error If an error code was return by the CUDA API call.
|
|
354
|
+
//! @returns Pointer to the newly allocated memory.
|
|
355
|
+
[[nodiscard]] _CCCL_HOST_API void*
|
|
356
|
+
allocate_sync(const size_t __bytes, const size_t __alignment = ::cuda::mr::default_cuda_malloc_alignment)
|
|
357
|
+
{
|
|
358
|
+
if (!__is_valid_alignment(__alignment))
|
|
359
|
+
{
|
|
360
|
+
::cuda::std::__throw_invalid_argument(
|
|
361
|
+
"Invalid alignment passed to "
|
|
362
|
+
"__memory_resource_base::allocate_sync.");
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
::CUdeviceptr __ptr = ::cuda::__driver::__mallocFromPoolAsync(__bytes, __pool_, __cccl_allocation_stream().get());
|
|
366
|
+
__cccl_allocation_stream().sync();
|
|
367
|
+
return reinterpret_cast<void*>(__ptr);
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
//! @brief deallocate_sync memory pointed to by \p __ptr.
|
|
371
|
+
//! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate_sync`.
|
|
372
|
+
//! @param __bytes The number of bytes that was passed to the allocation call that returned \p __ptr.
|
|
373
|
+
//! @param __alignment The alignment that was passed to the allocation call that returned \p __ptr.
|
|
374
|
+
//! @note The pointer passed to `deallocate_sync` must not be in use in a stream. It is the caller's responsibility to
|
|
375
|
+
//! properly synchronize all relevant streams before calling `deallocate_sync`.
|
|
376
|
+
_CCCL_HOST_API void deallocate_sync(
|
|
377
|
+
void* __ptr,
|
|
378
|
+
const size_t,
|
|
379
|
+
[[maybe_unused]] const size_t __alignment = ::cuda::mr::default_cuda_malloc_alignment) noexcept
|
|
380
|
+
{
|
|
381
|
+
_CCCL_ASSERT(__is_valid_alignment(__alignment),
|
|
382
|
+
"Invalid alignment passed to __memory_resource_base::deallocate_sync.");
|
|
383
|
+
_CCCL_ASSERT_CUDA_API(
|
|
384
|
+
::cuda::__driver::__freeAsyncNoThrow,
|
|
385
|
+
"deallocate failed",
|
|
386
|
+
reinterpret_cast<::CUdeviceptr>(__ptr),
|
|
387
|
+
__cccl_allocation_stream().get());
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
//! @brief Allocate device memory of size at least \p __bytes via `cudaMallocFromPoolAsync`.
|
|
391
|
+
//! @param __bytes The size in bytes of the allocation.
|
|
392
|
+
//! @param __alignment The requested alignment of the allocation.
|
|
393
|
+
//! @param __stream Stream on which to perform allocation.
|
|
394
|
+
//! @throws std::invalid_argument In case of invalid alignment.
|
|
395
|
+
//! @throws cuda::cuda_error If an error code was return by the cuda api call.
|
|
396
|
+
//! @returns Pointer to the newly allocated memory.
|
|
397
|
+
[[nodiscard]] _CCCL_HOST_API void*
|
|
398
|
+
allocate(const ::cuda::stream_ref __stream, const size_t __bytes, const size_t __alignment)
|
|
399
|
+
{
|
|
400
|
+
if (!__is_valid_alignment(__alignment))
|
|
401
|
+
{
|
|
402
|
+
::cuda::std::__throw_invalid_argument(
|
|
403
|
+
"Invalid alignment passed to "
|
|
404
|
+
"__memory_resource_base::allocate.");
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
return allocate(__stream, __bytes);
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
//! @brief Allocate device memory of size at least \p __bytes via cudaMallocFromPoolAsync.
|
|
411
|
+
//! @param __bytes The size in bytes of the allocation.
|
|
412
|
+
//! @param __stream Stream on which to perform allocation.
|
|
413
|
+
//! @throws cuda::cuda_error If an error code was return by the cuda api call.
|
|
414
|
+
//! @returns Pointer to the newly allocated memory.
|
|
415
|
+
[[nodiscard]] _CCCL_HOST_API void* allocate(const ::cuda::stream_ref __stream, const size_t __bytes)
|
|
416
|
+
{
|
|
417
|
+
::CUdeviceptr __ptr = ::cuda::__driver::__mallocFromPoolAsync(__bytes, __pool_, __stream.get());
|
|
418
|
+
return reinterpret_cast<void*>(__ptr);
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
//! @brief Deallocate memory pointed to by \p __ptr.
|
|
422
|
+
//! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`
|
|
423
|
+
//! @param __bytes The number of bytes that was passed to the allocation call that returned \p __ptr.
|
|
424
|
+
//! @param __alignment The alignment that was passed to the allocation call that returned \p __ptr.
|
|
425
|
+
//! @param __stream A stream that has a stream ordering relationship with the stream used in the
|
|
426
|
+
//! <a href="https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html">allocate</a> call
|
|
427
|
+
//! that returned \p __ptr.
|
|
428
|
+
//! @note The pointer passed to `deallocate` must not be in use in a stream other than \p __stream.
|
|
429
|
+
//! It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate`.
|
|
430
|
+
void _CCCL_HOST_API
|
|
431
|
+
deallocate(const ::cuda::stream_ref __stream, void* __ptr, const size_t __bytes, const size_t __alignment) noexcept
|
|
432
|
+
{
|
|
433
|
+
// We need to ensure that the provided alignment matches the minimal provided alignment
|
|
434
|
+
_CCCL_ASSERT(__is_valid_alignment(__alignment), "Invalid alignment passed to __memory_resource_base::deallocate.");
|
|
435
|
+
deallocate(__stream, __ptr, __bytes);
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
//! @brief Tries to release memory.
|
|
439
|
+
//! @param __min_bytes_to_keep the minimal guaranteed size of the pool.
|
|
440
|
+
//! @note If the pool has less than \p __minBytesToKeep reserved, the trim_to operation is a no-op. Otherwise the
|
|
441
|
+
//! pool will be guaranteed to have at least \p __minBytesToKeep bytes reserved after the operation.
|
|
442
|
+
_CCCL_HOST_API void trim_to(const size_t __min_bytes_to_keep)
|
|
443
|
+
{
|
|
444
|
+
::cuda::__driver::__mempoolTrimTo(__pool_, __min_bytes_to_keep);
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
//! @brief Gets the value of an attribute of the pool.
|
|
448
|
+
//! @param __attr the attribute to get.
|
|
449
|
+
//! @return The value of the attribute.
|
|
450
|
+
template <typename _Attr>
|
|
451
|
+
[[nodiscard]] _CCCL_HOST_API auto attribute(_Attr __attr) const
|
|
452
|
+
{
|
|
453
|
+
return __attr(__pool_);
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
//! @brief Gets the value of an attribute of the pool.
|
|
457
|
+
//! @param __attribute the attribute to get.
|
|
458
|
+
//! @return The value of the attribute.
|
|
459
|
+
template <::cudaMemPoolAttr _Attr>
|
|
460
|
+
_CCCL_HOST_API auto attribute() const
|
|
461
|
+
{
|
|
462
|
+
return attribute(__pool_attr<_Attr>());
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
//! @brief Sets an attribute of the pool to a given value.
|
|
466
|
+
//! @param __attribute the attribute to be set.
|
|
467
|
+
//! @param __value the new value of that attribute.
|
|
468
|
+
template <typename _Attr>
|
|
469
|
+
_CCCL_HOST_API void set_attribute(_Attr __attr, typename _Attr::type __value)
|
|
470
|
+
{
|
|
471
|
+
__attr.set(__pool_, __value);
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
//! @brief Sets an attribute of the pool to a given value.
|
|
475
|
+
//! @param __attribute the attribute to be set.
|
|
476
|
+
//! @param __value the new value of that attribute.
|
|
477
|
+
template <::cudaMemPoolAttr _Attr>
|
|
478
|
+
_CCCL_HOST_API void set_attribute(typename __pool_attr<_Attr>::type __value)
|
|
479
|
+
{
|
|
480
|
+
return set_attribute(__pool_attr<_Attr>(), __value);
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
//! @brief Returns the underlying handle to the CUDA memory pool.
|
|
484
|
+
[[nodiscard]] _CCCL_API constexpr cudaMemPool_t get() const noexcept
|
|
485
|
+
{
|
|
486
|
+
return __pool_;
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
//! @brief Deallocate memory pointed to by \p __ptr.
|
|
490
|
+
//! @param __ptr Pointer to be deallocated. Must have been allocated through a call to `allocate`.
|
|
491
|
+
//! @param __bytes The number of bytes that was passed to the allocation call that returned \p __ptr.
|
|
492
|
+
//! @param __stream A stream that has a stream ordering relationship with the stream used in the
|
|
493
|
+
//! <a href="https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html">allocate</a> call
|
|
494
|
+
//! that returned \p __ptr.
|
|
495
|
+
//! @note The pointer passed to `deallocate` must not be in use in a stream other than \p __stream.
|
|
496
|
+
//! It is the caller's responsibility to properly synchronize all relevant streams before calling `deallocate`.
|
|
497
|
+
_CCCL_HOST_API void deallocate(const ::cuda::stream_ref __stream, void* __ptr, size_t) noexcept
|
|
498
|
+
{
|
|
499
|
+
_CCCL_ASSERT_CUDA_API(
|
|
500
|
+
::cuda::__driver::__freeAsyncNoThrow, "deallocate failed", reinterpret_cast<::CUdeviceptr>(__ptr), __stream.get());
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
//! @brief Enable access to memory allocated through this memory resource by the supplied devices
|
|
504
|
+
//!
|
|
505
|
+
//! Access is controlled through the underlying memory pool, so this
|
|
506
|
+
//! setting is shared between all memory resources created from the same pool.
|
|
507
|
+
//! Device on which this resource allocates memory can be included in the span.
|
|
508
|
+
//!
|
|
509
|
+
//! @param __devices A span of `device_ref`s listing devices to enable access for
|
|
510
|
+
_CCCL_HOST_API void enable_access_from(::cuda::std::span<const device_ref> __devices)
|
|
511
|
+
{
|
|
512
|
+
::cuda::__mempool_set_access(__pool_, {__devices.data(), __devices.size()}, ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE);
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
//! @brief Enable access to memory allocated through this memory resource by the supplied device
|
|
516
|
+
//!
|
|
517
|
+
//! Access is controlled through the underlying memory pool, so this
|
|
518
|
+
//! setting is shared between all memory resources created from the same pool.
|
|
519
|
+
//!
|
|
520
|
+
//! @param __device device_ref indicating for which device the access should be enabled
|
|
521
|
+
_CCCL_HOST_API void enable_access_from(device_ref __device)
|
|
522
|
+
{
|
|
523
|
+
::cuda::__mempool_set_access(__pool_, {&__device, 1}, ::CU_MEM_ACCESS_FLAGS_PROT_READWRITE);
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
//! @brief Disable access to memory allocated through this memory resource by the supplied devices
|
|
527
|
+
//!
|
|
528
|
+
//! Access is controlled through the underlying memory pool, so this
|
|
529
|
+
//! setting is shared between all memory resources created from the same pool.
|
|
530
|
+
//! Device on which this resource allocates memory can be included in the span.
|
|
531
|
+
//!
|
|
532
|
+
//! @param __devices A span of `device_ref`s listing devices to disable access for
|
|
533
|
+
_CCCL_HOST_API void disable_access_from(::cuda::std::span<const device_ref> __devices)
|
|
534
|
+
{
|
|
535
|
+
::cuda::__mempool_set_access(__pool_, {__devices.data(), __devices.size()}, ::CU_MEM_ACCESS_FLAGS_PROT_NONE);
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
//! @brief Disable access to memory allocated through this memory resource by the supplied device
|
|
539
|
+
//!
|
|
540
|
+
//! Access is controlled through the underlying memory pool, so this
|
|
541
|
+
//! setting is shared between all memory resources created from the same pool.
|
|
542
|
+
//!
|
|
543
|
+
//! @param __device device_ref indicating for which device the access should be disabled
|
|
544
|
+
_CCCL_HOST_API void disable_access_from(device_ref __device)
|
|
545
|
+
{
|
|
546
|
+
::cuda::__mempool_set_access(__pool_, {&__device, 1}, ::CU_MEM_ACCESS_FLAGS_PROT_NONE);
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
//! @brief Query if memory allocated through this memory resource is accessible by the supplied device
|
|
550
|
+
//!
|
|
551
|
+
//! @param __device device for which the access is queried
|
|
552
|
+
[[nodiscard]] _CCCL_HOST_API bool is_accessible_from(device_ref __device)
|
|
553
|
+
{
|
|
554
|
+
return ::cuda::__mempool_get_access(__pool_, __device);
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
//! @brief Equality comparison with another __memory_resource_base.
|
|
558
|
+
//! @returns true if underlying \c cudaMemPool_t are equal.
|
|
559
|
+
[[nodiscard]] _CCCL_HOST_API bool operator==(__memory_resource_base const& __rhs) const noexcept
|
|
560
|
+
{
|
|
561
|
+
return __pool_ == __rhs.__pool_;
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
#if _CCCL_STD_VER <= 2017
|
|
565
|
+
//! @brief Inequality comparison with another __memory_resource_base.
|
|
566
|
+
//! @returns true if underlying \c cudaMemPool_t are not equal.
|
|
567
|
+
[[nodiscard]] _CCCL_HOST_API bool operator!=(__memory_resource_base const& __rhs) const noexcept
|
|
568
|
+
{
|
|
569
|
+
return __pool_ != __rhs.__pool_;
|
|
570
|
+
}
|
|
571
|
+
#endif // _CCCL_STD_VER <= 2017
|
|
572
|
+
};
|
|
573
|
+
|
|
574
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
575
|
+
|
|
576
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
577
|
+
|
|
578
|
+
#endif // _CUDA___MEMORY_RESOURCE_MEMORY_RESOURCE_BASE_H
|