cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +53 -0
- cuda/cccl/headers/include/cub/cub.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +800 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
- cuda/cccl/headers/include/cub/util_math.cuh +118 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
- cuda/cccl/headers/include/cuda/__cccl_config +37 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +27 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +267 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +37 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +35 -0
- cuda/cccl/headers/include/cuda/memory_resource +35 -0
- cuda/cccl/headers/include/cuda/numeric +29 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +129 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +518 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +154 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +346 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +628 -0
- cuda/cccl/headers/include/cuda/std/string_view +925 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +177 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +31 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +57 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +858 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +395 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +99 -0
- cuda/cccl/headers/include/thrust/partition.h +1391 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1113 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1088 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +79 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +475 -0
- cuda/compute/_bindings_impl.pyx +2273 -0
- cuda/compute/_caching.py +71 -0
- cuda/compute/_cccl_interop.py +422 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +132 -0
- cuda/compute/_utils/temp_storage_buffer.py +86 -0
- cuda/compute/algorithms/__init__.py +54 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_merge_sort.py +225 -0
- cuda/compute/algorithms/_radix_sort.py +312 -0
- cuda/compute/algorithms/_reduce.py +182 -0
- cuda/compute/algorithms/_scan.py +331 -0
- cuda/compute/algorithms/_segmented_reduce.py +257 -0
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/compute/algorithms/_transform.py +329 -0
- cuda/compute/algorithms/_unique_by_key.py +252 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/iterators/__init__.py +21 -0
- cuda/compute/iterators/_factories.py +219 -0
- cuda/compute/iterators/_iterators.py +817 -0
- cuda/compute/iterators/_zip_iterator.py +199 -0
- cuda/compute/numba_utils.py +53 -0
- cuda/compute/op.py +3 -0
- cuda/compute/struct.py +272 -0
- cuda/compute/typing.py +37 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.3.3.dist-info/METADATA +41 -0
- cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
- cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,2811 @@
|
|
|
1
|
+
/******************************************************************************
|
|
2
|
+
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
*
|
|
4
|
+
* Redistribution and use in source and binary forms, with or without
|
|
5
|
+
* modification, are permitted provided that the following conditions are met:
|
|
6
|
+
* * Redistributions of source code must retain the above copyright
|
|
7
|
+
* notice, this list of conditions and the following disclaimer.
|
|
8
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
9
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
10
|
+
* documentation and/or other materials provided with the distribution.
|
|
11
|
+
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
12
|
+
* names of its contributors may be used to endorse or promote products
|
|
13
|
+
* derived from this software without specific prior written permission.
|
|
14
|
+
*
|
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
16
|
+
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
17
|
+
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
18
|
+
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
19
|
+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
20
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
21
|
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
22
|
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
23
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
24
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
25
|
+
*
|
|
26
|
+
******************************************************************************/
|
|
27
|
+
|
|
28
|
+
//! @file
|
|
29
|
+
//! cub::DeviceSegmentedSort provides device-wide, parallel operations for computing a batched sort across multiple,
|
|
30
|
+
//! non-overlapping sequences of data items residing within device-accessible memory.
|
|
31
|
+
|
|
32
|
+
#pragma once
|
|
33
|
+
|
|
34
|
+
#include <cub/config.cuh>
|
|
35
|
+
|
|
36
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
37
|
+
# pragma GCC system_header
|
|
38
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
39
|
+
# pragma clang system_header
|
|
40
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
41
|
+
# pragma system_header
|
|
42
|
+
#endif // no system header
|
|
43
|
+
|
|
44
|
+
#include <cub/detail/choose_offset.cuh>
|
|
45
|
+
#include <cub/device/dispatch/dispatch_segmented_sort.cuh>
|
|
46
|
+
#include <cub/util_namespace.cuh>
|
|
47
|
+
|
|
48
|
+
#include <cuda/std/cstdint>
|
|
49
|
+
|
|
50
|
+
CUB_NAMESPACE_BEGIN
|
|
51
|
+
|
|
52
|
+
//! @rst
|
|
53
|
+
//! DeviceSegmentedSort provides device-wide, parallel operations for
|
|
54
|
+
//! computing a batched sort across multiple, non-overlapping sequences of
|
|
55
|
+
//! data items residing within device-accessible memory.
|
|
56
|
+
//!
|
|
57
|
+
//! Overview
|
|
58
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
59
|
+
//!
|
|
60
|
+
//! The algorithm arranges items into ascending (or descending) order.
|
|
61
|
+
//! The underlying sorting algorithm is undefined. Depending on the segment size,
|
|
62
|
+
//! it might be radix sort, merge sort or something else. Therefore, no
|
|
63
|
+
//! assumptions on the underlying implementation should be made.
|
|
64
|
+
//!
|
|
65
|
+
//! Differences from DeviceSegmentedRadixSort
|
|
66
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
67
|
+
//!
|
|
68
|
+
//! DeviceSegmentedRadixSort is optimized for significantly large segments (tens
|
|
69
|
+
//! of thousands of items and more). Nevertheless, some domains produce a wide
|
|
70
|
+
//! range of segment sizes. DeviceSegmentedSort partitions segments into size
|
|
71
|
+
//! groups and specializes sorting algorithms for each group. This approach leads
|
|
72
|
+
//! to better resource utilization in the presence of segment size imbalance or
|
|
73
|
+
//! moderate segment sizes (up to thousands of items).
|
|
74
|
+
//! This algorithm is more complex and consists of multiple kernels. This fact
|
|
75
|
+
//! leads to longer compilation times as well as larger binary sizes.
|
|
76
|
+
//!
|
|
77
|
+
//! Supported Types
|
|
78
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
79
|
+
//!
|
|
80
|
+
//! The algorithm has to satisfy the underlying algorithms' restrictions. Radix
|
|
81
|
+
//! sort usage restricts the list of supported types. Therefore,
|
|
82
|
+
//! DeviceSegmentedSort can sort all of the built-in C++ numeric primitive types
|
|
83
|
+
//! (``unsigned char``, ``int``, ``double``, etc.) as well as CUDA's ``__half`` and
|
|
84
|
+
//! ``__nv_bfloat16`` 16-bit floating-point types.
|
|
85
|
+
//!
|
|
86
|
+
//! Segments are not required to be contiguous. Any element of input(s) or
|
|
87
|
+
//! output(s) outside the specified segments will not be accessed nor modified.
|
|
88
|
+
//!
|
|
89
|
+
//! A simple example
|
|
90
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
91
|
+
//!
|
|
92
|
+
//! .. code-block:: c++
|
|
93
|
+
//!
|
|
94
|
+
//! #include <cub/cub.cuh>
|
|
95
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
96
|
+
//!
|
|
97
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
98
|
+
//! // for sorting data
|
|
99
|
+
//! int num_items; // e.g., 7
|
|
100
|
+
//! int num_segments; // e.g., 3
|
|
101
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
102
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
103
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
104
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
105
|
+
//! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
|
|
106
|
+
//! ...
|
|
107
|
+
//!
|
|
108
|
+
//! // Determine temporary device storage requirements
|
|
109
|
+
//! void *d_temp_storage = nullptr;
|
|
110
|
+
//! size_t temp_storage_bytes = 0;
|
|
111
|
+
//! cub::DeviceSegmentedSort::SortPairs(
|
|
112
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
113
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
114
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
115
|
+
//!
|
|
116
|
+
//! // Allocate temporary storage
|
|
117
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
118
|
+
//!
|
|
119
|
+
//! // Run sorting operation
|
|
120
|
+
//! cub::DeviceSegmentedSort::SortPairs(
|
|
121
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
122
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
123
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
124
|
+
//!
|
|
125
|
+
//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
|
|
126
|
+
//! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
|
|
127
|
+
//!
|
|
128
|
+
//! @endrst
|
|
129
|
+
struct DeviceSegmentedSort
|
|
130
|
+
{
|
|
131
|
+
private:
|
|
132
|
+
// Name reported for NVTX ranges
|
|
133
|
+
_CCCL_HOST_DEVICE static constexpr auto GetName() -> const char*
|
|
134
|
+
{
|
|
135
|
+
return "cub::DeviceSegmentedSort";
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Internal version without NVTX range
|
|
139
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
140
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX(
|
|
141
|
+
void* d_temp_storage,
|
|
142
|
+
size_t& temp_storage_bytes,
|
|
143
|
+
const KeyT* d_keys_in,
|
|
144
|
+
KeyT* d_keys_out,
|
|
145
|
+
::cuda::std::int64_t num_items,
|
|
146
|
+
::cuda::std::int64_t num_segments,
|
|
147
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
148
|
+
EndOffsetIteratorT d_end_offsets,
|
|
149
|
+
cudaStream_t stream = 0)
|
|
150
|
+
{
|
|
151
|
+
constexpr bool is_overwrite_okay = false;
|
|
152
|
+
|
|
153
|
+
using OffsetT =
|
|
154
|
+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
|
|
155
|
+
using DispatchT =
|
|
156
|
+
DispatchSegmentedSort<SortOrder::Ascending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
157
|
+
|
|
158
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
159
|
+
DoubleBuffer<NullType> d_values;
|
|
160
|
+
|
|
161
|
+
return DispatchT::Dispatch(
|
|
162
|
+
d_temp_storage,
|
|
163
|
+
temp_storage_bytes,
|
|
164
|
+
d_keys,
|
|
165
|
+
d_values,
|
|
166
|
+
num_items,
|
|
167
|
+
num_segments,
|
|
168
|
+
d_begin_offsets,
|
|
169
|
+
d_end_offsets,
|
|
170
|
+
is_overwrite_okay,
|
|
171
|
+
stream);
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
public:
|
|
175
|
+
//! @name Keys-only
|
|
176
|
+
//! @{
|
|
177
|
+
|
|
178
|
+
//! @rst
|
|
179
|
+
//! Sorts segments of keys into ascending order.
|
|
180
|
+
//! Approximately ``num_items + 2 * num_segments`` auxiliary storage required.
|
|
181
|
+
//!
|
|
182
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
183
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
184
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
185
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
186
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
187
|
+
//! - SortKeys is not guaranteed to be stable. That is, suppose that ``i`` and
|
|
188
|
+
//! ``j`` are equivalent: neither one is less than the other. It is not
|
|
189
|
+
//! guaranteed that the relative order of these two elements will be
|
|
190
|
+
//! preserved by sort.
|
|
191
|
+
//! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
|
|
192
|
+
//! ``[d_keys_in, d_keys_in + num_items)``,
|
|
193
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
194
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
195
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
196
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
|
|
197
|
+
//! be accessed nor modified.
|
|
198
|
+
//!
|
|
199
|
+
//! Snippet
|
|
200
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
201
|
+
//!
|
|
202
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
203
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
204
|
+
//!
|
|
205
|
+
//! .. code-block:: c++
|
|
206
|
+
//!
|
|
207
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
208
|
+
//!
|
|
209
|
+
//! // Declare, allocate, and initialize device-accessible
|
|
210
|
+
//! // pointers for sorting data
|
|
211
|
+
//! int num_items; // e.g., 7
|
|
212
|
+
//! int num_segments; // e.g., 3
|
|
213
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
214
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
215
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
216
|
+
//! ...
|
|
217
|
+
//!
|
|
218
|
+
//! // Determine temporary device storage requirements
|
|
219
|
+
//! void *d_temp_storage = nullptr;
|
|
220
|
+
//! size_t temp_storage_bytes = 0;
|
|
221
|
+
//! cub::DeviceSegmentedSort::SortKeys(
|
|
222
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
223
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
224
|
+
//!
|
|
225
|
+
//! // Allocate temporary storage
|
|
226
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
227
|
+
//!
|
|
228
|
+
//! // Run sorting operation
|
|
229
|
+
//! cub::DeviceSegmentedSort::SortKeys(
|
|
230
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
231
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
232
|
+
//!
|
|
233
|
+
//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
|
|
234
|
+
//!
|
|
235
|
+
//! @endrst
|
|
236
|
+
//!
|
|
237
|
+
//! @tparam KeyT
|
|
238
|
+
//! **[inferred]** Key type
|
|
239
|
+
//!
|
|
240
|
+
//! @tparam BeginOffsetIteratorT
|
|
241
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
242
|
+
//! beginning offsets @iterator
|
|
243
|
+
//!
|
|
244
|
+
//! @tparam EndOffsetIteratorT
|
|
245
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
246
|
+
//! ending offsets @iterator
|
|
247
|
+
//!
|
|
248
|
+
//! @param[in] d_temp_storage
|
|
249
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
250
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
251
|
+
//! is done
|
|
252
|
+
//!
|
|
253
|
+
//! @param[in,out] temp_storage_bytes
|
|
254
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
255
|
+
//!
|
|
256
|
+
//! @param[in] d_keys_in
|
|
257
|
+
//! Device-accessible pointer to the input data of key data to sort
|
|
258
|
+
//!
|
|
259
|
+
//! @param[out] d_keys_out
|
|
260
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
261
|
+
//!
|
|
262
|
+
//! @param[in] num_items
|
|
263
|
+
//! The total number of items to sort (across all segments)
|
|
264
|
+
//!
|
|
265
|
+
//! @param[in] num_segments
|
|
266
|
+
//! The number of segments that comprise the sorting data
|
|
267
|
+
//!
|
|
268
|
+
//! @param[in] d_begin_offsets
|
|
269
|
+
//! @rst
|
|
270
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
271
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
272
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
273
|
+
//! @endrst
|
|
274
|
+
//!
|
|
275
|
+
//! @param[in] d_end_offsets
|
|
276
|
+
//! @rst
|
|
277
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
278
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
279
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
280
|
+
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the i-th segment is considered empty.
|
|
281
|
+
//! @endrst
|
|
282
|
+
//!
|
|
283
|
+
//! @param[in] stream
|
|
284
|
+
//! @rst
|
|
285
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
286
|
+
//! @endrst
|
|
287
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
288
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
|
|
289
|
+
void* d_temp_storage,
|
|
290
|
+
size_t& temp_storage_bytes,
|
|
291
|
+
const KeyT* d_keys_in,
|
|
292
|
+
KeyT* d_keys_out,
|
|
293
|
+
::cuda::std::int64_t num_items,
|
|
294
|
+
::cuda::std::int64_t num_segments,
|
|
295
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
296
|
+
EndOffsetIteratorT d_end_offsets,
|
|
297
|
+
cudaStream_t stream = 0)
|
|
298
|
+
{
|
|
299
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
300
|
+
return SortKeysNoNVTX(
|
|
301
|
+
d_temp_storage,
|
|
302
|
+
temp_storage_bytes,
|
|
303
|
+
d_keys_in,
|
|
304
|
+
d_keys_out,
|
|
305
|
+
num_items,
|
|
306
|
+
num_segments,
|
|
307
|
+
d_begin_offsets,
|
|
308
|
+
d_end_offsets,
|
|
309
|
+
stream);
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
private:
|
|
313
|
+
// Internal version without NVTX range
|
|
314
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
315
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX(
|
|
316
|
+
void* d_temp_storage,
|
|
317
|
+
size_t& temp_storage_bytes,
|
|
318
|
+
const KeyT* d_keys_in,
|
|
319
|
+
KeyT* d_keys_out,
|
|
320
|
+
::cuda::std::int64_t num_items,
|
|
321
|
+
::cuda::std::int64_t num_segments,
|
|
322
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
323
|
+
EndOffsetIteratorT d_end_offsets,
|
|
324
|
+
cudaStream_t stream = 0)
|
|
325
|
+
{
|
|
326
|
+
constexpr bool is_overwrite_okay = false;
|
|
327
|
+
|
|
328
|
+
using OffsetT =
|
|
329
|
+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
|
|
330
|
+
using DispatchT =
|
|
331
|
+
DispatchSegmentedSort<SortOrder::Descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
332
|
+
|
|
333
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
334
|
+
DoubleBuffer<NullType> d_values;
|
|
335
|
+
|
|
336
|
+
return DispatchT::Dispatch(
|
|
337
|
+
d_temp_storage,
|
|
338
|
+
temp_storage_bytes,
|
|
339
|
+
d_keys,
|
|
340
|
+
d_values,
|
|
341
|
+
num_items,
|
|
342
|
+
num_segments,
|
|
343
|
+
d_begin_offsets,
|
|
344
|
+
d_end_offsets,
|
|
345
|
+
is_overwrite_okay,
|
|
346
|
+
stream);
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
public:
|
|
350
|
+
//! @rst
|
|
351
|
+
//! Sorts segments of keys into descending order. Approximately
|
|
352
|
+
//! ``num_items + 2 * num_segments`` auxiliary storage required.
|
|
353
|
+
//!
|
|
354
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
355
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
356
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
357
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
358
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
359
|
+
//! - SortKeysDescending is not guaranteed to be stable. That is, suppose that
|
|
360
|
+
//! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
|
|
361
|
+
//! not guaranteed that the relative order of these two elements will be
|
|
362
|
+
//! preserved by sort.
|
|
363
|
+
//! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
|
|
364
|
+
//! ``[d_keys_in, d_keys_in + num_items)``,
|
|
365
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
366
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
367
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
368
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
|
|
369
|
+
//! be accessed nor modified.
|
|
370
|
+
//!
|
|
371
|
+
//! Snippet
|
|
372
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
373
|
+
//!
|
|
374
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
375
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
376
|
+
//!
|
|
377
|
+
//! .. code-block:: c++
|
|
378
|
+
//!
|
|
379
|
+
//! #include <cub/cub.cuh>
|
|
380
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
381
|
+
//!
|
|
382
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
383
|
+
//! // for sorting data
|
|
384
|
+
//! int num_items; // e.g., 7
|
|
385
|
+
//! int num_segments; // e.g., 3
|
|
386
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
387
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
388
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
389
|
+
//! ...
|
|
390
|
+
//!
|
|
391
|
+
//! // Determine temporary device storage requirements
|
|
392
|
+
//! void *d_temp_storage = nullptr;
|
|
393
|
+
//! size_t temp_storage_bytes = 0;
|
|
394
|
+
//! cub::DeviceSegmentedSort::SortKeysDescending(
|
|
395
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
396
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
397
|
+
//!
|
|
398
|
+
//! // Allocate temporary storage
|
|
399
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
400
|
+
//!
|
|
401
|
+
//! // Run sorting operation
|
|
402
|
+
//! cub::DeviceSegmentedSort::SortKeysDescending(
|
|
403
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
404
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
405
|
+
//!
|
|
406
|
+
//! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
|
|
407
|
+
//!
|
|
408
|
+
//! @endrst
|
|
409
|
+
//!
|
|
410
|
+
//! @tparam KeyT
|
|
411
|
+
//! **[inferred]** Key type
|
|
412
|
+
//!
|
|
413
|
+
//! @tparam BeginOffsetIteratorT
|
|
414
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
|
|
415
|
+
//!
|
|
416
|
+
//! @tparam EndOffsetIteratorT
|
|
417
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
|
|
418
|
+
//!
|
|
419
|
+
//! @param[in] d_temp_storage
|
|
420
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
421
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done
|
|
422
|
+
//!
|
|
423
|
+
//! @param[in,out] temp_storage_bytes
|
|
424
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
425
|
+
//!
|
|
426
|
+
//! @param[in] d_keys_in
|
|
427
|
+
//! Device-accessible pointer to the input data of key data to sort
|
|
428
|
+
//!
|
|
429
|
+
//! @param[out] d_keys_out
|
|
430
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
431
|
+
//!
|
|
432
|
+
//! @param[in] num_items
|
|
433
|
+
//! The total number of items to sort (across all segments)
|
|
434
|
+
//!
|
|
435
|
+
//! @param[in] num_segments
|
|
436
|
+
//! The number of segments that comprise the sorting data
|
|
437
|
+
//!
|
|
438
|
+
//! @param[in] d_begin_offsets
|
|
439
|
+
//! @rst
|
|
440
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
441
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
442
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
443
|
+
//! @endrst
|
|
444
|
+
//!
|
|
445
|
+
//! @param[in] d_end_offsets
|
|
446
|
+
//! @rst
|
|
447
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
448
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
449
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
450
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
|
|
451
|
+
//! @endrst
|
|
452
|
+
//!
|
|
453
|
+
//! @param[in] stream
|
|
454
|
+
//! @rst
|
|
455
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
456
|
+
//! @endrst
|
|
457
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
458
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( // public entry point: descending key-only sort with separate input/output key pointers
|
|
459
|
+
void* d_temp_storage,
|
|
460
|
+
size_t& temp_storage_bytes,
|
|
461
|
+
const KeyT* d_keys_in,
|
|
462
|
+
KeyT* d_keys_out,
|
|
463
|
+
::cuda::std::int64_t num_items,
|
|
464
|
+
::cuda::std::int64_t num_segments,
|
|
465
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
466
|
+
EndOffsetIteratorT d_end_offsets,
|
|
467
|
+
cudaStream_t stream = 0)
|
|
468
|
+
{
|
|
469
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro definition
|
|
470
|
+
return SortKeysDescendingNoNVTX( // forwards every argument unchanged, in order, to the NVTX-free implementation
|
|
471
|
+
d_temp_storage,
|
|
472
|
+
temp_storage_bytes,
|
|
473
|
+
d_keys_in,
|
|
474
|
+
d_keys_out,
|
|
475
|
+
num_items,
|
|
476
|
+
num_segments,
|
|
477
|
+
d_begin_offsets,
|
|
478
|
+
d_end_offsets,
|
|
479
|
+
stream);
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
private:
|
|
483
|
+
// Internal ascending key-only sort on a DoubleBuffer, without an NVTX range; the public SortKeys wrapper opens the range and then calls this
|
|
484
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
485
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysNoNVTX(
|
|
486
|
+
void* d_temp_storage,
|
|
487
|
+
size_t& temp_storage_bytes,
|
|
488
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
489
|
+
::cuda::std::int64_t num_items,
|
|
490
|
+
::cuda::std::int64_t num_segments,
|
|
491
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
492
|
+
EndOffsetIteratorT d_end_offsets,
|
|
493
|
+
cudaStream_t stream = 0)
|
|
494
|
+
{
|
|
495
|
+
constexpr bool is_overwrite_okay = true; // passed to Dispatch below; NOTE(review): presumably lets the sort clobber both DoubleBuffer halves - confirm in DispatchSegmentedSort
|
|
496
|
+
using OffsetT =
|
|
497
|
+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>; // NOTE(review): presumably a signed type derived from the common value type of both offset iterators - confirm in detail::
|
|
498
|
+
using DispatchT =
|
|
499
|
+
DispatchSegmentedSort<SortOrder::Ascending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>; // value type is cub::NullType: keys-only sort
|
|
500
|
+
|
|
501
|
+
DoubleBuffer<NullType> d_values; // default-constructed placeholder, since Dispatch takes a values buffer even for keys-only sorts
|
|
502
|
+
|
|
503
|
+
return DispatchT::Dispatch(
|
|
504
|
+
d_temp_storage,
|
|
505
|
+
temp_storage_bytes,
|
|
506
|
+
d_keys,
|
|
507
|
+
d_values,
|
|
508
|
+
num_items,
|
|
509
|
+
num_segments,
|
|
510
|
+
d_begin_offsets,
|
|
511
|
+
d_end_offsets,
|
|
512
|
+
is_overwrite_okay,
|
|
513
|
+
stream);
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
public:
|
|
517
|
+
//! @rst
|
|
518
|
+
//! Sorts segments of keys into ascending order. Approximately ``2 * num_segments`` auxiliary storage required.
|
|
519
|
+
//!
|
|
520
|
+
//! - The sorting operation is given a pair of key buffers managed by a
|
|
521
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
522
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
523
|
+
//! - The contents of both buffers may be altered by the sorting operation.
|
|
524
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
525
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
526
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
527
|
+
//! of key bits and the targeted device architecture).
|
|
528
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
529
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
530
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
531
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
532
|
+
//! - SortKeys is not guaranteed to be stable. That is, suppose that
|
|
533
|
+
//! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
|
|
534
|
+
//! not guaranteed that the relative order of these two elements will be
|
|
535
|
+
//! preserved by sort.
|
|
536
|
+
//! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
|
|
537
|
+
//! The range ``[cur, cur + num_items)`` shall not overlap
|
|
538
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
539
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
540
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
541
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
542
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
543
|
+
//! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
|
|
544
|
+
//!
|
|
545
|
+
//! Snippet
|
|
546
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
547
|
+
//!
|
|
548
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
549
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
550
|
+
//!
|
|
551
|
+
//! .. code-block:: c++
|
|
552
|
+
//!
|
|
553
|
+
//! #include <cub/cub.cuh>
|
|
554
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
555
|
+
//!
|
|
556
|
+
//! // Declare, allocate, and initialize device-accessible
|
|
557
|
+
//! // pointers for sorting data
|
|
558
|
+
//! int num_items; // e.g., 7
|
|
559
|
+
//! int num_segments; // e.g., 3
|
|
560
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
561
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
562
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
563
|
+
//! ...
|
|
564
|
+
//!
|
|
565
|
+
//! // Create a DoubleBuffer to wrap the pair of device pointers
|
|
566
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
567
|
+
//!
|
|
568
|
+
//! // Determine temporary device storage requirements
|
|
569
|
+
//! void *d_temp_storage = nullptr;
|
|
570
|
+
//! size_t temp_storage_bytes = 0;
|
|
571
|
+
//! cub::DeviceSegmentedSort::SortKeys(
|
|
572
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
573
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
574
|
+
//!
|
|
575
|
+
//! // Allocate temporary storage
|
|
576
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
577
|
+
//!
|
|
578
|
+
//! // Run sorting operation
|
|
579
|
+
//! cub::DeviceSegmentedSort::SortKeys(
|
|
580
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
581
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
582
|
+
//!
|
|
583
|
+
//! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
|
|
584
|
+
//!
|
|
585
|
+
//! @endrst
|
|
586
|
+
//!
|
|
587
|
+
//! @tparam KeyT
|
|
588
|
+
//! **[inferred]** Key type
|
|
589
|
+
//!
|
|
590
|
+
//! @tparam BeginOffsetIteratorT
|
|
591
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
592
|
+
//! beginning offsets @iterator
|
|
593
|
+
//!
|
|
594
|
+
//! @tparam EndOffsetIteratorT
|
|
595
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
596
|
+
//! ending offsets @iterator
|
|
597
|
+
//!
|
|
598
|
+
//! @param[in] d_temp_storage
|
|
599
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
600
|
+
//! required allocation size is written to `temp_storage_bytes` and no
|
|
601
|
+
//! work is done
|
|
602
|
+
//!
|
|
603
|
+
//! @param[in,out] temp_storage_bytes
|
|
604
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
605
|
+
//!
|
|
606
|
+
//! @param[in,out] d_keys
|
|
607
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
608
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
609
|
+
//! point to the sorted output keys
|
|
610
|
+
//!
|
|
611
|
+
//! @param[in] num_items
|
|
612
|
+
//! The total number of items to sort (across all segments)
|
|
613
|
+
//!
|
|
614
|
+
//! @param[in] num_segments
|
|
615
|
+
//! The number of segments that comprise the sorting data
|
|
616
|
+
//!
|
|
617
|
+
//! @param[in] d_begin_offsets
|
|
618
|
+
//! @rst
|
|
619
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
620
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
621
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
622
|
+
//! @endrst
|
|
623
|
+
//!
|
|
624
|
+
//! @param[in] d_end_offsets
|
|
625
|
+
//! @rst
|
|
626
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
627
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
628
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
629
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
|
|
630
|
+
//! @endrst
|
|
631
|
+
//!
|
|
632
|
+
//! @param[in] stream
|
|
633
|
+
//! @rst
|
|
634
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
635
|
+
//! @endrst
|
|
636
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
637
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( // public entry point: ascending key-only sort operating on a DoubleBuffer
|
|
638
|
+
void* d_temp_storage,
|
|
639
|
+
size_t& temp_storage_bytes,
|
|
640
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
641
|
+
::cuda::std::int64_t num_items,
|
|
642
|
+
::cuda::std::int64_t num_segments,
|
|
643
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
644
|
+
EndOffsetIteratorT d_end_offsets,
|
|
645
|
+
cudaStream_t stream = 0)
|
|
646
|
+
{
|
|
647
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro definition
|
|
648
|
+
return SortKeysNoNVTX( // forwards every argument unchanged to the NVTX-free DoubleBuffer implementation
|
|
649
|
+
d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
private:
|
|
653
|
+
// Internal descending key-only sort on a DoubleBuffer, without an NVTX range; the public SortKeysDescending wrapper opens the range and then calls this
|
|
653
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
654
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescendingNoNVTX(
|
|
655
|
+
void* d_temp_storage,
|
|
656
|
+
size_t& temp_storage_bytes,
|
|
657
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
658
|
+
::cuda::std::int64_t num_items,
|
|
659
|
+
::cuda::std::int64_t num_segments,
|
|
660
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
661
|
+
EndOffsetIteratorT d_end_offsets,
|
|
662
|
+
cudaStream_t stream = 0)
|
|
663
|
+
{
|
|
664
|
+
constexpr bool is_overwrite_okay = true; // passed to Dispatch below; NOTE(review): presumably lets the sort clobber both DoubleBuffer halves - confirm in DispatchSegmentedSort
|
|
665
|
+
using OffsetT =
|
|
666
|
+
detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>; // NOTE(review): presumably a signed type derived from the common value type of both offset iterators - confirm in detail::
|
|
667
|
+
using DispatchT =
|
|
668
|
+
DispatchSegmentedSort<SortOrder::Descending, KeyT, cub::NullType, OffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>; // value type is cub::NullType: keys-only sort
|
|
669
|
+
|
|
670
|
+
DoubleBuffer<NullType> d_values; // default-constructed placeholder, since Dispatch takes a values buffer even for keys-only sorts
|
|
671
|
+
|
|
672
|
+
return DispatchT::Dispatch(
|
|
673
|
+
d_temp_storage,
|
|
674
|
+
temp_storage_bytes,
|
|
675
|
+
d_keys,
|
|
676
|
+
d_values,
|
|
677
|
+
num_items,
|
|
678
|
+
num_segments,
|
|
679
|
+
d_begin_offsets,
|
|
680
|
+
d_end_offsets,
|
|
681
|
+
is_overwrite_okay,
|
|
682
|
+
stream);
|
|
683
|
+
}
|
|
685
|
+
|
|
686
|
+
public:
|
|
687
|
+
//! @rst
|
|
688
|
+
//! Sorts segments of keys into descending order. Approximately
|
|
689
|
+
//! ``2 * num_segments`` auxiliary storage required.
|
|
690
|
+
//!
|
|
691
|
+
//! - The sorting operation is given a pair of key buffers managed by a
|
|
692
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
693
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
694
|
+
//! - The contents of both buffers may be altered by the sorting operation.
|
|
695
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
696
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
697
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
698
|
+
//! of key bits and the targeted device architecture).
|
|
699
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
700
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
701
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
702
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
703
|
+
//! - SortKeysDescending is not guaranteed to be stable. That is, suppose that
|
|
704
|
+
//! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
|
|
705
|
+
//! not guaranteed that the relative order of these two elements will be
|
|
706
|
+
//! preserved by sort.
|
|
707
|
+
//! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
|
|
708
|
+
//! The range ``[cur, cur + num_items)`` shall not overlap
|
|
709
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
710
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
711
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
712
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
713
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
714
|
+
//! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
|
|
715
|
+
//!
|
|
716
|
+
//! Snippet
|
|
717
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
718
|
+
//!
|
|
719
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
720
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
721
|
+
//!
|
|
722
|
+
//! .. code-block:: c++
|
|
723
|
+
//!
|
|
724
|
+
//! #include <cub/cub.cuh>
|
|
725
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
726
|
+
//!
|
|
727
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
728
|
+
//! // sorting data
|
|
729
|
+
//! int num_items; // e.g., 7
|
|
730
|
+
//! int num_segments; // e.g., 3
|
|
731
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
732
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
733
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
734
|
+
//! ...
|
|
735
|
+
//!
|
|
736
|
+
//! // Create a DoubleBuffer to wrap the pair of device pointers
|
|
737
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
738
|
+
//!
|
|
739
|
+
//! // Determine temporary device storage requirements
|
|
740
|
+
//! void *d_temp_storage = nullptr;
|
|
741
|
+
//! size_t temp_storage_bytes = 0;
|
|
742
|
+
//! cub::DeviceSegmentedSort::SortKeysDescending(
|
|
743
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
744
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
745
|
+
//!
|
|
746
|
+
//! // Allocate temporary storage
|
|
747
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
748
|
+
//!
|
|
749
|
+
//! // Run sorting operation
|
|
750
|
+
//! cub::DeviceSegmentedSort::SortKeysDescending(
|
|
751
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
752
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
753
|
+
//!
|
|
754
|
+
//! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
|
|
755
|
+
//!
|
|
756
|
+
//! @endrst
|
|
757
|
+
//!
|
|
758
|
+
//! @tparam KeyT
|
|
759
|
+
//! **[inferred]** Key type
|
|
760
|
+
//!
|
|
761
|
+
//! @tparam BeginOffsetIteratorT
|
|
762
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
763
|
+
//! beginning offsets @iterator
|
|
764
|
+
//!
|
|
765
|
+
//! @tparam EndOffsetIteratorT
|
|
766
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
767
|
+
//! ending offsets @iterator
|
|
768
|
+
//!
|
|
769
|
+
//! @param[in] d_temp_storage
|
|
770
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
771
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
772
|
+
//! is done
|
|
773
|
+
//!
|
|
774
|
+
//! @param[in,out] temp_storage_bytes
|
|
775
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
776
|
+
//!
|
|
777
|
+
//! @param[in,out] d_keys
|
|
778
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
779
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
780
|
+
//! point to the sorted output keys
|
|
781
|
+
//!
|
|
782
|
+
//! @param[in] num_items
|
|
783
|
+
//! The total number of items to sort (across all segments)
|
|
784
|
+
//!
|
|
785
|
+
//! @param[in] num_segments
|
|
786
|
+
//! The number of segments that comprise the sorting data
|
|
787
|
+
//!
|
|
788
|
+
//! @param[in] d_begin_offsets
|
|
789
|
+
//! @rst
|
|
790
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
791
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
792
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
793
|
+
//! @endrst
|
|
794
|
+
//!
|
|
795
|
+
//! @param[in] d_end_offsets
|
|
796
|
+
//! @rst
|
|
797
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
798
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
799
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
800
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
|
|
801
|
+
//! @endrst
|
|
802
|
+
//!
|
|
803
|
+
//! @param[in] stream
|
|
804
|
+
//! @rst
|
|
805
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
806
|
+
//! @endrst
|
|
807
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
808
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( // public entry point: descending key-only sort operating on a DoubleBuffer
|
|
809
|
+
void* d_temp_storage,
|
|
810
|
+
size_t& temp_storage_bytes,
|
|
811
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
812
|
+
::cuda::std::int64_t num_items,
|
|
813
|
+
::cuda::std::int64_t num_segments,
|
|
814
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
815
|
+
EndOffsetIteratorT d_end_offsets,
|
|
816
|
+
cudaStream_t stream = 0)
|
|
817
|
+
{
|
|
818
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro definition
|
|
819
|
+
return SortKeysDescendingNoNVTX( // forwards every argument unchanged to the NVTX-free DoubleBuffer implementation
|
|
820
|
+
d_temp_storage, temp_storage_bytes, d_keys, num_items, num_segments, d_begin_offsets, d_end_offsets, stream);
|
|
821
|
+
}
|
|
822
|
+
|
|
823
|
+
//! @rst
|
|
824
|
+
//! Sorts segments of keys into ascending order. Approximately
|
|
825
|
+
//! ``num_items + 2 * num_segments`` auxiliary storage required.
|
|
826
|
+
//!
|
|
827
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
828
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
829
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
830
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
831
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
832
|
+
//! - StableSortKeys is stable: it preserves the relative ordering of
|
|
833
|
+
//! equivalent elements. That is, if ``x`` and ``y`` are elements such that
|
|
834
|
+
//! ``x`` precedes ``y``, and if the two elements are equivalent (neither
|
|
835
|
+
//! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
|
|
836
|
+
//! ``x`` still precedes ``y``.
|
|
837
|
+
//! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
|
|
838
|
+
//! ``[d_keys_in, d_keys_in + num_items)``,
|
|
839
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
840
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
841
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
842
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
|
|
843
|
+
//! be accessed nor modified.
|
|
844
|
+
//!
|
|
845
|
+
//! Snippet
|
|
846
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
847
|
+
//!
|
|
848
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
849
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
850
|
+
//!
|
|
851
|
+
//! .. code-block:: c++
|
|
852
|
+
//!
|
|
853
|
+
//! #include <cub/cub.cuh>
|
|
854
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
855
|
+
//!
|
|
856
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
857
|
+
//! // for sorting data
|
|
858
|
+
//! int num_items; // e.g., 7
|
|
859
|
+
//! int num_segments; // e.g., 3
|
|
860
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
861
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
862
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
863
|
+
//! ...
|
|
864
|
+
//!
|
|
865
|
+
//! // Determine temporary device storage requirements
|
|
866
|
+
//! void *d_temp_storage = nullptr;
|
|
867
|
+
//! size_t temp_storage_bytes = 0;
|
|
868
|
+
//! cub::DeviceSegmentedSort::StableSortKeys(
|
|
869
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
870
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
871
|
+
//!
|
|
872
|
+
//! // Allocate temporary storage
|
|
873
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
874
|
+
//!
|
|
875
|
+
//! // Run sorting operation
|
|
876
|
+
//! cub::DeviceSegmentedSort::StableSortKeys(
|
|
877
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
878
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
879
|
+
//!
|
|
880
|
+
//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
|
|
881
|
+
//!
|
|
882
|
+
//! @endrst
|
|
883
|
+
//!
|
|
884
|
+
//! @tparam KeyT
|
|
885
|
+
//! **[inferred]** Key type
|
|
886
|
+
//!
|
|
887
|
+
//! @tparam BeginOffsetIteratorT
|
|
888
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
889
|
+
//! beginning offsets @iterator
|
|
890
|
+
//!
|
|
891
|
+
//! @tparam EndOffsetIteratorT
|
|
892
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
893
|
+
//! ending offsets @iterator
|
|
894
|
+
//!
|
|
895
|
+
//! @param[in] d_temp_storage
|
|
896
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
897
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
898
|
+
//! is done
|
|
899
|
+
//!
|
|
900
|
+
//! @param[in,out] temp_storage_bytes
|
|
901
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
902
|
+
//!
|
|
903
|
+
//! @param[in] d_keys_in
|
|
904
|
+
//! Device-accessible pointer to the input data of key data to sort
|
|
905
|
+
//!
|
|
906
|
+
//! @param[out] d_keys_out
|
|
907
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
908
|
+
//!
|
|
909
|
+
//! @param[in] num_items
|
|
910
|
+
//! The total number of items to sort (across all segments)
|
|
911
|
+
//!
|
|
912
|
+
//! @param[in] num_segments
|
|
913
|
+
//! The number of segments that comprise the sorting data
|
|
914
|
+
//!
|
|
915
|
+
//! @param[in] d_begin_offsets
|
|
916
|
+
//! @rst
|
|
917
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
918
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
919
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
920
|
+
//! @endrst
|
|
921
|
+
//!
|
|
922
|
+
//! @param[in] d_end_offsets
|
|
923
|
+
//! @rst
|
|
924
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
925
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
926
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
927
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is considered empty.
|
|
928
|
+
//! @endrst
|
|
929
|
+
//!
|
|
930
|
+
//! @param[in] stream
|
|
931
|
+
//! @rst
|
|
932
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
933
|
+
//! @endrst
|
|
934
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
935
|
+
CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys( // public entry point: stable ascending key-only sort with separate input/output key pointers
|
|
936
|
+
void* d_temp_storage,
|
|
937
|
+
size_t& temp_storage_bytes,
|
|
938
|
+
const KeyT* d_keys_in,
|
|
939
|
+
KeyT* d_keys_out,
|
|
940
|
+
::cuda::std::int64_t num_items,
|
|
941
|
+
::cuda::std::int64_t num_segments,
|
|
942
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
943
|
+
EndOffsetIteratorT d_end_offsets,
|
|
944
|
+
cudaStream_t stream = 0)
|
|
945
|
+
{
|
|
946
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro definition
|
|
947
|
+
return SortKeysNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // explicit template arguments; delegates to the NVTX-free ascending routine taking in/out key pointers (that overload is defined outside this view)
|
|
948
|
+
d_temp_storage,
|
|
949
|
+
temp_storage_bytes,
|
|
950
|
+
d_keys_in,
|
|
951
|
+
d_keys_out,
|
|
952
|
+
num_items,
|
|
953
|
+
num_segments,
|
|
954
|
+
d_begin_offsets,
|
|
955
|
+
d_end_offsets,
|
|
956
|
+
stream);
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
//! @rst
|
|
960
|
+
//! Sorts segments of keys into descending order.
|
|
961
|
+
//! Approximately ``num_items + 2 * num_segments`` auxiliary storage required.
|
|
962
|
+
//!
|
|
963
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
964
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
965
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
966
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
967
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
968
|
+
//! - StableSortKeysDescending is stable: it preserves the relative ordering of
|
|
969
|
+
//! equivalent elements. That is, if ``x`` and ``y`` are elements such that
|
|
970
|
+
//! ``x`` precedes ``y``, and if the two elements are equivalent (neither ``x < y`` nor ``y < x``)
|
|
971
|
+
//! then a postcondition of stable sort is that ``x`` still precedes ``y``.
|
|
972
|
+
//! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
|
|
973
|
+
//! ``[d_keys_in, d_keys_in + num_items)``,
|
|
974
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
975
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
976
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
977
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
|
|
978
|
+
//! be accessed nor modified.
|
|
979
|
+
//!
|
|
980
|
+
//! Snippet
|
|
981
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
982
|
+
//!
|
|
983
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
984
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
985
|
+
//!
|
|
986
|
+
//! .. code-block:: c++
|
|
987
|
+
//!
|
|
988
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
989
|
+
//!
|
|
990
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
991
|
+
//! // for sorting data
|
|
992
|
+
//! int num_items; // e.g., 7
|
|
993
|
+
//! int num_segments; // e.g., 3
|
|
994
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
995
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
996
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
997
|
+
//! ...
|
|
998
|
+
//!
|
|
999
|
+
//! // Determine temporary device storage requirements
|
|
1000
|
+
//! void *d_temp_storage = nullptr;
|
|
1001
|
+
//! size_t temp_storage_bytes = 0;
|
|
1002
|
+
//! cub::DeviceSegmentedSort::StableSortKeysDescending(
|
|
1003
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
1004
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1005
|
+
//!
|
|
1006
|
+
//! // Allocate temporary storage
|
|
1007
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1008
|
+
//!
|
|
1009
|
+
//! // Run sorting operation
|
|
1010
|
+
//! cub::DeviceSegmentedSort::StableSortKeysDescending(
|
|
1011
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
1012
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1013
|
+
//!
|
|
1014
|
+
//! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
|
|
1015
|
+
//!
|
|
1016
|
+
//! @endrst
|
|
1017
|
+
//!
|
|
1018
|
+
//! @tparam KeyT
|
|
1019
|
+
//! **[inferred]** Key type
|
|
1020
|
+
//!
|
|
1021
|
+
//! @tparam BeginOffsetIteratorT
|
|
1022
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1023
|
+
//! beginning offsets @iterator
|
|
1024
|
+
//!
|
|
1025
|
+
//! @tparam EndOffsetIteratorT
|
|
1026
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1027
|
+
//! ending offsets @iterator
|
|
1028
|
+
//!
|
|
1029
|
+
//! @param[in] d_temp_storage
|
|
1030
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
1031
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1032
|
+
//! is done.
|
|
1033
|
+
//!
|
|
1034
|
+
//! @param[in,out] temp_storage_bytes
|
|
1035
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1036
|
+
//!
|
|
1037
|
+
//! @param[in] d_keys_in
|
|
1038
|
+
//! Device-accessible pointer to the input data of key data to sort
|
|
1039
|
+
//!
|
|
1040
|
+
//! @param[out] d_keys_out
|
|
1041
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
1042
|
+
//!
|
|
1043
|
+
//! @param[in] num_items
|
|
1044
|
+
//! The total number of items to sort (across all segments)
|
|
1045
|
+
//!
|
|
1046
|
+
//! @param[in] num_segments
|
|
1047
|
+
//! The number of segments that comprise the sorting data
|
|
1048
|
+
//!
|
|
1049
|
+
//! @param[in] d_begin_offsets
|
|
1050
|
+
//! @rst
|
|
1051
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1052
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1053
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
|
|
1054
|
+
//! ``d_values_*``
|
|
1055
|
+
//! @endrst
|
|
1056
|
+
//!
|
|
1057
|
+
//! @param[in] d_end_offsets
|
|
1058
|
+
//! @rst
|
|
1059
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1060
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1061
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
1062
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
|
|
1063
|
+
//! considered empty.
|
|
1064
|
+
//! @endrst
|
|
1065
|
+
//!
|
|
1066
|
+
//! @param[in] stream
|
|
1067
|
+
//! @rst
|
|
1068
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1069
|
+
//! @endrst
|
|
1070
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
1071
|
+
CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending( // public entry point: stable descending key-only sort with separate input/output key pointers
|
|
1072
|
+
void* d_temp_storage,
|
|
1073
|
+
size_t& temp_storage_bytes,
|
|
1074
|
+
const KeyT* d_keys_in,
|
|
1075
|
+
KeyT* d_keys_out,
|
|
1076
|
+
::cuda::std::int64_t num_items,
|
|
1077
|
+
::cuda::std::int64_t num_segments,
|
|
1078
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
1079
|
+
EndOffsetIteratorT d_end_offsets,
|
|
1080
|
+
cudaStream_t stream = 0)
|
|
1081
|
+
{
|
|
1082
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName()); // NOTE(review): presumably opens an NVTX range only when d_temp_storage is non-null (i.e. not for the size query) - confirm against the macro definition
|
|
1083
|
+
return SortKeysDescendingNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>( // explicit template arguments; delegates to the NVTX-free descending routine taking in/out key pointers (that overload is defined outside this view)
|
|
1084
|
+
d_temp_storage,
|
|
1085
|
+
temp_storage_bytes,
|
|
1086
|
+
d_keys_in,
|
|
1087
|
+
d_keys_out,
|
|
1088
|
+
num_items,
|
|
1089
|
+
num_segments,
|
|
1090
|
+
d_begin_offsets,
|
|
1091
|
+
d_end_offsets,
|
|
1092
|
+
stream);
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1095
|
+
//! @rst
|
|
1096
|
+
//! Sorts segments of keys into ascending order.
|
|
1097
|
+
//! Approximately ``2 * num_segments`` auxiliary storage required.
|
|
1098
|
+
//!
|
|
1099
|
+
//! - The sorting operation is given a pair of key buffers managed by a
|
|
1100
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
1101
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
1102
|
+
//! - The contents of both buffers may be altered by the sorting operation.
|
|
1103
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
1104
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
1105
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
1106
|
+
//! of key bits and the targeted device architecture).
|
|
1107
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
1108
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
1109
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
1110
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
1111
|
+
//! - StableSortKeys is stable: it preserves the relative ordering of
|
|
1112
|
+
//! equivalent elements. That is, if ``x`` and ``y`` are elements such that
|
|
1113
|
+
//! ``x`` precedes ``y``, and if the two elements are equivalent (neither
|
|
1114
|
+
//! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
|
|
1115
|
+
//! ``x`` still precedes ``y``.
|
|
1116
|
+
//! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
|
|
1117
|
+
//! The range ``[cur, cur + num_items)`` shall not overlap
|
|
1118
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
1119
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1120
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
1121
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
1122
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
1123
|
+
//! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
|
|
1124
|
+
//!
|
|
1125
|
+
//! Snippet
|
|
1126
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1127
|
+
//!
|
|
1128
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
1129
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
1130
|
+
//!
|
|
1131
|
+
//! .. code-block:: c++
|
|
1132
|
+
//!
|
|
1133
|
+
//! #include <cub/cub.cuh>
|
|
1134
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
1135
|
+
//!
|
|
1136
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1137
|
+
//! // for sorting data
|
|
1138
|
+
//! int num_items; // e.g., 7
|
|
1139
|
+
//! int num_segments; // e.g., 3
|
|
1140
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
1141
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1142
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
1143
|
+
//! ...
|
|
1144
|
+
//!
|
|
1145
|
+
//! // Create a DoubleBuffer to wrap the pair of device pointers
|
|
1146
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
1147
|
+
//!
|
|
1148
|
+
//! // Determine temporary device storage requirements
|
|
1149
|
+
//! void *d_temp_storage = nullptr;
|
|
1150
|
+
//! size_t temp_storage_bytes = 0;
|
|
1151
|
+
//! cub::DeviceSegmentedSort::StableSortKeys(
|
|
1152
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
1153
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1154
|
+
//!
|
|
1155
|
+
//! // Allocate temporary storage
|
|
1156
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1157
|
+
//!
|
|
1158
|
+
//! // Run sorting operation
|
|
1159
|
+
//! cub::DeviceSegmentedSort::StableSortKeys(
|
|
1160
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
1161
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1162
|
+
//!
|
|
1163
|
+
//! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
|
|
1164
|
+
//!
|
|
1165
|
+
//! @endrst
|
|
1166
|
+
//!
|
|
1167
|
+
//! @tparam KeyT
|
|
1168
|
+
//! **[inferred]** Key type
|
|
1169
|
+
//!
|
|
1170
|
+
//! @tparam BeginOffsetIteratorT
|
|
1171
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1172
|
+
//! beginning offsets @iterator
|
|
1173
|
+
//!
|
|
1174
|
+
//! @tparam EndOffsetIteratorT
|
|
1175
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1176
|
+
//! ending offsets @iterator
|
|
1177
|
+
//!
|
|
1178
|
+
//! @param[in] d_temp_storage
|
|
1179
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
1180
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1181
|
+
//! is done
|
|
1182
|
+
//!
|
|
1183
|
+
//! @param[in,out] temp_storage_bytes
|
|
1184
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1185
|
+
//!
|
|
1186
|
+
//! @param[in,out] d_keys
|
|
1187
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1188
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1189
|
+
//! point to the sorted output keys
|
|
1190
|
+
//!
|
|
1191
|
+
//! @param[in] num_items
|
|
1192
|
+
//! The total number of items to sort (across all segments)
|
|
1193
|
+
//!
|
|
1194
|
+
//! @param[in] num_segments
|
|
1195
|
+
//! The number of segments that comprise the sorting data
|
|
1196
|
+
//!
|
|
1197
|
+
//! @param[in] d_begin_offsets
|
|
1198
|
+
//! @rst
|
|
1199
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1200
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1201
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
1202
|
+
//! @endrst
|
|
1203
|
+
//!
|
|
1204
|
+
//! @param[in] d_end_offsets
|
|
1205
|
+
//! @rst
|
|
1206
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1207
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1208
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
1209
|
+
//! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is
|
|
1210
|
+
//! considered empty.
|
|
1211
|
+
//! @endrst
|
|
1212
|
+
//!
|
|
1213
|
+
//! @param[in] stream
|
|
1214
|
+
//! @rst
|
|
1215
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1216
|
+
//! @endrst
|
|
1217
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // NVTX range scope for this public entry point; the macro receives d_temp_storage
  // as its condition argument and GetName() as the range name.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // The double buffer is handed on by reference, together with every other argument,
  // to the ascending keys-only NoNVTX helper.
  const cudaError_t status = SortKeysNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>(
    d_temp_storage,
    temp_storage_bytes,
    d_keys,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    stream);
  return status;
}
|
|
1232
|
+
|
|
1233
|
+
//! @rst
|
|
1234
|
+
//! Sorts segments of keys into descending order.
|
|
1235
|
+
//! Approximately ``2 * num_segments`` auxiliary storage required.
|
|
1236
|
+
//!
|
|
1237
|
+
//! - The sorting operation is given a pair of key buffers managed by a
|
|
1238
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
1239
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
1240
|
+
//! - The contents of both buffers may be altered by the sorting operation.
|
|
1241
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
1242
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
1243
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
1244
|
+
//! of key bits and the targeted device architecture).
|
|
1245
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
1246
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
1247
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
1248
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
1249
|
+
//! - StableSortKeysDescending is stable: it preserves the relative ordering of
|
|
1250
|
+
//! equivalent elements. That is, if ``x`` and ``y`` are elements such that
|
|
1251
|
+
//! ``x`` precedes ``y``, and if the two elements are equivalent (neither
|
|
1252
|
+
//! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
|
|
1253
|
+
//! ``x`` still precedes ``y``.
|
|
1254
|
+
//! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
|
|
1255
|
+
//! The range ``[cur, cur + num_items)`` shall not overlap
|
|
1256
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
1257
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1258
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
1259
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
1260
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
1261
|
+
//! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
|
|
1262
|
+
//!
|
|
1263
|
+
//! Snippet
|
|
1264
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1265
|
+
//!
|
|
1266
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
1267
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
1268
|
+
//!
|
|
1269
|
+
//! .. code-block:: c++
|
|
1270
|
+
//!
|
|
1271
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
1272
|
+
//!
|
|
1273
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1274
|
+
//! // for sorting data
|
|
1275
|
+
//! int num_items; // e.g., 7
|
|
1276
|
+
//! int num_segments; // e.g., 3
|
|
1277
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
1278
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1279
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
1280
|
+
//! ...
|
|
1281
|
+
//!
|
|
1282
|
+
//! // Create a DoubleBuffer to wrap the pair of device pointers
|
|
1283
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
1284
|
+
//!
|
|
1285
|
+
//! // Determine temporary device storage requirements
|
|
1286
|
+
//! void *d_temp_storage = nullptr;
|
|
1287
|
+
//! size_t temp_storage_bytes = 0;
|
|
1288
|
+
//! cub::DeviceSegmentedSort::StableSortKeysDescending(
|
|
1289
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
1290
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1291
|
+
//!
|
|
1292
|
+
//! // Allocate temporary storage
|
|
1293
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1294
|
+
//!
|
|
1295
|
+
//! // Run sorting operation
|
|
1296
|
+
//! cub::DeviceSegmentedSort::StableSortKeysDescending(
|
|
1297
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
1298
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1299
|
+
//!
|
|
1300
|
+
//! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
|
|
1301
|
+
//!
|
|
1302
|
+
//! @endrst
|
|
1303
|
+
//!
|
|
1304
|
+
//! @tparam KeyT
|
|
1305
|
+
//! **[inferred]** Key type
|
|
1306
|
+
//!
|
|
1307
|
+
//! @tparam BeginOffsetIteratorT
|
|
1308
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1309
|
+
//! beginning offsets @iterator
|
|
1310
|
+
//!
|
|
1311
|
+
//! @tparam EndOffsetIteratorT
|
|
1312
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1313
|
+
//! ending offsets @iterator
|
|
1314
|
+
//!
|
|
1315
|
+
//! @param[in] d_temp_storage
|
|
1316
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
1317
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1318
|
+
//! is done.
|
|
1319
|
+
//!
|
|
1320
|
+
//! @param[in,out] temp_storage_bytes
|
|
1321
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1322
|
+
//!
|
|
1323
|
+
//! @param[in,out] d_keys
|
|
1324
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1325
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1326
|
+
//! point to the sorted output keys
|
|
1327
|
+
//!
|
|
1328
|
+
//! @param[in] num_items
|
|
1329
|
+
//! The total number of items to sort (across all segments)
|
|
1330
|
+
//!
|
|
1331
|
+
//! @param[in] num_segments
|
|
1332
|
+
//! The number of segments that comprise the sorting data
|
|
1333
|
+
//!
|
|
1334
|
+
//! @param[in] d_begin_offsets
|
|
1335
|
+
//! @rst
|
|
1336
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1337
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1338
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
1339
|
+
//! @endrst
|
|
1340
|
+
//!
|
|
1341
|
+
//! @param[in] d_end_offsets
|
|
1342
|
+
//! @rst
|
|
1343
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1344
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last
|
|
1345
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
|
|
1346
|
+
//! ``d_values_*``. If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the
|
|
1347
|
+
//! ``i``-th segment is considered empty.
|
|
1348
|
+
//! @endrst
|
|
1349
|
+
//!
|
|
1350
|
+
//! @param[in] stream
|
|
1351
|
+
//! @rst
|
|
1352
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1353
|
+
//! @endrst
|
|
1354
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeysDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // NVTX range scope for this public entry point; the macro receives d_temp_storage
  // as its condition argument and GetName() as the range name.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // The double buffer is handed on by reference, together with every other argument,
  // to the descending keys-only NoNVTX helper.
  const cudaError_t status = SortKeysDescendingNoNVTX<KeyT, BeginOffsetIteratorT, EndOffsetIteratorT>(
    d_temp_storage,
    temp_storage_bytes,
    d_keys,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    stream);
  return status;
}
|
|
1369
|
+
|
|
1370
|
+
private:
|
|
1371
|
+
// Internal ascending key-value segmented sort; no NVTX range is opened in this function.
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  const ValueT* d_values_in,
  ValueT* d_values_out,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // Offset type derived via detail::choose_signed_offset_t from the common value type
  // of the two offset iterators.
  using segment_offset_t =
    detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
  using dispatch_t =
    DispatchSegmentedSort<SortOrder::Ascending, KeyT, ValueT, segment_offset_t, BeginOffsetIteratorT, EndOffsetIteratorT>;

  // Dispatch is called with DoubleBuffer arguments, so each in/out pointer pair is wrapped.
  // The const qualifier on the inputs is cast away only to construct the buffers.
  DoubleBuffer<KeyT> key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
  DoubleBuffer<ValueT> value_buffers(const_cast<ValueT*>(d_values_in), d_values_out);

  // The overwrite flag passed to Dispatch is false for this pointer-based overload.
  constexpr bool may_overwrite_input = false;

  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    key_buffers,
    value_buffers,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    may_overwrite_input,
    stream);
}
|
|
1408
|
+
|
|
1409
|
+
public:
|
|
1410
|
+
//! @} end member group
|
|
1411
|
+
//! @name Key-value pairs
|
|
1412
|
+
//! @{
|
|
1413
|
+
|
|
1414
|
+
//! @rst
|
|
1415
|
+
//! Sorts segments of key-value pairs into ascending order.
|
|
1416
|
+
//! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
|
|
1417
|
+
//!
|
|
1418
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
1419
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
1420
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
1421
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
1422
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
1423
|
+
//! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
|
|
1424
|
+
//! ``j`` are equivalent: neither one is less than the other. It is not
|
|
1425
|
+
//! guaranteed that the relative order of these two elements will be
|
|
1426
|
+
//! preserved by sort.
|
|
1427
|
+
//! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
|
|
1428
|
+
//! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
|
|
1429
|
+
//! not overlap ``[in, in + num_items)``,
|
|
1430
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1431
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
1432
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
1433
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
|
|
1434
|
+
//! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
|
|
1435
|
+
//!
|
|
1436
|
+
//! Snippet
|
|
1437
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1438
|
+
//!
|
|
1439
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
1440
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of
|
|
1441
|
+
//! ``int`` values.
|
|
1442
|
+
//!
|
|
1443
|
+
//! .. code-block:: c++
|
|
1444
|
+
//!
|
|
1445
|
+
//! #include <cub/cub.cuh>
|
|
1446
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
1447
|
+
//!
|
|
1448
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1449
|
+
//! // for sorting data
|
|
1450
|
+
//! int num_items; // e.g., 7
|
|
1451
|
+
//! int num_segments; // e.g., 3
|
|
1452
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
1453
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1454
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
1455
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
1456
|
+
//! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
|
|
1457
|
+
//! ...
|
|
1458
|
+
//!
|
|
1459
|
+
//! // Determine temporary device storage requirements
|
|
1460
|
+
//! void *d_temp_storage = nullptr;
|
|
1461
|
+
//! size_t temp_storage_bytes = 0;
|
|
1462
|
+
//! cub::DeviceSegmentedSort::SortPairs(
|
|
1463
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1464
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
1465
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1466
|
+
//!
|
|
1467
|
+
//! // Allocate temporary storage
|
|
1468
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1469
|
+
//!
|
|
1470
|
+
//! // Run sorting operation
|
|
1471
|
+
//! cub::DeviceSegmentedSort::SortPairs(
|
|
1472
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1473
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
1474
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1475
|
+
//!
|
|
1476
|
+
//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
|
|
1477
|
+
//! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
|
|
1478
|
+
//!
|
|
1479
|
+
//! @endrst
|
|
1480
|
+
//!
|
|
1481
|
+
//! @tparam KeyT
|
|
1482
|
+
//! **[inferred]** Key type
|
|
1483
|
+
//!
|
|
1484
|
+
//! @tparam ValueT
|
|
1485
|
+
//! **[inferred]** Value type
|
|
1486
|
+
//!
|
|
1487
|
+
//! @tparam BeginOffsetIteratorT
|
|
1488
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1489
|
+
//! beginning offsets @iterator
|
|
1490
|
+
//!
|
|
1491
|
+
//! @tparam EndOffsetIteratorT
|
|
1492
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1493
|
+
//! ending offsets @iterator
|
|
1494
|
+
//!
|
|
1495
|
+
//! @param[in] d_temp_storage
|
|
1496
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1497
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1498
|
+
//! is done
|
|
1499
|
+
//!
|
|
1500
|
+
//! @param[in,out] temp_storage_bytes
|
|
1501
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1502
|
+
//!
|
|
1503
|
+
//! @param[in] d_keys_in
|
|
1504
|
+
//! Device-accessible pointer to the input data of key data to sort
|
|
1505
|
+
//!
|
|
1506
|
+
//! @param[out] d_keys_out
|
|
1507
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
1508
|
+
//!
|
|
1509
|
+
//! @param[in] d_values_in
|
|
1510
|
+
//! Device-accessible pointer to the corresponding input sequence of
|
|
1511
|
+
//! associated value items
|
|
1512
|
+
//!
|
|
1513
|
+
//! @param[out] d_values_out
|
|
1514
|
+
//! Device-accessible pointer to the correspondingly-reordered output
|
|
1515
|
+
//! sequence of associated value items
|
|
1516
|
+
//!
|
|
1517
|
+
//! @param[in] num_items
|
|
1518
|
+
//! The total number of items to sort (across all segments)
|
|
1519
|
+
//!
|
|
1520
|
+
//! @param[in] num_segments
|
|
1521
|
+
//! The number of segments that comprise the sorting data
|
|
1522
|
+
//!
|
|
1523
|
+
//! @param[in] d_begin_offsets
|
|
1524
|
+
//! @rst
|
|
1525
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1526
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1527
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
1528
|
+
//! @endrst
|
|
1529
|
+
//!
|
|
1530
|
+
//! @param[in] d_end_offsets
|
|
1531
|
+
//! @rst
|
|
1532
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1533
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1534
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
1535
|
+
//! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the ``i``-th segment is
|
|
1536
|
+
//! considered empty.
|
|
1537
|
+
//! @endrst
|
|
1538
|
+
//!
|
|
1539
|
+
//! @param[in] stream
|
|
1540
|
+
//! @rst
|
|
1541
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1542
|
+
//! @endrst
|
|
1543
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  const ValueT* d_values_in,
  ValueT* d_values_out,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // NVTX range scope for this public entry point; the macro receives d_temp_storage
  // as its condition argument and GetName() as the range name.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Template arguments of the helper are deduced from the forwarded arguments.
  const cudaError_t status = SortPairsNoNVTX(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_keys_out,
    d_values_in,
    d_values_out,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    stream);
  return status;
}
|
|
1571
|
+
|
|
1572
|
+
private:
|
|
1573
|
+
// Internal descending key-value segmented sort; no NVTX range is opened in this function.
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  const ValueT* d_values_in,
  ValueT* d_values_out,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // Offset type derived via detail::choose_signed_offset_t from the common value type
  // of the two offset iterators.
  using segment_offset_t =
    detail::choose_signed_offset_t<detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>>;
  using dispatch_t =
    DispatchSegmentedSort<SortOrder::Descending, KeyT, ValueT, segment_offset_t, BeginOffsetIteratorT, EndOffsetIteratorT>;

  // Dispatch is called with DoubleBuffer arguments, so each in/out pointer pair is wrapped.
  // The const qualifier on the inputs is cast away only to construct the buffers.
  DoubleBuffer<KeyT> key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
  DoubleBuffer<ValueT> value_buffers(const_cast<ValueT*>(d_values_in), d_values_out);

  // The overwrite flag passed to Dispatch is false for this pointer-based overload.
  constexpr bool may_overwrite_input = false;

  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    key_buffers,
    value_buffers,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    may_overwrite_input,
    stream);
}
|
|
1610
|
+
|
|
1611
|
+
public:
|
|
1612
|
+
//! @rst
|
|
1613
|
+
//! Sorts segments of key-value pairs into descending order.
|
|
1614
|
+
//! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
|
|
1615
|
+
//!
|
|
1616
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
1617
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
1618
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
1619
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
1620
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
1621
|
+
//! - SortPairsDescending is not guaranteed to be stable. That is, suppose that ``i`` and
|
|
1622
|
+
//! ``j`` are equivalent: neither one is less than the other. It is not
|
|
1623
|
+
//! guaranteed that the relative order of these two elements will be
|
|
1624
|
+
//! preserved by sort.
|
|
1625
|
+
//! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
|
|
1626
|
+
//! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
|
|
1627
|
+
//! not overlap ``[in, in + num_items)``,
|
|
1628
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1629
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
1630
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
1631
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
|
|
1632
|
+
//! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
|
|
1633
|
+
//!
|
|
1634
|
+
//! Snippet
|
|
1635
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1636
|
+
//!
|
|
1637
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
1638
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of
|
|
1639
|
+
//! ``int`` values.
|
|
1640
|
+
//!
|
|
1641
|
+
//! .. code-block:: c++
|
|
1642
|
+
//!
|
|
1643
|
+
//! #include <cub/cub.cuh>
|
|
1644
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
1645
|
+
//!
|
|
1646
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1647
|
+
//! // sorting data
|
|
1648
|
+
//! int num_items; // e.g., 7
|
|
1649
|
+
//! int num_segments; // e.g., 3
|
|
1650
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
1651
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1652
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
1653
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
1654
|
+
//! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
|
|
1655
|
+
//! ...
|
|
1656
|
+
//!
|
|
1657
|
+
//! // Determine temporary device storage requirements
|
|
1658
|
+
//! void *d_temp_storage = nullptr;
|
|
1659
|
+
//! size_t temp_storage_bytes = 0;
|
|
1660
|
+
//! cub::DeviceSegmentedSort::SortPairsDescending(
|
|
1661
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1662
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
1663
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1664
|
+
//!
|
|
1665
|
+
//! // Allocate temporary storage
|
|
1666
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1667
|
+
//!
|
|
1668
|
+
//! // Run sorting operation
|
|
1669
|
+
//! cub::DeviceSegmentedSort::SortPairsDescending(
|
|
1670
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1671
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
1672
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1673
|
+
//!
|
|
1674
|
+
//! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
|
|
1675
|
+
//! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
|
|
1676
|
+
//!
|
|
1677
|
+
//! @endrst
|
|
1678
|
+
//!
|
|
1679
|
+
//! @tparam KeyT
|
|
1680
|
+
//! **[inferred]** Key type
|
|
1681
|
+
//!
|
|
1682
|
+
//! @tparam ValueT
|
|
1683
|
+
//! **[inferred]** Value type
|
|
1684
|
+
//!
|
|
1685
|
+
//! @tparam BeginOffsetIteratorT
|
|
1686
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1687
|
+
//! beginning offsets @iterator
|
|
1688
|
+
//!
|
|
1689
|
+
//! @tparam EndOffsetIteratorT
|
|
1690
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1691
|
+
//! ending offsets @iterator
|
|
1692
|
+
//!
|
|
1693
|
+
//! @param[in] d_temp_storage
|
|
1694
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
1695
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1696
|
+
//! is done.
|
|
1697
|
+
//!
|
|
1698
|
+
//! @param[in,out] temp_storage_bytes
|
|
1699
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1700
|
+
//!
|
|
1701
|
+
//! @param[in] d_keys_in
|
|
1702
|
+
//! Device-accessible pointer to the input data of key data to sort
|
|
1703
|
+
//!
|
|
1704
|
+
//! @param[out] d_keys_out
|
|
1705
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
1706
|
+
//!
|
|
1707
|
+
//! @param[in] d_values_in
|
|
1708
|
+
//! Device-accessible pointer to the corresponding input sequence of
|
|
1709
|
+
//! associated value items
|
|
1710
|
+
//!
|
|
1711
|
+
//! @param[out] d_values_out
|
|
1712
|
+
//! Device-accessible pointer to the correspondingly-reordered output
|
|
1713
|
+
//! sequence of associated value items
|
|
1714
|
+
//!
|
|
1715
|
+
//! @param[in] num_items
|
|
1716
|
+
//! The total number of items to sort (across all segments)
|
|
1717
|
+
//!
|
|
1718
|
+
//! @param[in] num_segments
|
|
1719
|
+
//! The number of segments that comprise the sorting data
|
|
1720
|
+
//!
|
|
1721
|
+
//! @param[in] d_begin_offsets
|
|
1722
|
+
//! @rst
|
|
1723
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1724
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1725
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
1726
|
+
//! @endrst
|
|
1727
|
+
//!
|
|
1728
|
+
//! @param[in] d_end_offsets
|
|
1729
|
+
//! @rst
|
|
1730
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1731
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1732
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
1733
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
|
|
1734
|
+
//! considered empty.
|
|
1735
|
+
//! @endrst
|
|
1736
|
+
//!
|
|
1737
|
+
//! @param[in] stream
|
|
1738
|
+
//! @rst
|
|
1739
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1740
|
+
//! @endrst
|
|
1741
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  const ValueT* d_values_in,
  ValueT* d_values_out,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // Profiling scope labelled with GetName(). NOTE(review): presumably the range is
  // only opened when d_temp_storage is non-null (i.e. not for the size query) --
  // confirm against the macro definition.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Public entry point only adds the profiling scope; the sort itself is
  // delegated, argument for argument, to the un-instrumented implementation.
  const cudaError_t status = SortPairsDescendingNoNVTX(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_keys_out,
    d_values_in,
    d_values_out,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    stream);
  return status;
}
|
|
1769
|
+
|
|
1770
|
+
private:
|
|
1771
|
+
// Un-instrumented (no NVTX range) ascending key-value sort on DoubleBuffers.
// Selects the offset type from the two offset iterators and forwards every
// argument to DispatchSegmentedSort<SortOrder::Ascending, ...>::Dispatch.
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsNoNVTX(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  DoubleBuffer<ValueT>& d_values,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // Offset type: signed counterpart of the value type common to both offset iterators.
  using CommonOffsetValueT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
  using SegmentOffsetT     = detail::choose_signed_offset_t<CommonOffsetValueT>;

  using SegmentedSortDispatchT =
    DispatchSegmentedSort<SortOrder::Ascending, KeyT, ValueT, SegmentOffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

  // The DoubleBuffer interface always passes `true` for the overwrite flag; this
  // matches the documented contract that both buffers of each pair may be altered.
  constexpr bool may_overwrite_buffers = true;

  return SegmentedSortDispatchT::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_keys,
    d_values,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    may_overwrite_buffers,
    stream);
}
|
|
1803
|
+
|
|
1804
|
+
public:
|
|
1805
|
+
//! @rst
|
|
1806
|
+
//! Sorts segments of key-value pairs into ascending order.
|
|
1807
|
+
//! Approximately ``2 * num_segments`` auxiliary storage required.
|
|
1808
|
+
//!
|
|
1809
|
+
//! - The sorting operation is given a pair of key buffers and a corresponding
|
|
1810
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
1811
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
1812
|
+
//! contains the input data to be sorted).
|
|
1813
|
+
//! - The contents of both buffers within each pair may be altered by the sorting
|
|
1814
|
+
//! operation.
|
|
1815
|
+
//! - Upon completion, the sorting operation will update the "current" indicator
|
|
1816
|
+
//! within each DoubleBuffer wrapper to reference which of the two buffers
|
|
1817
|
+
//! now contains the sorted output sequence (a function of the number of key bits
|
|
1818
|
+
//! specified and the targeted device architecture).
|
|
1819
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
1820
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
1821
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
1822
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
1823
|
+
//! - SortPairs is not guaranteed to be stable. That is, suppose that ``i`` and
|
|
1824
|
+
//! ``j`` are equivalent: neither one is less than the other. It is not
|
|
1825
|
+
//! guaranteed that the relative order of these two elements will be
|
|
1826
|
+
//! preserved by sort.
|
|
1827
|
+
//! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
|
|
1828
|
+
//! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
|
|
1829
|
+
//! ``[cur, cur + num_items)`` shall not overlap
|
|
1830
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
1831
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1832
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
1833
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
1834
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
1835
|
+
//! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
|
|
1836
|
+
//! ``d_values.Alternate()[i]`` will not be accessed nor modified.
|
|
1837
|
+
//!
|
|
1838
|
+
//! Snippet
|
|
1839
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1840
|
+
//!
|
|
1841
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
1842
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of
|
|
1843
|
+
//! ``int`` values.
|
|
1844
|
+
//!
|
|
1845
|
+
//! .. code-block:: c++
|
|
1846
|
+
//!
|
|
1847
|
+
//! #include <cub/cub.cuh>
|
|
1848
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
1849
|
+
//!
|
|
1850
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1851
|
+
//! // for sorting data
|
|
1852
|
+
//! int num_items; // e.g., 7
|
|
1853
|
+
//! int num_segments; // e.g., 3
|
|
1854
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
1855
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1856
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
1857
|
+
//! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
1858
|
+
//! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
1859
|
+
//! ...
|
|
1860
|
+
//!
|
|
1861
|
+
//! // Create a set of DoubleBuffers to wrap pairs of device pointers
|
|
1862
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
1863
|
+
//! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
|
|
1864
|
+
//!
|
|
1865
|
+
//! // Determine temporary device storage requirements
|
|
1866
|
+
//! void *d_temp_storage = nullptr;
|
|
1867
|
+
//! size_t temp_storage_bytes = 0;
|
|
1868
|
+
//! cub::DeviceSegmentedSort::SortPairs(
|
|
1869
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
1870
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1871
|
+
//!
|
|
1872
|
+
//! // Allocate temporary storage
|
|
1873
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1874
|
+
//!
|
|
1875
|
+
//! // Run sorting operation
|
|
1876
|
+
//! cub::DeviceSegmentedSort::SortPairs(
|
|
1877
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
1878
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1879
|
+
//!
|
|
1880
|
+
//! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
|
|
1881
|
+
//! // d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
|
|
1882
|
+
//!
|
|
1883
|
+
//! @endrst
|
|
1884
|
+
//!
|
|
1885
|
+
//! @tparam KeyT
|
|
1886
|
+
//! **[inferred]** Key type
|
|
1887
|
+
//!
|
|
1888
|
+
//! @tparam ValueT
|
|
1889
|
+
//! **[inferred]** Value type
|
|
1890
|
+
//!
|
|
1891
|
+
//! @tparam BeginOffsetIteratorT
|
|
1892
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1893
|
+
//! beginning offsets @iterator
|
|
1894
|
+
//!
|
|
1895
|
+
//! @tparam EndOffsetIteratorT
|
|
1896
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1897
|
+
//! ending offsets @iterator
|
|
1898
|
+
//!
|
|
1899
|
+
//! @param[in] d_temp_storage
|
|
1900
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1901
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1902
|
+
//! is done.
|
|
1903
|
+
//!
|
|
1904
|
+
//! @param[in,out] temp_storage_bytes
|
|
1905
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1906
|
+
//!
|
|
1907
|
+
//! @param[in,out] d_keys
|
|
1908
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1909
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1910
|
+
//! point to the sorted output keys
|
|
1911
|
+
//!
|
|
1912
|
+
//! @param[in,out] d_values
|
|
1913
|
+
//! Double-buffer of values whose "current" device-accessible buffer contains
|
|
1914
|
+
//! the unsorted input values and, upon return, is updated to point to the
|
|
1915
|
+
//! sorted output values
|
|
1916
|
+
//!
|
|
1917
|
+
//! @param[in] num_items
|
|
1918
|
+
//! The total number of items to sort (across all segments)
|
|
1919
|
+
//!
|
|
1920
|
+
//! @param[in] num_segments
|
|
1921
|
+
//! The number of segments that comprise the sorting data
|
|
1922
|
+
//!
|
|
1923
|
+
//! @param[in] d_begin_offsets
|
|
1924
|
+
//! @rst
|
|
1925
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1926
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1927
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
1928
|
+
//! @endrst
|
|
1929
|
+
//!
|
|
1930
|
+
//! @param[in] d_end_offsets
|
|
1931
|
+
//! @rst
|
|
1932
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1933
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1934
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
1935
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
|
|
1936
|
+
//! considered empty.
|
|
1937
|
+
//! @endrst
|
|
1938
|
+
//!
|
|
1939
|
+
//! @param[in] stream
|
|
1940
|
+
//! @rst
|
|
1941
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1942
|
+
//! @endrst
|
|
1943
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  DoubleBuffer<ValueT>& d_values,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // Profiling scope labelled with GetName(). NOTE(review): presumably the range is
  // only opened when d_temp_storage is non-null (i.e. not for the size query) --
  // confirm against the macro definition.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Everything else is handled by the un-instrumented DoubleBuffer implementation.
  const cudaError_t status = SortPairsNoNVTX(
    d_temp_storage,
    temp_storage_bytes,
    d_keys,
    d_values,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    stream);
  return status;
}
|
|
1967
|
+
|
|
1968
|
+
private:
|
|
1969
|
+
// Un-instrumented (no NVTX range) descending key-value sort on DoubleBuffers.
// Selects the offset type from the two offset iterators and forwards every
// argument to DispatchSegmentedSort<SortOrder::Descending, ...>::Dispatch.
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescendingNoNVTX(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  DoubleBuffer<ValueT>& d_values,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // Offset type: signed counterpart of the value type common to both offset iterators.
  using CommonOffsetValueT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
  using SegmentOffsetT     = detail::choose_signed_offset_t<CommonOffsetValueT>;

  using SegmentedSortDispatchT =
    DispatchSegmentedSort<SortOrder::Descending, KeyT, ValueT, SegmentOffsetT, BeginOffsetIteratorT, EndOffsetIteratorT>;

  // The DoubleBuffer interface always passes `true` for the overwrite flag; this
  // matches the documented contract that both buffers of each pair may be altered.
  constexpr bool may_overwrite_buffers = true;

  return SegmentedSortDispatchT::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_keys,
    d_values,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    may_overwrite_buffers,
    stream);
}
|
|
2001
|
+
|
|
2002
|
+
public:
|
|
2003
|
+
//! @rst
|
|
2004
|
+
//! Sorts segments of key-value pairs into descending order.
|
|
2005
|
+
//! Approximately ``2 * num_segments`` auxiliary storage required.
|
|
2006
|
+
//!
|
|
2007
|
+
//! - The sorting operation is given a pair of key buffers and a corresponding
|
|
2008
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
2009
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
2010
|
+
//! contains the input data to be sorted).
|
|
2011
|
+
//! - The contents of both buffers within each pair may be altered by the
|
|
2012
|
+
//! sorting operation.
|
|
2013
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
2014
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
2015
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
2016
|
+
//! of key bits specified and the targeted device architecture).
|
|
2017
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
2018
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
2019
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
2020
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
2021
|
+
//! - SortPairsDescending is not guaranteed to be stable. That is, suppose that
|
|
2022
|
+
//! ``i`` and ``j`` are equivalent: neither one is less than the other. It is
|
|
2023
|
+
//! not guaranteed that the relative order of these two elements will be
|
|
2024
|
+
//! preserved by sort.
|
|
2025
|
+
//! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
|
|
2026
|
+
//! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
|
|
2027
|
+
//! ``[cur, cur + num_items)`` shall not overlap
|
|
2028
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
2029
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
2030
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
2031
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
2032
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
2033
|
+
//! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
|
|
2034
|
+
//! ``d_values.Alternate()[i]`` will not be accessed nor modified.
|
|
2035
|
+
//!
|
|
2036
|
+
//! Snippet
|
|
2037
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
2038
|
+
//!
|
|
2039
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
2040
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of
|
|
2041
|
+
//! ``int`` values.
|
|
2042
|
+
//!
|
|
2043
|
+
//! .. code-block:: c++
|
|
2044
|
+
//!
|
|
2045
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
2046
|
+
//!
|
|
2047
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
2048
|
+
//! // sorting data
|
|
2049
|
+
//! int num_items; // e.g., 7
|
|
2050
|
+
//! int num_segments; // e.g., 3
|
|
2051
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
2052
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
2053
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
2054
|
+
//! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
2055
|
+
//! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
2056
|
+
//! ...
|
|
2057
|
+
//!
|
|
2058
|
+
//! // Create a set of DoubleBuffers to wrap pairs of device pointers
|
|
2059
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
2060
|
+
//! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
|
|
2061
|
+
//!
|
|
2062
|
+
//! // Determine temporary device storage requirements
|
|
2063
|
+
//! void *d_temp_storage = nullptr;
|
|
2064
|
+
//! size_t temp_storage_bytes = 0;
|
|
2065
|
+
//! cub::DeviceSegmentedSort::SortPairsDescending(
|
|
2066
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
2067
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2068
|
+
//!
|
|
2069
|
+
//! // Allocate temporary storage
|
|
2070
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
2071
|
+
//!
|
|
2072
|
+
//! // Run sorting operation
|
|
2073
|
+
//! cub::DeviceSegmentedSort::SortPairsDescending(
|
|
2074
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
2075
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2076
|
+
//!
|
|
2077
|
+
//! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
|
|
2078
|
+
//! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
|
|
2079
|
+
//!
|
|
2080
|
+
//! @endrst
|
|
2081
|
+
//!
|
|
2082
|
+
//! @tparam KeyT
|
|
2083
|
+
//! **[inferred]** Key type
|
|
2084
|
+
//!
|
|
2085
|
+
//! @tparam ValueT
|
|
2086
|
+
//! **[inferred]** Value type
|
|
2087
|
+
//!
|
|
2088
|
+
//! @tparam BeginOffsetIteratorT
|
|
2089
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2090
|
+
//! beginning offsets @iterator
|
|
2091
|
+
//!
|
|
2092
|
+
//! @tparam EndOffsetIteratorT
|
|
2093
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2094
|
+
//! ending offsets @iterator
|
|
2095
|
+
//!
|
|
2096
|
+
//! @param[in] d_temp_storage
|
|
2097
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
2098
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
2099
|
+
//! is done
|
|
2100
|
+
//!
|
|
2101
|
+
//! @param[in,out] temp_storage_bytes
|
|
2102
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
2103
|
+
//!
|
|
2104
|
+
//! @param[in,out] d_keys
|
|
2105
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
2106
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
2107
|
+
//! point to the sorted output keys
|
|
2108
|
+
//!
|
|
2109
|
+
//! @param[in,out] d_values
|
|
2110
|
+
//! Double-buffer of values whose "current" device-accessible buffer contains
|
|
2111
|
+
//! the unsorted input values and, upon return, is updated to point to the
|
|
2112
|
+
//! sorted output values
|
|
2113
|
+
//!
|
|
2114
|
+
//! @param[in] num_items
|
|
2115
|
+
//! The total number of items to sort (across all segments)
|
|
2116
|
+
//!
|
|
2117
|
+
//! @param[in] num_segments
|
|
2118
|
+
//! The number of segments that comprise the sorting data
|
|
2119
|
+
//!
|
|
2120
|
+
//! @param[in] d_begin_offsets
|
|
2121
|
+
//! @rst
|
|
2122
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
2123
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
2124
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
2125
|
+
//! @endrst
|
|
2126
|
+
//!
|
|
2127
|
+
//! @param[in] d_end_offsets
|
|
2128
|
+
//! @rst
|
|
2129
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
2130
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
2131
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
2132
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
|
|
2133
|
+
//! considered empty.
|
|
2134
|
+
//! @endrst
|
|
2135
|
+
//!
|
|
2136
|
+
//! @param[in] stream
|
|
2137
|
+
//! @rst
|
|
2138
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
2139
|
+
//! @endrst
|
|
2140
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  DoubleBuffer<ValueT>& d_values,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // Profiling scope labelled with GetName(). NOTE(review): presumably the range is
  // only opened when d_temp_storage is non-null (i.e. not for the size query) --
  // confirm against the macro definition.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Everything else is handled by the un-instrumented DoubleBuffer implementation.
  const cudaError_t status = SortPairsDescendingNoNVTX(
    d_temp_storage,
    temp_storage_bytes,
    d_keys,
    d_values,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    stream);
  return status;
}
|
|
2164
|
+
|
|
2165
|
+
//! @rst
|
|
2166
|
+
//! Sorts segments of key-value pairs into ascending order.
|
|
2167
|
+
//! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
|
|
2168
|
+
//!
|
|
2169
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
2170
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
2171
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
2172
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
2173
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
2174
|
+
//! - StableSortPairs is stable: it preserves the relative ordering of
|
|
2175
|
+
//! equivalent elements. That is, if ``x`` and ``y`` are elements such that
|
|
2176
|
+
//! ``x`` precedes ``y``, and if the two elements are equivalent (neither
|
|
2177
|
+
//! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
|
|
2178
|
+
//! ``x`` still precedes ``y``.
|
|
2179
|
+
//! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
|
|
2180
|
+
//! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
|
|
2181
|
+
//! not overlap ``[in, in + num_items)``,
|
|
2182
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
2183
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
2184
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
2185
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
|
|
2186
|
+
//! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
|
|
2187
|
+
//!
|
|
2188
|
+
//! Snippet
|
|
2189
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
2190
|
+
//!
|
|
2191
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
2192
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of
|
|
2193
|
+
//! ``int`` values.
|
|
2194
|
+
//!
|
|
2195
|
+
//! .. code-block:: c++
|
|
2196
|
+
//!
|
|
2197
|
+
//! #include <cub/cub.cuh>
|
|
2198
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
2199
|
+
//!
|
|
2200
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
2201
|
+
//! // for sorting data
|
|
2202
|
+
//! int num_items; // e.g., 7
|
|
2203
|
+
//! int num_segments; // e.g., 3
|
|
2204
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
2205
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
2206
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
2207
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
2208
|
+
//! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
|
|
2209
|
+
//! ...
|
|
2210
|
+
//!
|
|
2211
|
+
//! // Determine temporary device storage requirements
|
|
2212
|
+
//! void *d_temp_storage = nullptr;
|
|
2213
|
+
//! size_t temp_storage_bytes = 0;
|
|
2214
|
+
//! cub::DeviceSegmentedSort::StableSortPairs(
|
|
2215
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
2216
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
2217
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2218
|
+
//!
|
|
2219
|
+
//! // Allocate temporary storage
|
|
2220
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
2221
|
+
//!
|
|
2222
|
+
//! // Run sorting operation
|
|
2223
|
+
//! cub::DeviceSegmentedSort::StableSortPairs(
|
|
2224
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
2225
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
2226
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2227
|
+
//!
|
|
2228
|
+
//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
|
|
2229
|
+
//! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
|
|
2230
|
+
//!
|
|
2231
|
+
//! @endrst
|
|
2232
|
+
//!
|
|
2233
|
+
//! @tparam KeyT
|
|
2234
|
+
//! **[inferred]** Key type
|
|
2235
|
+
//!
|
|
2236
|
+
//! @tparam ValueT
|
|
2237
|
+
//! **[inferred]** Value type
|
|
2238
|
+
//!
|
|
2239
|
+
//! @tparam BeginOffsetIteratorT
|
|
2240
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2241
|
+
//! beginning offsets @iterator
|
|
2242
|
+
//!
|
|
2243
|
+
//! @tparam EndOffsetIteratorT
|
|
2244
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2245
|
+
//! ending offsets @iterator
|
|
2246
|
+
//!
|
|
2247
|
+
//! @param[in] d_temp_storage
|
|
2248
|
+
//! Device-accessible allocation of temporary storage. When nullptr, the
|
|
2249
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
2250
|
+
//!
|
|
2251
|
+
//! @param[in,out] temp_storage_bytes
|
|
2252
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
2253
|
+
//!
|
|
2254
|
+
//! @param[in] d_keys_in
|
|
2255
|
+
//! Device-accessible pointer to the input data of key data to sort
|
|
2256
|
+
//!
|
|
2257
|
+
//! @param[out] d_keys_out
|
|
2258
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
2259
|
+
//!
|
|
2260
|
+
//! @param[in] d_values_in
|
|
2261
|
+
//! Device-accessible pointer to the corresponding input sequence of
|
|
2262
|
+
//! associated value items
|
|
2263
|
+
//!
|
|
2264
|
+
//! @param[out] d_values_out
|
|
2265
|
+
//! Device-accessible pointer to the correspondingly-reordered output
|
|
2266
|
+
//! sequence of associated value items
|
|
2267
|
+
//!
|
|
2268
|
+
//! @param[in] num_items
|
|
2269
|
+
//! The total number of items to sort (across all segments)
|
|
2270
|
+
//!
|
|
2271
|
+
//! @param[in] num_segments
|
|
2272
|
+
//! The number of segments that comprise the sorting data
|
|
2273
|
+
//!
|
|
2274
|
+
//! @param[in] d_begin_offsets
|
|
2275
|
+
//! @rst
|
|
2276
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
2277
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
2278
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
2279
|
+
//! @endrst
|
|
2280
|
+
//!
|
|
2281
|
+
//! @param[in] d_end_offsets
|
|
2282
|
+
//! @rst
|
|
2283
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
2284
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
2285
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
2286
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
|
|
2287
|
+
//! considered empty.
|
|
2288
|
+
//! @endrst
|
|
2289
|
+
//!
|
|
2290
|
+
//! @param[in] stream
|
|
2291
|
+
//! @rst
|
|
2292
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
2293
|
+
//! @endrst
|
|
2294
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  const ValueT* d_values_in,
  ValueT* d_values_out,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // Profiling scope labelled with GetName(). NOTE(review): presumably the range is
  // only opened when d_temp_storage is non-null (i.e. not for the size query) --
  // confirm against the macro definition.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Forward to the un-instrumented SortPairs implementation, with the template
  // arguments spelled out explicitly rather than deduced.
  const cudaError_t status = SortPairsNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_keys_out,
    d_values_in,
    d_values_out,
    num_items,
    num_segments,
    d_begin_offsets,
    d_end_offsets,
    stream);
  return status;
}
|
|
2322
|
+
|
|
2323
|
+
//! @rst
|
|
2324
|
+
//! Sorts segments of key-value pairs into descending order.
|
|
2325
|
+
//! Approximately ``2 * num_items + 2 * num_segments`` auxiliary storage required.
|
|
2326
|
+
//!
|
|
2327
|
+
//! - The contents of the input data are not altered by the sorting operation.
|
|
2328
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
2329
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
2330
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
2331
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
2332
|
+
//! - StableSortPairsDescending is stable: it preserves the relative ordering
|
|
2333
|
+
//! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
|
|
2334
|
+
//! ``x`` precedes ``y``, and if the two elements are equivalent (neither
|
|
2335
|
+
//! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
|
|
2336
|
+
//! ``x`` still precedes ``y``.
|
|
2337
|
+
//! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
|
|
2338
|
+
//! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
|
|
2339
|
+
//! not overlap ``[in, in + num_items)``,
|
|
2340
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
2341
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
2342
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
2343
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
|
|
2344
|
+
//! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
|
|
2345
|
+
//!
|
|
2346
|
+
//! Snippet
|
|
2347
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
2348
|
+
//!
|
|
2349
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
2350
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of
|
|
2351
|
+
//! ``int`` values.
|
|
2352
|
+
//!
|
|
2353
|
+
//! .. code-block:: c++
|
|
2354
|
+
//!
|
|
2355
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
2356
|
+
//!
|
|
2357
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
2358
|
+
//! // for sorting data
|
|
2359
|
+
//! int num_items; // e.g., 7
|
|
2360
|
+
//! int num_segments; // e.g., 3
|
|
2361
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
2362
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
2363
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
2364
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
2365
|
+
//! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
|
|
2366
|
+
//! ...
|
|
2367
|
+
//!
|
|
2368
|
+
//! // Determine temporary device storage requirements
|
|
2369
|
+
//! void *d_temp_storage = nullptr;
|
|
2370
|
+
//! size_t temp_storage_bytes = 0;
|
|
2371
|
+
//! cub::DeviceSegmentedSort::StableSortPairsDescending(
|
|
2372
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
2373
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
2374
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2375
|
+
//!
|
|
2376
|
+
//! // Allocate temporary storage
|
|
2377
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
2378
|
+
//!
|
|
2379
|
+
//! // Run sorting operation
|
|
2380
|
+
//! cub::DeviceSegmentedSort::StableSortPairsDescending(
|
|
2381
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
2382
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
2383
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2384
|
+
//!
|
|
2385
|
+
//! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
|
|
2386
|
+
//! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
|
|
2387
|
+
//!
|
|
2388
|
+
//! @endrst
|
|
2389
|
+
//!
|
|
2390
|
+
//! @tparam KeyT
|
|
2391
|
+
//! **[inferred]** Key type
|
|
2392
|
+
//!
|
|
2393
|
+
//! @tparam ValueT
|
|
2394
|
+
//! **[inferred]** Value type
|
|
2395
|
+
//!
|
|
2396
|
+
//! @tparam BeginOffsetIteratorT
|
|
2397
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2398
|
+
//! beginning offsets @iterator
|
|
2399
|
+
//!
|
|
2400
|
+
//! @tparam EndOffsetIteratorT
|
|
2401
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2402
|
+
//! ending offsets @iterator
|
|
2403
|
+
//!
|
|
2404
|
+
//! @param[in] d_temp_storage
|
|
2405
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
2406
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
2407
|
+
//! is done
|
|
2408
|
+
//!
|
|
2409
|
+
//! @param[in,out] temp_storage_bytes
|
|
2410
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
2411
|
+
//!
|
|
2412
|
+
//! @param[in] d_keys_in
|
|
2413
|
+
//! Device-accessible pointer to the input data of key data to sort
|
|
2414
|
+
//!
|
|
2415
|
+
//! @param[out] d_keys_out
|
|
2416
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
2417
|
+
//!
|
|
2418
|
+
//! @param[in] d_values_in
|
|
2419
|
+
//! Device-accessible pointer to the corresponding input sequence of
|
|
2420
|
+
//! associated value items
|
|
2421
|
+
//!
|
|
2422
|
+
//! @param[out] d_values_out
|
|
2423
|
+
//! Device-accessible pointer to the correspondingly-reordered output
|
|
2424
|
+
//! sequence of associated value items
|
|
2425
|
+
//!
|
|
2426
|
+
//! @param[in] num_items
|
|
2427
|
+
//! The total number of items to sort (across all segments)
|
|
2428
|
+
//!
|
|
2429
|
+
//! @param[in] num_segments
|
|
2430
|
+
//! The number of segments that comprise the sorting data
|
|
2431
|
+
//!
|
|
2432
|
+
//! @param[in] d_begin_offsets
|
|
2433
|
+
//! @rst
|
|
2434
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
2435
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
2436
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
2437
|
+
//! @endrst
|
|
2438
|
+
//!
|
|
2439
|
+
//! @param[in] d_end_offsets
|
|
2440
|
+
//! @rst
|
|
2441
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
2442
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
2443
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
2444
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
|
|
2445
|
+
//! considered empty.
|
|
2446
|
+
//! @endrst
|
|
2447
|
+
//!
|
|
2448
|
+
//! @param[in] stream
|
|
2449
|
+
//! @rst
|
|
2450
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
2451
|
+
//! @endrst
|
|
2452
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  const KeyT* d_keys_in,
  KeyT* d_keys_out,
  const ValueT* d_values_in,
  ValueT* d_values_out,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // NVTX range for this call, labelled with GetName(); the macro is handed
  // d_temp_storage as its condition argument.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Thin wrapper: every argument is forwarded unchanged to the descending
  // pair-sort helper that carries no NVTX annotation of its own.
  const cudaError_t status = SortPairsDescendingNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>(
    d_temp_storage, temp_storage_bytes, // temporary storage and its size
    d_keys_in, d_keys_out, // key input / output
    d_values_in, d_values_out, // value input / output
    num_items, num_segments, // problem extents
    d_begin_offsets, d_end_offsets, // per-segment offset iterators
    stream);
  return status;
}
|
|
2480
|
+
|
|
2481
|
+
//! @rst
|
|
2482
|
+
//! Sorts segments of key-value pairs into ascending order.
|
|
2483
|
+
//! Approximately ``2 * num_segments`` auxiliary storage required.
|
|
2484
|
+
//!
|
|
2485
|
+
//! - The sorting operation is given a pair of key buffers and a corresponding
|
|
2486
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
2487
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
2488
|
+
//! contains the input data to be sorted).
|
|
2489
|
+
//! - The contents of both buffers within each pair may be altered by the
|
|
2490
|
+
//! sorting operation.
|
|
2491
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
2492
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
2493
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
2494
|
+
//! of key bits specified and the targeted device architecture).
|
|
2495
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
2496
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
2497
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
2498
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
2499
|
+
//! - StableSortPairs is stable: it preserves the relative ordering
|
|
2500
|
+
//! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
|
|
2501
|
+
//! ``x`` precedes ``y``, and if the two elements are equivalent (neither
|
|
2502
|
+
//! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
|
|
2503
|
+
//! ``x`` still precedes ``y``.
|
|
2504
|
+
//! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
|
|
2505
|
+
//! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
|
|
2506
|
+
//! ``[cur, cur + num_items)`` shall not overlap
|
|
2507
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
2508
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
2509
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
2510
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
2511
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
2512
|
+
//! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
|
|
2513
|
+
//! ``d_values.Alternate()[i]`` will not be accessed nor modified.
|
|
2514
|
+
//!
|
|
2515
|
+
//! Snippet
|
|
2516
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
2517
|
+
//!
|
|
2518
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
2519
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of
|
|
2520
|
+
//! ``int`` values.
|
|
2521
|
+
//!
|
|
2522
|
+
//! .. code-block:: c++
|
|
2523
|
+
//!
|
|
2524
|
+
//! #include <cub/cub.cuh>
|
|
2525
|
+
//! // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
2526
|
+
//!
|
|
2527
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
2528
|
+
//! // for sorting data
|
|
2529
|
+
//! int num_items; // e.g., 7
|
|
2530
|
+
//! int num_segments; // e.g., 3
|
|
2531
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
2532
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
2533
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
2534
|
+
//! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
2535
|
+
//! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
2536
|
+
//! ...
|
|
2537
|
+
//!
|
|
2538
|
+
//! // Create a set of DoubleBuffers to wrap pairs of device pointers
|
|
2539
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
2540
|
+
//! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
|
|
2541
|
+
//!
|
|
2542
|
+
//! // Determine temporary device storage requirements
|
|
2543
|
+
//! void *d_temp_storage = nullptr;
|
|
2544
|
+
//! size_t temp_storage_bytes = 0;
|
|
2545
|
+
//! cub::DeviceSegmentedSort::StableSortPairs(
|
|
2546
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
2547
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2548
|
+
//!
|
|
2549
|
+
//! // Allocate temporary storage
|
|
2550
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
2551
|
+
//!
|
|
2552
|
+
//! // Run sorting operation
|
|
2553
|
+
//! cub::DeviceSegmentedSort::StableSortPairs(
|
|
2554
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
2555
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2556
|
+
//!
|
|
2557
|
+
//! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
|
|
2558
|
+
//! // d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
|
|
2559
|
+
//!
|
|
2560
|
+
//! @endrst
|
|
2561
|
+
//!
|
|
2562
|
+
//! @tparam KeyT
|
|
2563
|
+
//! **[inferred]** Key type
|
|
2564
|
+
//!
|
|
2565
|
+
//! @tparam ValueT
|
|
2566
|
+
//! **[inferred]** Value type
|
|
2567
|
+
//!
|
|
2568
|
+
//! @tparam BeginOffsetIteratorT
|
|
2569
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2570
|
+
//! beginning offsets @iterator
|
|
2571
|
+
//!
|
|
2572
|
+
//! @tparam EndOffsetIteratorT
|
|
2573
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2574
|
+
//! ending offsets @iterator
|
|
2575
|
+
//!
|
|
2576
|
+
//! @param[in] d_temp_storage
|
|
2577
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
2578
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
2579
|
+
//! is done
|
|
2580
|
+
//!
|
|
2581
|
+
//! @param[in,out] temp_storage_bytes
|
|
2582
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
2583
|
+
//!
|
|
2584
|
+
//! @param[in,out] d_keys
|
|
2585
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
2586
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
2587
|
+
//! point to the sorted output keys
|
|
2588
|
+
//!
|
|
2589
|
+
//! @param[in,out] d_values
|
|
2590
|
+
//! Double-buffer of values whose "current" device-accessible buffer contains
|
|
2591
|
+
//! the unsorted input values and, upon return, is updated to point to the
|
|
2592
|
+
//! sorted output values
|
|
2593
|
+
//!
|
|
2594
|
+
//! @param[in] num_items
|
|
2595
|
+
//! The total number of items to sort (across all segments)
|
|
2596
|
+
//!
|
|
2597
|
+
//! @param[in] num_segments
|
|
2598
|
+
//! The number of segments that comprise the sorting data
|
|
2599
|
+
//!
|
|
2600
|
+
//! @param[in] d_begin_offsets
|
|
2601
|
+
//! @rst
|
|
2602
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
2603
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
2604
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
2605
|
+
//! @endrst
|
|
2606
|
+
//!
|
|
2607
|
+
//! @param[in] d_end_offsets
|
|
2608
|
+
//! @rst
|
|
2609
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
2610
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
2611
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
2612
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
|
|
2613
|
+
//! considered empty.
|
|
2614
|
+
//! @endrst
|
|
2615
|
+
//!
|
|
2616
|
+
//! @param[in] stream
|
|
2617
|
+
//! @rst
|
|
2618
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
2619
|
+
//! @endrst
|
|
2620
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  DoubleBuffer<ValueT>& d_values,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // NVTX range for this call, labelled with GetName(); the macro is handed
  // d_temp_storage as its condition argument.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Thin wrapper: the double buffers are passed on by reference and all other
  // arguments unchanged to the ascending pair-sort helper without NVTX.
  const cudaError_t status = SortPairsNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>(
    d_temp_storage, temp_storage_bytes, // temporary storage and its size
    d_keys, d_values, // key / value double buffers
    num_items, num_segments, // problem extents
    d_begin_offsets, d_end_offsets, // per-segment offset iterators
    stream);
  return status;
}
|
|
2644
|
+
|
|
2645
|
+
//! @rst
|
|
2646
|
+
//! Sorts segments of key-value pairs into descending order.
|
|
2647
|
+
//! Approximately ``2 * num_segments`` auxiliary storage required.
|
|
2648
|
+
//!
|
|
2649
|
+
//! - The sorting operation is given a pair of key buffers and a corresponding
|
|
2650
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
2651
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
2652
|
+
//! contains the input data to be sorted).
|
|
2653
|
+
//! - The contents of both buffers within each pair may be altered by the sorting
|
|
2654
|
+
//! operation.
|
|
2655
|
+
//! - Upon completion, the sorting operation will update the "current" indicator
|
|
2656
|
+
//! within each DoubleBuffer wrapper to reference which of the two buffers
|
|
2657
|
+
//! now contains the sorted output sequence (a function of the number of key bits
|
|
2658
|
+
//! specified and the targeted device architecture).
|
|
2659
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
2660
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
2661
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
2662
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
2663
|
+
//! - StableSortPairsDescending is stable: it preserves the relative ordering
|
|
2664
|
+
//! of equivalent elements. That is, if ``x`` and ``y`` are elements such that
|
|
2665
|
+
//! ``x`` precedes ``y``, and if the two elements are equivalent (neither
|
|
2666
|
+
//! ``x < y`` nor ``y < x``) then a postcondition of stable sort is that
|
|
2667
|
+
//! ``x`` still precedes ``y``.
|
|
2668
|
+
//! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
|
|
2669
|
+
//! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
|
|
2670
|
+
//! ``[cur, cur + num_items)`` shall not overlap
|
|
2671
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
2672
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
2673
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
2674
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
2675
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
2676
|
+
//! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
|
|
2677
|
+
//! ``d_values.Alternate()[i]`` will not be accessed nor modified.
|
|
2678
|
+
//!
|
|
2679
|
+
//! Snippet
|
|
2680
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
2681
|
+
//!
|
|
2682
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
2683
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of
|
|
2684
|
+
//! ``int`` values.
|
|
2685
|
+
//!
|
|
2686
|
+
//! .. code-block:: c++
|
|
2687
|
+
//!
|
|
2688
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_sort.cuh>
|
|
2689
|
+
//!
|
|
2690
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
2691
|
+
//! // for sorting data
|
|
2692
|
+
//! int num_items; // e.g., 7
|
|
2693
|
+
//! int num_segments; // e.g., 3
|
|
2694
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
2695
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
2696
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
2697
|
+
//! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
2698
|
+
//! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
2699
|
+
//! ...
|
|
2700
|
+
//!
|
|
2701
|
+
//! // Create a set of DoubleBuffers to wrap pairs of device pointers
|
|
2702
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
2703
|
+
//! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
|
|
2704
|
+
//!
|
|
2705
|
+
//! // Determine temporary device storage requirements
|
|
2706
|
+
//! void *d_temp_storage = nullptr;
|
|
2707
|
+
//! size_t temp_storage_bytes = 0;
|
|
2708
|
+
//! cub::DeviceSegmentedSort::StableSortPairsDescending(
|
|
2709
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
2710
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2711
|
+
//!
|
|
2712
|
+
//! // Allocate temporary storage
|
|
2713
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
2714
|
+
//!
|
|
2715
|
+
//! // Run sorting operation
|
|
2716
|
+
//! cub::DeviceSegmentedSort::StableSortPairsDescending(
|
|
2717
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
2718
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
2719
|
+
//!
|
|
2720
|
+
//! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
|
|
2721
|
+
//! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
|
|
2722
|
+
//!
|
|
2723
|
+
//! @endrst
|
|
2724
|
+
//!
|
|
2725
|
+
//! @tparam KeyT
|
|
2726
|
+
//! **[inferred]** Key type
|
|
2727
|
+
//!
|
|
2728
|
+
//! @tparam ValueT
|
|
2729
|
+
//! **[inferred]** Value type
|
|
2730
|
+
//!
|
|
2731
|
+
//! @tparam BeginOffsetIteratorT
|
|
2732
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2733
|
+
//! beginning offsets @iterator
|
|
2734
|
+
//!
|
|
2735
|
+
//! @tparam EndOffsetIteratorT
|
|
2736
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
2737
|
+
//! ending offsets @iterator
|
|
2738
|
+
//!
|
|
2739
|
+
//! @param[in] d_temp_storage
|
|
2740
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
2741
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
2742
|
+
//! is done
|
|
2743
|
+
//!
|
|
2744
|
+
//! @param[in,out] temp_storage_bytes
|
|
2745
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
2746
|
+
//!
|
|
2747
|
+
//! @param[in,out] d_keys
|
|
2748
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
2749
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
2750
|
+
//! point to the sorted output keys
|
|
2751
|
+
//!
|
|
2752
|
+
//! @param[in,out] d_values
|
|
2753
|
+
//! Double-buffer of values whose "current" device-accessible buffer contains
|
|
2754
|
+
//! the unsorted input values and, upon return, is updated to point to the
|
|
2755
|
+
//! sorted output values
|
|
2756
|
+
//!
|
|
2757
|
+
//! @param[in] num_items
|
|
2758
|
+
//! The total number of items to sort (across all segments)
|
|
2759
|
+
//!
|
|
2760
|
+
//! @param[in] num_segments
|
|
2761
|
+
//! The number of segments that comprise the sorting data
|
|
2762
|
+
//!
|
|
2763
|
+
//! @param[in] d_begin_offsets
|
|
2764
|
+
//! @rst
|
|
2765
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
2766
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
2767
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
2768
|
+
//! @endrst
|
|
2769
|
+
//!
|
|
2770
|
+
//! @param[in] d_end_offsets
|
|
2771
|
+
//! @rst
|
|
2772
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
2773
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
2774
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
2775
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the ``i``-th segment is
|
|
2776
|
+
//! considered empty.
|
|
2777
|
+
//! @endrst
|
|
2778
|
+
//!
|
|
2779
|
+
//! @param[in] stream
|
|
2780
|
+
//! @rst
|
|
2781
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
2782
|
+
//! @endrst
|
|
2783
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairsDescending(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  DoubleBuffer<KeyT>& d_keys,
  DoubleBuffer<ValueT>& d_values,
  ::cuda::std::int64_t num_items,
  ::cuda::std::int64_t num_segments,
  BeginOffsetIteratorT d_begin_offsets,
  EndOffsetIteratorT d_end_offsets,
  cudaStream_t stream = 0)
{
  // NVTX range for this call, labelled with GetName(); the macro is handed
  // d_temp_storage as its condition argument.
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());

  // Thin wrapper: the double buffers are passed on by reference and all other
  // arguments unchanged to the descending pair-sort helper without NVTX.
  const cudaError_t status = SortPairsDescendingNoNVTX<KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT>(
    d_temp_storage, temp_storage_bytes, // temporary storage and its size
    d_keys, d_values, // key / value double buffers
    num_items, num_segments, // problem extents
    d_begin_offsets, d_end_offsets, // per-segment offset iterators
    stream);
  return status;
}
|
|
2807
|
+
|
|
2808
|
+
//! @} end member group
|
|
2809
|
+
};
|
|
2810
|
+
|
|
2811
|
+
CUB_NAMESPACE_END
|